switchback 0.2.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. {switchback-0.2.0 → switchback-0.5.0}/.env.example +24 -8
  2. switchback-0.5.0/CHANGELOG.md +119 -0
  3. {switchback-0.2.0 → switchback-0.5.0}/PKG-INFO +52 -20
  4. {switchback-0.2.0 → switchback-0.5.0}/README.md +51 -19
  5. {switchback-0.2.0 → switchback-0.5.0}/pyproject.toml +1 -1
  6. {switchback-0.2.0 → switchback-0.5.0}/switchback/api.py +5 -0
  7. switchback-0.5.0/switchback/doctor.py +59 -0
  8. {switchback-0.2.0 → switchback-0.5.0}/switchback/flags.py +2 -2
  9. {switchback-0.2.0 → switchback-0.5.0}/switchback/orchestrator.py +172 -63
  10. {switchback-0.2.0 → switchback-0.5.0}/switchback/policy/botwall.py +36 -2
  11. {switchback-0.2.0 → switchback-0.5.0}/switchback/policy/gates.py +59 -0
  12. {switchback-0.2.0 → switchback-0.5.0}/switchback/reporting.py +1 -1
  13. switchback-0.5.0/switchback/tiers/__init__.py +24 -0
  14. switchback-0.2.0/switchback/tiers/tier0_apis.py → switchback-0.5.0/switchback/tiers/tier_1.py +8 -4
  15. switchback-0.2.0/switchback/tiers/tier1_http.py → switchback-0.5.0/switchback/tiers/tier_2.py +6 -2
  16. switchback-0.2.0/switchback/tiers/tier2_cloudscraper.py → switchback-0.5.0/switchback/tiers/tier_3.py +39 -4
  17. switchback-0.2.0/switchback/tiers/tier3_browser.py → switchback-0.5.0/switchback/tiers/tier_4.py +46 -5
  18. switchback-0.2.0/switchback/tiers/tier3b_camoufox.py → switchback-0.5.0/switchback/tiers/tier_5.py +8 -2
  19. switchback-0.2.0/switchback/tiers/tier_residential.py → switchback-0.5.0/switchback/tiers/tier_6.py +8 -2
  20. switchback-0.2.0/switchback/tiers/tier4_firecrawl.py → switchback-0.5.0/switchback/tiers/tier_7.py +13 -3
  21. {switchback-0.2.0 → switchback-0.5.0}/switchback.egg-info/PKG-INFO +52 -20
  22. {switchback-0.2.0 → switchback-0.5.0}/switchback.egg-info/SOURCES.txt +8 -7
  23. switchback-0.2.0/CHANGELOG.md +0 -46
  24. switchback-0.2.0/switchback/tiers/__init__.py +0 -24
  25. {switchback-0.2.0 → switchback-0.5.0}/CONTRIBUTING.md +0 -0
  26. {switchback-0.2.0 → switchback-0.5.0}/LICENSE +0 -0
  27. {switchback-0.2.0 → switchback-0.5.0}/MANIFEST.in +0 -0
  28. {switchback-0.2.0 → switchback-0.5.0}/NOTICE +0 -0
  29. {switchback-0.2.0 → switchback-0.5.0}/SECURITY.md +0 -0
  30. {switchback-0.2.0 → switchback-0.5.0}/clients/node_bridge.md +0 -0
  31. {switchback-0.2.0 → switchback-0.5.0}/clients/python_client.py +0 -0
  32. {switchback-0.2.0 → switchback-0.5.0}/config/botwall_skip_urls.txt +0 -0
  33. {switchback-0.2.0 → switchback-0.5.0}/config/extraction.example.json +0 -0
  34. {switchback-0.2.0 → switchback-0.5.0}/setup.cfg +0 -0
  35. {switchback-0.2.0 → switchback-0.5.0}/switchback/__init__.py +0 -0
  36. {switchback-0.2.0 → switchback-0.5.0}/switchback/__main__.py +0 -0
  37. {switchback-0.2.0 → switchback-0.5.0}/switchback/concurrency.py +0 -0
  38. {switchback-0.2.0 → switchback-0.5.0}/switchback/content_cache.py +0 -0
  39. {switchback-0.2.0 → switchback-0.5.0}/switchback/egress.py +0 -0
  40. {switchback-0.2.0 → switchback-0.5.0}/switchback/extract.py +0 -0
  41. {switchback-0.2.0 → switchback-0.5.0}/switchback/normalize.py +0 -0
  42. {switchback-0.2.0 → switchback-0.5.0}/switchback/policy/__init__.py +0 -0
  43. {switchback-0.2.0 → switchback-0.5.0}/switchback/py.typed +0 -0
  44. {switchback-0.2.0 → switchback-0.5.0}/switchback/search.py +0 -0
  45. {switchback-0.2.0 → switchback-0.5.0}/switchback/server.py +0 -0
  46. {switchback-0.2.0 → switchback-0.5.0}/switchback/session_cache.py +0 -0
  47. {switchback-0.2.0 → switchback-0.5.0}/switchback/session_trace.py +0 -0
  48. {switchback-0.2.0 → switchback-0.5.0}/switchback/tiers/_browser.py +0 -0
  49. {switchback-0.2.0 → switchback-0.5.0}/switchback/tracing.py +0 -0
  50. {switchback-0.2.0 → switchback-0.5.0}/switchback.egg-info/dependency_links.txt +0 -0
  51. {switchback-0.2.0 → switchback-0.5.0}/switchback.egg-info/entry_points.txt +0 -0
  52. {switchback-0.2.0 → switchback-0.5.0}/switchback.egg-info/requires.txt +0 -0
  53. {switchback-0.2.0 → switchback-0.5.0}/switchback.egg-info/top_level.txt +0 -0
@@ -11,7 +11,7 @@
11
11
  OTEL_SERVICE_NAME=switchback
12
12
  OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
13
13
 
14
- # ── Search (Tier-0 SearXNG, query → URLs) ───────────────────────────────────
14
+ # ── Search (SearXNG, query → URLs — separate from the fetch cascade) ─────────
15
15
  SEARXNG_URL=http://localhost:8888
16
16
 
17
17
  # ── Output format ───────────────────────────────────────────────────────────
@@ -27,22 +27,38 @@ SEARXNG_URL=http://localhost:8888
27
27
  # fall back to their text for those sources.
28
28
  SCRAPER_OUTPUT_FORMAT=markdown
29
29
 
30
- # ── Tier 2.5 · Jina Reader (r.jina.ai) ──────────────────────────────────────
31
- # Optional: keyless works at 20 RPM. A key gives 500 RPM + a 10M-token grant.
32
- JINA_API_KEY=
33
- SCRAPER_JINA_TIMEOUT_S=20
30
+ # ── tier_3 · Cloudflare solver (cloudscraper) ───────────────────────────────
31
+ # Needs the 3.x Enhanced Edition fork (see README); with the frozen PyPI build
32
+ # the tier reports `unavailable` and fails fast. The timeout below caps a single solve
33
+ # so an unsolvable challenge can't eat the per-URL deadline before the browser
34
+ # tier runs. Lower (e.g. 12) if tier_3 rarely wins on your hosts.
35
+ SCRAPER_TIER_3_TIMEOUT_S=25
34
36
 
35
- # ── Tier 3b · Camoufox (Firefox stealth) ────────────────────────────────────
37
+ # ── tier_5 · Camoufox (Firefox stealth) ─────────────────────────────────────
36
38
  # ON by default. Needs: pip install camoufox && camoufox fetch
37
39
  # Set to 1 to turn the tier off entirely.
38
40
  SCRAPER_DISABLE_CAMOUFOX=
39
- SCRAPER_CAMOUFOX_TIMEOUT_MS=45000
41
+ SCRAPER_TIER_5_TIMEOUT_S=45
40
42
 
41
- # ── Tier 4 · Firecrawl (paid, last resort) ──────────────────────────────────
43
+ # ── tier_7 · Firecrawl (paid, last resort) ──────────────────────────────────
42
44
  # Required only if this tier runs. Set SCRAPER_DISABLE_FIRECRAWL=1 to skip it.
43
45
  FIRECRAWL_API_KEY=
44
46
  SCRAPER_DISABLE_FIRECRAWL=
45
47
 
48
+ # ── Per-tier timeouts (seconds) ─────────────────────────────────────────────
49
+ # Each tier's wall-clock/socket cap; override any of them. Defaults shown below.
50
+ # tier_3 (=25) and tier_5 (=45) are set live in their sections above; the rest
51
+ # fall back to these defaults. The pre-0.5.0 names are still honored when the new
52
+ # var is unset: SCRAPER_CLOUDSCRAPER_TIMEOUT_S → tier_3,
53
+ # SCRAPER_CAMOUFOX_TIMEOUT_MS → tier_5, SCRAPER_RESIDENTIAL_TIMEOUT_MS → tier_6 (legacy *_MS values are milliseconds).
54
+ #SCRAPER_TIER_1_TIMEOUT_S=15 # direct APIs / open mirrors
55
+ #SCRAPER_TIER_2_TIMEOUT_S=15 # plain HTTP + TLS impersonation
56
+ #SCRAPER_TIER_3_TIMEOUT_S=25 # cloudscraper (Cloudflare solver)
57
+ #SCRAPER_TIER_4_TIMEOUT_S=15 # stealth headless browser (patchright)
58
+ #SCRAPER_TIER_5_TIMEOUT_S=45 # camoufox (slowest rung; hard CF solves ~40s)
59
+ #SCRAPER_TIER_6_TIMEOUT_S=30 # residential-IP CDP browser
60
+ #SCRAPER_TIER_7_TIMEOUT_S=15 # Firecrawl (paid) — was unbounded; raise if scrapes get cut off
61
+
46
62
  # ── Orchestrator ────────────────────────────────────────────────────────────
47
63
  # Per-URL wall-clock budget (s), checked between tiers. 45s balances latency vs
48
64
  # coverage — roughly fits a Camoufox solve (~40s) that starts after the cheaper
@@ -0,0 +1,119 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. Format loosely follows
4
+ [Keep a Changelog](https://keepachangelog.com/); this project uses semantic-ish
5
+ versioning while pre-1.0.
6
+
7
+ ## [Unreleased]
8
+
9
+ ## [0.5.0] - 2026-06-30
10
+
11
+ ### Changed
12
+ - **Tiers renamed to plain `tier_1`…`tier_7`** (cost-ordered, contiguous) in place
13
+ of the old mixed scheme (`tier0_apis`, `tier1_http`, `tier2_cloudscraper`,
14
+ `tier3_browser`, `tier3b_camoufox`, `tier_residential`, `tier4_firecrawl`). The
15
+ mapping is positional: `tier_1`=apis, `tier_2`=http, `tier_3`=cloudscraper,
16
+ `tier_4`=browser, `tier_5`=camoufox, `tier_6`=residential, `tier_7`=firecrawl.
17
+ **Backwards-compatible:** an existing `state/botwall_db.json` is migrated on load
18
+ (a host's learned `winning_tier` / `tier_stats` keys are remapped to the new
19
+ names), so routing survives the upgrade instead of re-probing from scratch.
20
+
21
+ ### Added
22
+ - **Per-tier timeout knobs** — every tier now reads `SCRAPER_TIER_<N>_TIMEOUT_S`
23
+ (seconds, `N` = 1–7). Defaults: `15` for tiers without a prior budget
24
+ (apis/http/browser/firecrawl), and the existing budgets are preserved — `tier_3`=25,
25
+ `tier_5`=45, `tier_6`=30. The previously-unconfigurable/unbounded tiers (apis,
26
+ http, browser, **firecrawl**) are now bounded and overridable. The pre-rename
27
+ `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` / `SCRAPER_CAMOUFOX_TIMEOUT_MS` /
28
+ `SCRAPER_RESIDENTIAL_TIMEOUT_MS` are still honored when the new var is unset.
29
+ Note: `tier_7` (paid Firecrawl) was previously unbounded; its new `15`s default
30
+ bounds it — raise `SCRAPER_TIER_7_TIMEOUT_S` if hard hosts get cut off (a scrape
31
+ killed at the cap may still be billed). `SCRAPER_TIER_RETRIES_<TIER>` overrides
32
+ follow the new names (e.g. `SCRAPER_TIER_RETRIES_TIER_4`).
33
+
34
+ ## [0.4.0] - 2026-06-29
35
+
36
+ ### Added
37
+ - **Configurable per-tier retries** — a tier can now re-attempt before falling
38
+ through to the next, more capable one. `SCRAPER_TIER_RETRIES` (global, default
39
+ `0` = off; `N` → up to `1+N` tries per tier), per-tier overrides
40
+ `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER3_BROWSER=2`), and
41
+ `SCRAPER_TIER_RETRY_ON` (retryable failure classes; default
42
+ `timeout,rate_limited,connection` — widen to include `botwall,http_block` behind
43
+ a rotating residential proxy, where each retry gets a fresh IP). Retries stay
44
+ bounded by `SCRAPER_DEADLINE_S`, and intermediate retries are traced/logged but
45
+ **not** persisted to the botwall policy DB, so they never inflate the
46
+ self-healing skip / `needs_egress` counters. Default `0` keeps behaviour
47
+ unchanged. Enabling retries on the paid Firecrawl tier bills per attempt.
48
+
49
+ ### Fixed
50
+ - **Quality gate rejects content shells** — the gate no longer passes a page just
51
+ because it clears the length floor; thin "shell" pages (nav/boilerplate with no
52
+ real article body) are now treated as a tier miss so the cascade falls through.
53
+ - **Paid last-resort budget reserve** — `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S`
54
+ (default 25s) stops starting local tiers once enough of the per-URL deadline has
55
+ elapsed and an enabled paid tier is still ahead, so a hard host can't burn the
56
+ whole budget before Firecrawl gets a turn.
57
+
58
+ ## [0.3.0] - 2026-06-27
59
+
60
+ ### Added
61
+ - **`unavailable` tier outcome** — when a tier's optional dependency is missing,
62
+ the wrong version, or not installed yet (frozen PyPI `cloudscraper` instead of
63
+ the 3.x stealth fork; patchright's Chromium not downloaded during an async
64
+ cold-start install), the tier now fails fast (~0ms) with a distinct
65
+ `unavailable` outcome carrying the exact install command, logged once per tier.
66
+ It ranks above bot-wall in the verdict, so an environment problem is no longer
67
+ masked as `botwall` — and a missing Tier 2 dependency no longer burns the
68
+ per-URL solve budget before the browser tier runs.
69
+ - **`switchback --doctor`** — preflight tier-readiness check (doubles as a
70
+ healthcheck: exit 0 when the capable tiers are ready). Reports whether
71
+ cloudscraper is the stealth-capable 3.x fork, patchright + Chromium are
72
+ installed, Camoufox/Node are present, and Firecrawl is configured. Built for
73
+ cold-start deploys where the browser is installed by a background thread after
74
+ boot.
75
+
76
+ ### Docs
77
+ - README **Production / cold-start deployment** section and a `.env.example`
78
+ Tier 2 block: running `patchright install chromium` in the post-boot step, the
79
+ cloudscraper 3.x fork requirement, Node.js for Tier 2 concurrency, and the
80
+ `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` budget knob.
81
+
82
+ ## [0.2.0] - 2026-06-25
83
+
84
+ ### Added
85
+ - **Selectable output formats** — `SCRAPER_OUTPUT_FORMAT` (or per-call
86
+ `scrape(fmt=...)`, CLI `--format`, `/scrape` `{"format": ...}`) selects the
87
+ content shape: `markdown` (default, unchanged), `markdown_trimmed` (extra
88
+ ad/nav/boilerplate removed), `html` (raw), or `html_selectors` (cleaned HTML
89
+ with per-domain `drop`/`selector` applied). Default output is byte-identical;
90
+ html-family results use a `html` JSON key instead of `markdown`.
91
+
92
+ ## [0.1.0] - 2026-06-23
93
+
94
+ ### Added
95
+ - **Challenge-type learning** — bot-walls are classified by vendor (Cloudflare,
96
+ DataDome, Akamai, PerimeterX, Incapsula, Google) and counted per host in the
97
+ botwall DB; the vendor is attached to each event and OTel span (`scrape.challenge`).
98
+ - **Metrics & reporting** — `switchback.reporting` rolls the event log + botwall DB
99
+ into cost-savings-vs-Firecrawl, coverage, overall/per-tier/per-domain latency
100
+ (mean/median/min/max/p50/p95), outcomes, error codes by domain, and challenges
101
+ by domain. Exposed via `GET /metrics` and `GET /metrics/domains` (both accept
102
+ `?minutes=N`).
103
+ - **Periodic flagging** — `python -m switchback.flags` emits a cron-friendly digest
104
+ (domains stuck on Firecrawl, escalated to egress, most-challenged) to logs/OTel.
105
+ - **Content cache** — optional URL→result cache (`SCRAPER_CONTENT_TTL_S`, sqlite,
106
+ off by default) short-circuits re-scrapes before any tier runs.
107
+ - **Login-session refresh** — `SCRAPER_LOGIN_HOOK` (`pkg.module:func`) refreshes a
108
+ dead logged-in session on demand; cookies overlay every tier and persist.
109
+ - **Exponential backoff** — between-tier backoff with jitter after rate-limit /
110
+ timeout (`SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS`, off by default).
111
+ - **Per-domain extraction prefs** — `config/extraction.json` (CSS scope selector +
112
+ extra drops) applied automatically in the normalize step for every tier.
113
+ - **Session traces** — opt-in Playwright trace capture (`SCRAPER_TRACE_SESSION=1`)
114
+ for browser tiers, with `GET/DELETE /traces` management endpoints.
115
+
116
+ ### Changed
117
+ - Tier 2's `cloudscraper` moved from a core dependency (which pinned a git-URL
118
+ fork PyPI can't publish) to the `cloudflare` extra; see the README for installing
119
+ the 3.x Enhanced Edition fork for full stealth.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: switchback
3
- Version: 0.2.0
3
+ Version: 0.5.0
4
4
  Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
5
5
  Author-email: Akash Kodavuru <akash@theaklabs.com>
6
6
  License: MIT
@@ -121,13 +121,13 @@ That's the whole loop. Add tiers as you need them (see [Install](#install)).
121
121
 
122
122
  | Tier | Strategy | Cost |
123
123
  |---|---|---|
124
- | 0 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
125
- | 1 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
126
- | 2 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
127
- | 3 | Stealth headless browser (`patchright`, Chromium) | heavy |
128
- | 3b | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
129
- | 3c | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
130
- | 4 | Firecrawl (paid, env-gated, audited) | paid, last resort |
124
+ | tier_1 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
125
+ | tier_2 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
126
+ | tier_3 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
127
+ | tier_4 | Stealth headless browser (`patchright`, Chromium) | heavy |
128
+ | tier_5 | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
129
+ | tier_6 | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
130
+ | tier_7 | Firecrawl (paid, env-gated, audited) | paid, last resort |
131
131
 
132
132
  Every URL has a wall-clock budget (`SCRAPER_DEADLINE_S`, default 45s) checked between
133
133
  tiers so one URL can't run the whole cascade of timeouts. Each tier attempt records
@@ -142,17 +142,17 @@ Search (query → URLs) is separate from the scrape cascade: `switchback.search(
142
142
  ## Install
143
143
 
144
144
  ```bash
145
- pip install switchback # core: normalization + cheap tiers (0/1) + search
146
- pip install "switchback[cloudflare]" # + Tier 2 Cloudflare/anti-bot solver (cloudscraper)
145
+ pip install switchback # core: normalization + cheap tiers (tier_1/tier_2) + search
146
+ pip install "switchback[cloudflare]" # + tier_3 Cloudflare/anti-bot solver (cloudscraper)
147
147
  pip install "switchback[server]" # + HTTP service (fastapi, uvicorn) incl. /metrics + /traces
148
- pip install "switchback[browser]" && patchright install chromium # + Tier 3 stealth Chromium
149
- pip install "switchback[camoufox]" && camoufox fetch # + Tier 3b Firefox stealth
150
- pip install "switchback[firecrawl]" # + Tier 4 paid API (needs FIRECRAWL_API_KEY)
148
+ pip install "switchback[browser]" && patchright install chromium # + tier_4 stealth Chromium
149
+ pip install "switchback[camoufox]" && camoufox fetch # + tier_5 Firefox stealth
150
+ pip install "switchback[firecrawl]" # + tier_7 paid API (needs FIRECRAWL_API_KEY)
151
151
  pip install "switchback[tracing]" # + OpenTelemetry -> any OTLP backend
152
152
  pip install "switchback[all]" # everything
153
153
  ```
154
154
 
155
- For Tier 2's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
155
+ For tier_3's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
156
156
  3.x fork (PyPI's `cloudscraper` is the older v1/v2 — PyPI forbids pinning a
157
157
  git-URL dep inside a published package, so install it alongside):
158
158
 
@@ -163,6 +163,35 @@ pip install "cloudscraper @ git+https://github.com/VeNoMouS/cloudscraper@3.0.0"
163
163
  Or run the whole thing as a container:
164
164
  `docker build -t switchback . && docker run -p 8799:8799 switchback`.
165
165
 
166
+ ### Production / cold-start deployment
167
+
168
+ The two heavy tiers pull dependencies that often can't be baked into a base image
169
+ and land *after* boot (e.g. an async install thread on Azure). Until they're
170
+ ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
171
+ fix) and the cascade falls through — they are never silently skipped. Checklist:
172
+
173
+ - **tier_4 is the real workhorse for Cloudflare/JS sites** — make sure its browser
174
+ is installed: `patchright install chromium` (note: **patchright**, not vanilla
175
+ `playwright`). On a cold start, run this in your post-boot install step/thread;
176
+ tier_4 flips to ready once it finishes.
177
+ - **tier_3 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
178
+ frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
179
+ solve budget) instead of erroring mid-cascade. tier_3 is a *weak* solver for
180
+ modern Cloudflare — treat it as a cheap try before the browser, not the primary.
181
+ - **Install Node.js** for tier_3's v3 JS-VM challenges — faster and thread-safe
182
+ vs. the pure-Python js2py fallback (relevant under concurrent load).
183
+ - **Bound tier_3's solve budget** with `SCRAPER_TIER_3_TIMEOUT_S` (default `25`;
184
+ the old `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` is still honored) so an unsolvable
185
+ challenge can't eat the per-URL deadline before the browser tier runs. Lower it
186
+ (e.g. `12`) if tier_3 rarely wins on your hosts.
187
+
188
+ **Verify readiness on the box** with the preflight check (doubles as a healthcheck
189
+ — exit 0 when the capable tiers are ready):
190
+
191
+ ```bash
192
+ switchback --doctor # or: python -m switchback --doctor
193
+ ```
194
+
166
195
  ## Use it from your app
167
196
 
168
197
  Three interchangeable entry points — all return the same shape
@@ -254,16 +283,16 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
254
283
  <details>
255
284
  <summary><b>Env gates</b> — enable/disable tiers and integrations</summary>
256
285
 
257
- - `SCRAPER_DISABLE_FIRECRAWL` — skip Tier 4
258
- - `FIRECRAWL_API_KEY` — enable Tier 4
259
- - `SCRAPER_DISABLE_CAMOUFOX` — turn off Tier 3b (on by default; needs `pip install camoufox` + `camoufox fetch`)
260
- - `BU_CDP_URL` — enable Tier 3c residential browser by pointing at a CDP endpoint
286
+ - `SCRAPER_DISABLE_FIRECRAWL` — skip tier_7
287
+ - `FIRECRAWL_API_KEY` — enable tier_7
288
+ - `SCRAPER_DISABLE_CAMOUFOX` — turn off tier_5 (on by default; needs `pip install camoufox` + `camoufox fetch`)
289
+ - `BU_CDP_URL` — enable tier_6 residential browser by pointing at a CDP endpoint
261
290
  - `SCRAPER_PROXY` — route *all* tiers/URLs through a proxy
262
291
  - `SCRAPER_EGRESS_PROXY` — route only walled hosts through a proxy (see [Cost-scoped residential egress](#cost-scoped-residential-egress))
263
292
  - `SEARXNG_URL` — defaults to `http://localhost:8888`
264
293
  - `SCRAPER_STATE_DIR` — where the botwall DB/event log + session cache live
265
294
  - `SCRAPER_COOKIES_FILE` — Netscape `cookies.txt` to scrape login-gated hosts (injected into the HTTP and browser tiers)
266
- - `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into Tier 2 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
295
+ - `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into tier_3 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
267
296
  </details>
268
297
 
269
298
  <details>
@@ -271,7 +300,8 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
271
300
 
272
301
  - `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
273
302
  - `SCRAPER_DEADLINE_S` — per-URL budget (45s)
274
- - `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
303
+ - `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
304
+ - `SCRAPER_TIER_<N>_TIMEOUT_S` — per-tier timeout in seconds, `N` = 1–7 (tier_1 apis · tier_2 http · tier_3 cloudscraper · tier_4 browser · tier_5 camoufox · tier_6 residential · tier_7 firecrawl). Defaults: 15/15/**25**/15/**45**/**30**/15 (the three bold ones keep their prior budgets; everything else is 15s). The pre-0.5.0 `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` / `SCRAPER_CAMOUFOX_TIMEOUT_MS` / `SCRAPER_RESIDENTIAL_TIMEOUT_MS` are still honored when the new var is unset. Note: `tier_7` (Firecrawl) was previously unbounded — its 15s default now bounds the paid tier, so raise `SCRAPER_TIER_7_TIMEOUT_S` if slow hosts get cut off
275
305
  - `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
276
306
  - `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
277
307
  - `SCRAPER_BOTWALL_EGRESS_AFTER` — local-tier failures before a host escalates to the residential tier (default 2)
@@ -279,6 +309,8 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
279
309
  - `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
280
310
  - `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
281
311
  - `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
312
+ - `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER_4=2`)
313
+ - `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
282
314
  - `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
283
315
  - `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
284
316
  - `SCRAPER_TRACE_SESSION` — opt-in: capture a Playwright trace (screenshots + DOM + network) per browser-tier attempt, written to `state/traces/`
@@ -62,13 +62,13 @@ That's the whole loop. Add tiers as you need them (see [Install](#install)).
62
62
 
63
63
  | Tier | Strategy | Cost |
64
64
  |---|---|---|
65
- | 0 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
66
- | 1 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
67
- | 2 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
68
- | 3 | Stealth headless browser (`patchright`, Chromium) | heavy |
69
- | 3b | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
70
- | 3c | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
71
- | 4 | Firecrawl (paid, env-gated, audited) | paid, last resort |
65
+ | tier_1 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
66
+ | tier_2 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
67
+ | tier_3 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
68
+ | tier_4 | Stealth headless browser (`patchright`, Chromium) | heavy |
69
+ | tier_5 | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
70
+ | tier_6 | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
71
+ | tier_7 | Firecrawl (paid, env-gated, audited) | paid, last resort |
72
72
 
73
73
  Every URL has a wall-clock budget (`SCRAPER_DEADLINE_S`, default 45s) checked between
74
74
  tiers so one URL can't run the whole cascade of timeouts. Each tier attempt records
@@ -83,17 +83,17 @@ Search (query → URLs) is separate from the scrape cascade: `switchback.search(
83
83
  ## Install
84
84
 
85
85
  ```bash
86
- pip install switchback # core: normalization + cheap tiers (0/1) + search
87
- pip install "switchback[cloudflare]" # + Tier 2 Cloudflare/anti-bot solver (cloudscraper)
86
+ pip install switchback # core: normalization + cheap tiers (tier_1/tier_2) + search
87
+ pip install "switchback[cloudflare]" # + tier_3 Cloudflare/anti-bot solver (cloudscraper)
88
88
  pip install "switchback[server]" # + HTTP service (fastapi, uvicorn) incl. /metrics + /traces
89
- pip install "switchback[browser]" && patchright install chromium # + Tier 3 stealth Chromium
90
- pip install "switchback[camoufox]" && camoufox fetch # + Tier 3b Firefox stealth
91
- pip install "switchback[firecrawl]" # + Tier 4 paid API (needs FIRECRAWL_API_KEY)
89
+ pip install "switchback[browser]" && patchright install chromium # + tier_4 stealth Chromium
90
+ pip install "switchback[camoufox]" && camoufox fetch # + tier_5 Firefox stealth
91
+ pip install "switchback[firecrawl]" # + tier_7 paid API (needs FIRECRAWL_API_KEY)
92
92
  pip install "switchback[tracing]" # + OpenTelemetry -> any OTLP backend
93
93
  pip install "switchback[all]" # everything
94
94
  ```
95
95
 
96
- For Tier 2's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
96
+ For tier_3's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
97
97
  3.x fork (PyPI's `cloudscraper` is the older v1/v2 — PyPI forbids pinning a
98
98
  git-URL dep inside a published package, so install it alongside):
99
99
 
@@ -104,6 +104,35 @@ pip install "cloudscraper @ git+https://github.com/VeNoMouS/cloudscraper@3.0.0"
104
104
  Or run the whole thing as a container:
105
105
  `docker build -t switchback . && docker run -p 8799:8799 switchback`.
106
106
 
107
+ ### Production / cold-start deployment
108
+
109
+ The two heavy tiers pull dependencies that often can't be baked into a base image
110
+ and land *after* boot (e.g. an async install thread on Azure). Until they're
111
+ ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
112
+ fix) and the cascade falls through — they are never silently skipped. Checklist:
113
+
114
+ - **tier_4 is the real workhorse for Cloudflare/JS sites** — make sure its browser
115
+ is installed: `patchright install chromium` (note: **patchright**, not vanilla
116
+ `playwright`). On a cold start, run this in your post-boot install step/thread;
117
+ tier_4 flips to ready once it finishes.
118
+ - **tier_3 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
119
+ frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
120
+ solve budget) instead of erroring mid-cascade. tier_3 is a *weak* solver for
121
+ modern Cloudflare — treat it as a cheap try before the browser, not the primary.
122
+ - **Install Node.js** for tier_3's v3 JS-VM challenges — faster and thread-safe
123
+ vs. the pure-Python js2py fallback (relevant under concurrent load).
124
+ - **Bound tier_3's solve budget** with `SCRAPER_TIER_3_TIMEOUT_S` (default `25`;
125
+ the old `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` is still honored) so an unsolvable
126
+ challenge can't eat the per-URL deadline before the browser tier runs. Lower it
127
+ (e.g. `12`) if tier_3 rarely wins on your hosts.
128
+
129
+ **Verify readiness on the box** with the preflight check (doubles as a healthcheck
130
+ — exit 0 when the capable tiers are ready):
131
+
132
+ ```bash
133
+ switchback --doctor # or: python -m switchback --doctor
134
+ ```
135
+
107
136
  ## Use it from your app
108
137
 
109
138
  Three interchangeable entry points — all return the same shape
@@ -195,16 +224,16 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
195
224
  <details>
196
225
  <summary><b>Env gates</b> — enable/disable tiers and integrations</summary>
197
226
 
198
- - `SCRAPER_DISABLE_FIRECRAWL` — skip Tier 4
199
- - `FIRECRAWL_API_KEY` — enable Tier 4
200
- - `SCRAPER_DISABLE_CAMOUFOX` — turn off Tier 3b (on by default; needs `pip install camoufox` + `camoufox fetch`)
201
- - `BU_CDP_URL` — enable Tier 3c residential browser by pointing at a CDP endpoint
227
+ - `SCRAPER_DISABLE_FIRECRAWL` — skip tier_7
228
+ - `FIRECRAWL_API_KEY` — enable tier_7
229
+ - `SCRAPER_DISABLE_CAMOUFOX` — turn off tier_5 (on by default; needs `pip install camoufox` + `camoufox fetch`)
230
+ - `BU_CDP_URL` — enable tier_6 residential browser by pointing at a CDP endpoint
202
231
  - `SCRAPER_PROXY` — route *all* tiers/URLs through a proxy
203
232
  - `SCRAPER_EGRESS_PROXY` — route only walled hosts through a proxy (see [Cost-scoped residential egress](#cost-scoped-residential-egress))
204
233
  - `SEARXNG_URL` — defaults to `http://localhost:8888`
205
234
  - `SCRAPER_STATE_DIR` — where the botwall DB/event log + session cache live
206
235
  - `SCRAPER_COOKIES_FILE` — Netscape `cookies.txt` to scrape login-gated hosts (injected into the HTTP and browser tiers)
207
- - `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into Tier 2 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
236
+ - `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into tier_3 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
208
237
  </details>
209
238
 
210
239
  <details>
@@ -212,7 +241,8 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
212
241
 
213
242
  - `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
214
243
  - `SCRAPER_DEADLINE_S` — per-URL budget (45s)
215
- - `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
244
+ - `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
245
+ - `SCRAPER_TIER_<N>_TIMEOUT_S` — per-tier timeout in seconds, `N` = 1–7 (tier_1 apis · tier_2 http · tier_3 cloudscraper · tier_4 browser · tier_5 camoufox · tier_6 residential · tier_7 firecrawl). Defaults: 15/15/**25**/15/**45**/**30**/15 (the three bold ones keep their prior budgets; everything else is 15s). The pre-0.5.0 `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` / `SCRAPER_CAMOUFOX_TIMEOUT_MS` / `SCRAPER_RESIDENTIAL_TIMEOUT_MS` are still honored when the new var is unset. Note: `tier_7` (Firecrawl) was previously unbounded — its 15s default now bounds the paid tier, so raise `SCRAPER_TIER_7_TIMEOUT_S` if slow hosts get cut off
216
246
  - `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
217
247
  - `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
218
248
  - `SCRAPER_BOTWALL_EGRESS_AFTER` — local-tier failures before a host escalates to the residential tier (default 2)
@@ -220,6 +250,8 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
220
250
  - `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
221
251
  - `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
222
252
  - `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
253
+ - `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER_4=2`)
254
+ - `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
223
255
  - `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
224
256
  - `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
225
257
  - `SCRAPER_TRACE_SESSION` — opt-in: capture a Playwright trace (screenshots + DOM + network) per browser-tier attempt, written to `state/traces/`
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "switchback"
7
- version = "0.2.0"
7
+ version = "0.5.0"
8
8
  description = "One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -55,6 +55,7 @@ def _main() -> int:
55
55
  _os.environ[_k] = _v.strip()
56
56
  usage = ("usage: switchback [--format FMT] <url> [<url> ...]\n"
57
57
  " switchback --search <query ...>\n"
58
+ " switchback --doctor\n"
58
59
  " (or: python -m switchback <url> ...)\n"
59
60
  " FMT: markdown (default) | markdown_trimmed | html | html_selectors")
60
61
  # --help/-h is an explicit request: usage to stdout, exit 0 (don't treat it
@@ -62,6 +63,10 @@ def _main() -> int:
62
63
  if any(a in ("--help", "-h") for a in sys.argv[1:]):
63
64
  print(usage)
64
65
  return 0
66
+ # --doctor: preflight tier-readiness report (no scrape). Side-effect-free.
67
+ if "--doctor" in sys.argv[1:]:
68
+ from .doctor import report
69
+ return report()
65
70
  logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
66
71
  setup_logs() # also ship logs to the OTLP backend when configured
67
72
  if len(sys.argv) < 2:
@@ -0,0 +1,59 @@
1
+ """Preflight readiness check — `switchback --doctor`.
2
+
3
+ Reports which tiers can actually run on this box and, when one can't, the exact
4
+ fix. Built for cold-start deploys (e.g. Azure) where the stealth browser is
5
+ installed by a background thread *after* boot: run this to confirm the tiers are
6
+ live before sending traffic, or to see why tier_3/tier_4 aren't catching anything.
7
+
8
+ Exit code: 0 if both capable local tiers (cloudscraper + browser) are ready,
9
+ else 1 — so it doubles as a healthcheck.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ import shutil
15
+
16
+ from .tiers import tier_3, tier_4
17
+
18
+
19
+ def _camoufox() -> tuple[bool, str]:
20
+ if os.getenv("SCRAPER_DISABLE_CAMOUFOX"):
21
+ return False, "off (SCRAPER_DISABLE_CAMOUFOX set)"
22
+ try:
23
+ import camoufox # noqa: F401
24
+ except ImportError:
25
+ return False, 'not installed — pip install "switchback[camoufox]" && camoufox fetch'
26
+ return True, "camoufox installed"
27
+
28
+
29
+ def probe() -> list[tuple[str, bool, str]]:
30
+ """(label, ok, detail) for each tier/dependency that matters at runtime."""
31
+ cs_ok, cs_detail = tier_3.available()
32
+ br_ok, br_detail = tier_4.available()
33
+ node = shutil.which("node")
34
+ return [
35
+ ("tier_3 (cloudscraper)", cs_ok, cs_detail),
36
+ ("tier_4 (browser)", br_ok, br_detail),
37
+ ("tier_5 (camoufox)", *_camoufox()),
38
+ ("node (tier_3 v3 concurrency)", bool(node),
39
+ node or "not on PATH — tier_3 falls back to slower, thread-fragile js2py"),
40
+ ("tier_7 (firecrawl)", bool(os.getenv("FIRECRAWL_API_KEY")),
41
+ "FIRECRAWL_API_KEY set" if os.getenv("FIRECRAWL_API_KEY")
42
+ else "off (no FIRECRAWL_API_KEY)"),
43
+ ]
44
+
45
+
46
+ def report() -> int:
47
+ rows = probe()
48
+ print("switchback doctor — tier readiness\n")
49
+ for label, ok, detail in rows:
50
+ mark = "OK " if ok else "MISS"
51
+ print(f" [{mark}] {label:30} {detail}")
52
+ cs_ok = rows[0][1]
53
+ br_ok = rows[1][1]
54
+ if cs_ok and br_ok:
55
+ print("\nCapable tiers ready.")
56
+ return 0
57
+ print("\nOne or more capable tiers are unavailable (see above). On a cold "
58
+ "start this may resolve once the async install thread finishes.")
59
+ return 1
@@ -10,7 +10,7 @@ Run it from cron / the /loop skill / any scheduler:
10
10
  python -m switchback.flags --json # machine-readable digest
11
11
 
12
12
  What it flags:
13
- • domains still landing on paid Firecrawl (winning_tier == tier4_firecrawl)
13
+ • domains still landing on paid Firecrawl (winning_tier == tier_7)
14
14
  • domains escalated to residential egress (needs_egress)
15
15
  • domains throwing the most bot-wall challenges (by vendor)
16
16
  • low coverage / negative cost savings in the window
@@ -29,7 +29,7 @@ logger = logging.getLogger(__name__)
29
29
 
30
30
  # A domain is "stuck" if its winning tier is the paid one — these are the hosts
31
31
  # that still cost money and are the prime targets for a new tier / cookie / rule.
32
- _PAID_TIER = "tier4_firecrawl"
32
+ _PAID_TIER = "tier_7"
33
33
 
34
34
 
35
35
  def build_digest(minutes: int | None = None) -> dict: