switchback 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {switchback-0.4.0 → switchback-0.5.0}/.env.example +21 -12
  2. {switchback-0.4.0 → switchback-0.5.0}/CHANGELOG.md +25 -0
  3. {switchback-0.4.0 → switchback-0.5.0}/PKG-INFO +30 -29
  4. {switchback-0.4.0 → switchback-0.5.0}/README.md +29 -28
  5. {switchback-0.4.0 → switchback-0.5.0}/pyproject.toml +1 -1
  6. {switchback-0.4.0 → switchback-0.5.0}/switchback/doctor.py +9 -9
  7. {switchback-0.4.0 → switchback-0.5.0}/switchback/flags.py +2 -2
  8. {switchback-0.4.0 → switchback-0.5.0}/switchback/orchestrator.py +2 -2
  9. {switchback-0.4.0 → switchback-0.5.0}/switchback/policy/botwall.py +36 -2
  10. {switchback-0.4.0 → switchback-0.5.0}/switchback/reporting.py +1 -1
  11. switchback-0.5.0/switchback/tiers/__init__.py +24 -0
  12. switchback-0.4.0/switchback/tiers/tier0_apis.py → switchback-0.5.0/switchback/tiers/tier_1.py +8 -4
  13. switchback-0.4.0/switchback/tiers/tier1_http.py → switchback-0.5.0/switchback/tiers/tier_2.py +6 -2
  14. switchback-0.4.0/switchback/tiers/tier2_cloudscraper.py → switchback-0.5.0/switchback/tiers/tier_3.py +6 -3
  15. switchback-0.4.0/switchback/tiers/tier3_browser.py → switchback-0.5.0/switchback/tiers/tier_4.py +5 -2
  16. switchback-0.4.0/switchback/tiers/tier3b_camoufox.py → switchback-0.5.0/switchback/tiers/tier_5.py +8 -2
  17. switchback-0.4.0/switchback/tiers/tier_residential.py → switchback-0.5.0/switchback/tiers/tier_6.py +8 -2
  18. switchback-0.4.0/switchback/tiers/tier4_firecrawl.py → switchback-0.5.0/switchback/tiers/tier_7.py +13 -3
  19. {switchback-0.4.0 → switchback-0.5.0}/switchback.egg-info/PKG-INFO +30 -29
  20. {switchback-0.4.0 → switchback-0.5.0}/switchback.egg-info/SOURCES.txt +7 -7
  21. switchback-0.4.0/switchback/tiers/__init__.py +0 -24
  22. {switchback-0.4.0 → switchback-0.5.0}/CONTRIBUTING.md +0 -0
  23. {switchback-0.4.0 → switchback-0.5.0}/LICENSE +0 -0
  24. {switchback-0.4.0 → switchback-0.5.0}/MANIFEST.in +0 -0
  25. {switchback-0.4.0 → switchback-0.5.0}/NOTICE +0 -0
  26. {switchback-0.4.0 → switchback-0.5.0}/SECURITY.md +0 -0
  27. {switchback-0.4.0 → switchback-0.5.0}/clients/node_bridge.md +0 -0
  28. {switchback-0.4.0 → switchback-0.5.0}/clients/python_client.py +0 -0
  29. {switchback-0.4.0 → switchback-0.5.0}/config/botwall_skip_urls.txt +0 -0
  30. {switchback-0.4.0 → switchback-0.5.0}/config/extraction.example.json +0 -0
  31. {switchback-0.4.0 → switchback-0.5.0}/setup.cfg +0 -0
  32. {switchback-0.4.0 → switchback-0.5.0}/switchback/__init__.py +0 -0
  33. {switchback-0.4.0 → switchback-0.5.0}/switchback/__main__.py +0 -0
  34. {switchback-0.4.0 → switchback-0.5.0}/switchback/api.py +0 -0
  35. {switchback-0.4.0 → switchback-0.5.0}/switchback/concurrency.py +0 -0
  36. {switchback-0.4.0 → switchback-0.5.0}/switchback/content_cache.py +0 -0
  37. {switchback-0.4.0 → switchback-0.5.0}/switchback/egress.py +0 -0
  38. {switchback-0.4.0 → switchback-0.5.0}/switchback/extract.py +0 -0
  39. {switchback-0.4.0 → switchback-0.5.0}/switchback/normalize.py +0 -0
  40. {switchback-0.4.0 → switchback-0.5.0}/switchback/policy/__init__.py +0 -0
  41. {switchback-0.4.0 → switchback-0.5.0}/switchback/policy/gates.py +0 -0
  42. {switchback-0.4.0 → switchback-0.5.0}/switchback/py.typed +0 -0
  43. {switchback-0.4.0 → switchback-0.5.0}/switchback/search.py +0 -0
  44. {switchback-0.4.0 → switchback-0.5.0}/switchback/server.py +0 -0
  45. {switchback-0.4.0 → switchback-0.5.0}/switchback/session_cache.py +0 -0
  46. {switchback-0.4.0 → switchback-0.5.0}/switchback/session_trace.py +0 -0
  47. {switchback-0.4.0 → switchback-0.5.0}/switchback/tiers/_browser.py +0 -0
  48. {switchback-0.4.0 → switchback-0.5.0}/switchback/tracing.py +0 -0
  49. {switchback-0.4.0 → switchback-0.5.0}/switchback.egg-info/dependency_links.txt +0 -0
  50. {switchback-0.4.0 → switchback-0.5.0}/switchback.egg-info/entry_points.txt +0 -0
  51. {switchback-0.4.0 → switchback-0.5.0}/switchback.egg-info/requires.txt +0 -0
  52. {switchback-0.4.0 → switchback-0.5.0}/switchback.egg-info/top_level.txt +0 -0
@@ -11,7 +11,7 @@
11
11
  OTEL_SERVICE_NAME=switchback
12
12
  OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
13
13
 
14
- # ── Search (Tier-0 SearXNG, query → URLs) ───────────────────────────────────
14
+ # ── Search (SearXNG, query → URLs — separate from the fetch cascade) ─────────
15
15
  SEARXNG_URL=http://localhost:8888
16
16
 
17
17
  # ── Output format ───────────────────────────────────────────────────────────
@@ -27,29 +27,38 @@ SEARXNG_URL=http://localhost:8888
27
27
  # fall back to their text for those sources.
28
28
  SCRAPER_OUTPUT_FORMAT=markdown
29
29
 
30
- # ── Tier 2 · Cloudflare solver (cloudscraper) ───────────────────────────────
30
+ # ── tier_3 · Cloudflare solver (cloudscraper) ───────────────────────────────
31
31
  # Needs the 3.x Enhanced Edition fork (see README); with the frozen PyPI build
32
32
  # the tier reports `unavailable` and fails fast. Wall-clock cap on a single solve
33
33
  # so an unsolvable challenge can't eat the per-URL deadline before the browser
34
- # tier runs. Lower (e.g. 12) if Tier 2 rarely wins on your hosts.
35
- SCRAPER_CLOUDSCRAPER_TIMEOUT_S=25
34
+ # tier runs. Lower (e.g. 12) if tier_3 rarely wins on your hosts.
35
+ SCRAPER_TIER_3_TIMEOUT_S=25
36
36
 
37
- # ── Tier 2.5 · Jina Reader (r.jina.ai) ──────────────────────────────────────
38
- # Optional: keyless works at 20 RPM. A key gives 500 RPM + a 10M-token grant.
39
- JINA_API_KEY=
40
- SCRAPER_JINA_TIMEOUT_S=20
41
-
42
- # ── Tier 3b · Camoufox (Firefox stealth) ────────────────────────────────────
37
+ # ── tier_5 · Camoufox (Firefox stealth) ─────────────────────────────────────
43
38
  # ON by default. Needs: pip install camoufox && camoufox fetch
44
39
  # Set to 1 to turn the tier off entirely.
45
40
  SCRAPER_DISABLE_CAMOUFOX=
46
- SCRAPER_CAMOUFOX_TIMEOUT_MS=45000
41
+ SCRAPER_TIER_5_TIMEOUT_S=45
47
42
 
48
- # ── Tier 4 · Firecrawl (paid, last resort) ──────────────────────────────────
43
+ # ── tier_7 · Firecrawl (paid, last resort) ──────────────────────────────────
49
44
  # Required only if this tier runs. Set SCRAPER_DISABLE_FIRECRAWL=1 to skip it.
50
45
  FIRECRAWL_API_KEY=
51
46
  SCRAPER_DISABLE_FIRECRAWL=
52
47
 
48
+ # ── Per-tier timeouts (seconds) ─────────────────────────────────────────────
49
+ # Each tier's wall-clock/socket cap; override any of them. Defaults shown below.
50
+ # tier_3 (=25) and tier_5 (=45) are set live in their sections above; the rest
51
+ # fall back to these defaults. The pre-0.5.0 names are still honored when the new
52
+ # var is unset: SCRAPER_CLOUDSCRAPER_TIMEOUT_S → tier_3,
53
+ # SCRAPER_CAMOUFOX_TIMEOUT_MS → tier_5, SCRAPER_RESIDENTIAL_TIMEOUT_MS → tier_6 (legacy *_MS names take milliseconds; the new *_S vars take seconds).
54
+ #SCRAPER_TIER_1_TIMEOUT_S=15 # direct APIs / open mirrors
55
+ #SCRAPER_TIER_2_TIMEOUT_S=15 # plain HTTP + TLS impersonation
56
+ #SCRAPER_TIER_3_TIMEOUT_S=25 # cloudscraper (Cloudflare solver)
57
+ #SCRAPER_TIER_4_TIMEOUT_S=15 # stealth headless browser (patchright)
58
+ #SCRAPER_TIER_5_TIMEOUT_S=45 # camoufox (slowest rung; hard CF solves ~40s)
59
+ #SCRAPER_TIER_6_TIMEOUT_S=30 # residential-IP CDP browser
60
+ #SCRAPER_TIER_7_TIMEOUT_S=15 # Firecrawl (paid) — was unbounded; raise if scrapes get cut off
61
+
53
62
  # ── Orchestrator ────────────────────────────────────────────────────────────
54
63
  # Per-URL wall-clock budget (s), checked between tiers. 45s balances latency vs
55
64
  # coverage — roughly fits a Camoufox solve (~40s) that starts after the cheaper
@@ -6,6 +6,31 @@ versioning while pre-1.0.
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.5.0] - 2026-06-30
10
+
11
+ ### Changed
12
+ - **Tiers renamed to plain `tier_1`…`tier_7`** (cost-ordered, contiguous) in place
13
+ of the old mixed scheme (`tier0_apis`, `tier1_http`, `tier2_cloudscraper`,
14
+ `tier3_browser`, `tier3b_camoufox`, `tier_residential`, `tier4_firecrawl`). The
15
+ mapping is positional: `tier_1`=apis, `tier_2`=http, `tier_3`=cloudscraper,
16
+ `tier_4`=browser, `tier_5`=camoufox, `tier_6`=residential, `tier_7`=firecrawl.
17
+ **Backwards-compatible:** an existing `state/botwall_db.json` is migrated on load
18
+ (a host's learned `winning_tier` / `tier_stats` keys are remapped to the new
19
+ names), so routing survives the upgrade instead of re-probing from scratch.
20
+
21
+ ### Added
22
+ - **Per-tier timeout knobs** — every tier now reads `SCRAPER_TIER_<N>_TIMEOUT_S`
23
+ (seconds, `N` = 1–7). Defaults: `15` for tiers without a prior budget
24
+ (apis/http/browser/firecrawl), and the existing budgets are preserved — `tier_3`=25,
25
+ `tier_5`=45, `tier_6`=30. The previously-unconfigurable/unbounded tiers (apis,
26
+ http, browser, **firecrawl**) are now bounded and overridable. The pre-rename
27
+ `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` / `SCRAPER_CAMOUFOX_TIMEOUT_MS` /
28
+ `SCRAPER_RESIDENTIAL_TIMEOUT_MS` are still honored when the new var is unset.
29
+ Note: `tier_7` (paid Firecrawl) was previously unbounded; its new `15`s default
30
+ bounds it — raise `SCRAPER_TIER_7_TIMEOUT_S` if hard hosts get cut off (a scrape
31
+ killed at the cap may still be billed). `SCRAPER_TIER_RETRIES_<TIER>` overrides
32
+ follow the new names (e.g. `SCRAPER_TIER_RETRIES_TIER_4`).
33
+
9
34
  ## [0.4.0] - 2026-06-29
10
35
 
11
36
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: switchback
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
5
5
  Author-email: Akash Kodavuru <akash@theaklabs.com>
6
6
  License: MIT
@@ -121,13 +121,13 @@ That's the whole loop. Add tiers as you need them (see [Install](#install)).
121
121
 
122
122
  | Tier | Strategy | Cost |
123
123
  |---|---|---|
124
- | 0 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
125
- | 1 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
126
- | 2 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
127
- | 3 | Stealth headless browser (`patchright`, Chromium) | heavy |
128
- | 3b | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
129
- | 3c | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
130
- | 4 | Firecrawl (paid, env-gated, audited) | paid, last resort |
124
+ | tier_1 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
125
+ | tier_2 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
126
+ | tier_3 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
127
+ | tier_4 | Stealth headless browser (`patchright`, Chromium) | heavy |
128
+ | tier_5 | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
129
+ | tier_6 | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
130
+ | tier_7 | Firecrawl (paid, env-gated, audited) | paid, last resort |
131
131
 
132
132
  Every URL has a wall-clock budget (`SCRAPER_DEADLINE_S`, default 45s) checked between
133
133
  tiers so one URL can't run the whole cascade of timeouts. Each tier attempt records
@@ -142,17 +142,17 @@ Search (query → URLs) is separate from the scrape cascade: `switchback.search(
142
142
  ## Install
143
143
 
144
144
  ```bash
145
- pip install switchback # core: normalization + cheap tiers (0/1) + search
146
- pip install "switchback[cloudflare]" # + Tier 2 Cloudflare/anti-bot solver (cloudscraper)
145
+ pip install switchback # core: normalization + cheap tiers (tier_1/tier_2) + search
146
+ pip install "switchback[cloudflare]" # + tier_3 Cloudflare/anti-bot solver (cloudscraper)
147
147
  pip install "switchback[server]" # + HTTP service (fastapi, uvicorn) incl. /metrics + /traces
148
- pip install "switchback[browser]" && patchright install chromium # + Tier 3 stealth Chromium
149
- pip install "switchback[camoufox]" && camoufox fetch # + Tier 3b Firefox stealth
150
- pip install "switchback[firecrawl]" # + Tier 4 paid API (needs FIRECRAWL_API_KEY)
148
+ pip install "switchback[browser]" && patchright install chromium # + tier_4 stealth Chromium
149
+ pip install "switchback[camoufox]" && camoufox fetch # + tier_5 Firefox stealth
150
+ pip install "switchback[firecrawl]" # + tier_7 paid API (needs FIRECRAWL_API_KEY)
151
151
  pip install "switchback[tracing]" # + OpenTelemetry -> any OTLP backend
152
152
  pip install "switchback[all]" # everything
153
153
  ```
154
154
 
155
- For Tier 2's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
155
+ For tier_3's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
156
156
  3.x fork (PyPI's `cloudscraper` is the older v1/v2 — PyPI forbids pinning a
157
157
  git-URL dep inside a published package, so install it alongside):
158
158
 
@@ -170,19 +170,20 @@ and land *after* boot (e.g. an async install thread on Azure). Until they're
170
170
  ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
171
171
  fix) and the cascade falls through — they are never silently skipped. Checklist:
172
172
 
173
- - **Tier 3 is the real workhorse for Cloudflare/JS sites** — make sure its browser
173
+ - **tier_4 is the real workhorse for Cloudflare/JS sites** — make sure its browser
174
174
  is installed: `patchright install chromium` (note: **patchright**, not vanilla
175
175
  `playwright`). On a cold start, run this in your post-boot install step/thread;
176
- Tier 3 flips to ready once it finishes.
177
- - **Tier 2 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
176
+ tier_4 flips to ready once it finishes.
177
+ - **tier_3 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
178
178
  frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
179
- solve budget) instead of erroring mid-cascade. Tier 2 is a *weak* solver for
179
+ solve budget) instead of erroring mid-cascade. tier_3 is a *weak* solver for
180
180
  modern Cloudflare — treat it as a cheap try before the browser, not the primary.
181
- - **Install Node.js** for Tier 2's v3 JS-VM challenges — faster and thread-safe
181
+ - **Install Node.js** for tier_3's v3 JS-VM challenges — faster and thread-safe
182
182
  vs. the pure-Python js2py fallback (relevant under concurrent load).
183
- - **Bound Tier 2's solve budget** with `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` (default
184
- `25`) so an unsolvable challenge can't eat the per-URL deadline before the
185
- browser tier runs. Lower it (e.g. `12`) if Tier 2 rarely wins on your hosts.
183
+ - **Bound tier_3's solve budget** with `SCRAPER_TIER_3_TIMEOUT_S` (default `25`;
184
+ the old `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` is still honored) so an unsolvable
185
+ challenge can't eat the per-URL deadline before the browser tier runs. Lower it
186
+ (e.g. `12`) if tier_3 rarely wins on your hosts.
186
187
 
187
188
  **Verify readiness on the box** with the preflight check (doubles as a healthcheck
188
189
  — exit 0 when the capable tiers are ready):
@@ -282,16 +283,16 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
282
283
  <details>
283
284
  <summary><b>Env gates</b> — enable/disable tiers and integrations</summary>
284
285
 
285
- - `SCRAPER_DISABLE_FIRECRAWL` — skip Tier 4
286
- - `FIRECRAWL_API_KEY` — enable Tier 4
287
- - `SCRAPER_DISABLE_CAMOUFOX` — turn off Tier 3b (on by default; needs `pip install camoufox` + `camoufox fetch`)
288
- - `BU_CDP_URL` — enable Tier 3c residential browser by pointing at a CDP endpoint
286
+ - `SCRAPER_DISABLE_FIRECRAWL` — skip tier_7
287
+ - `FIRECRAWL_API_KEY` — enable tier_7
288
+ - `SCRAPER_DISABLE_CAMOUFOX` — turn off tier_5 (on by default; needs `pip install camoufox` + `camoufox fetch`)
289
+ - `BU_CDP_URL` — enable tier_6 residential browser by pointing at a CDP endpoint
289
290
  - `SCRAPER_PROXY` — route *all* tiers/URLs through a proxy
290
291
  - `SCRAPER_EGRESS_PROXY` — route only walled hosts through a proxy (see [Cost-scoped residential egress](#cost-scoped-residential-egress))
291
292
  - `SEARXNG_URL` — defaults to `http://localhost:8888`
292
293
  - `SCRAPER_STATE_DIR` — where the botwall DB/event log + session cache live
293
294
  - `SCRAPER_COOKIES_FILE` — Netscape `cookies.txt` to scrape login-gated hosts (injected into the HTTP and browser tiers)
294
- - `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into Tier 2 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
295
+ - `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into tier_3 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
295
296
  </details>
296
297
 
297
298
  <details>
@@ -300,7 +301,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
300
301
  - `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
301
302
  - `SCRAPER_DEADLINE_S` — per-URL budget (45s)
302
303
  - `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
303
- - `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
304
+ - `SCRAPER_TIER_<N>_TIMEOUT_S` — per-tier timeout in seconds, `N` = 1–7 (tier_1 apis · tier_2 http · tier_3 cloudscraper · tier_4 browser · tier_5 camoufox · tier_6 residential · tier_7 firecrawl). Defaults: 15/15/**25**/15/**45**/**30**/15 (the three bold ones keep their prior budgets; everything else is 15s). The pre-0.5.0 `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` / `SCRAPER_CAMOUFOX_TIMEOUT_MS` / `SCRAPER_RESIDENTIAL_TIMEOUT_MS` are still honored when the new var is unset. Note: `tier_7` (Firecrawl) was previously unbounded — its 15s default now bounds the paid tier, so raise `SCRAPER_TIER_7_TIMEOUT_S` if slow hosts get cut off
304
305
  - `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
305
306
  - `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
306
307
  - `SCRAPER_BOTWALL_EGRESS_AFTER` — local-tier failures before a host escalates to the residential tier (default 2)
@@ -308,7 +309,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
308
309
  - `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
309
310
  - `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
310
311
  - `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
311
- - `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER3_BROWSER=2`)
312
+ - `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER_4=2`)
312
313
  - `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
313
314
  - `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
314
315
  - `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
@@ -62,13 +62,13 @@ That's the whole loop. Add tiers as you need them (see [Install](#install)).
62
62
 
63
63
  | Tier | Strategy | Cost |
64
64
  |---|---|---|
65
- | 0 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
66
- | 1 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
67
- | 2 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
68
- | 3 | Stealth headless browser (`patchright`, Chromium) | heavy |
69
- | 3b | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
70
- | 3c | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
71
- | 4 | Firecrawl (paid, env-gated, audited) | paid, last resort |
65
+ | tier_1 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
66
+ | tier_2 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
67
+ | tier_3 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
68
+ | tier_4 | Stealth headless browser (`patchright`, Chromium) | heavy |
69
+ | tier_5 | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
70
+ | tier_6 | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
71
+ | tier_7 | Firecrawl (paid, env-gated, audited) | paid, last resort |
72
72
 
73
73
  Every URL has a wall-clock budget (`SCRAPER_DEADLINE_S`, default 45s) checked between
74
74
  tiers so one URL can't run the whole cascade of timeouts. Each tier attempt records
@@ -83,17 +83,17 @@ Search (query → URLs) is separate from the scrape cascade: `switchback.search(
83
83
  ## Install
84
84
 
85
85
  ```bash
86
- pip install switchback # core: normalization + cheap tiers (0/1) + search
87
- pip install "switchback[cloudflare]" # + Tier 2 Cloudflare/anti-bot solver (cloudscraper)
86
+ pip install switchback # core: normalization + cheap tiers (tier_1/tier_2) + search
87
+ pip install "switchback[cloudflare]" # + tier_3 Cloudflare/anti-bot solver (cloudscraper)
88
88
  pip install "switchback[server]" # + HTTP service (fastapi, uvicorn) incl. /metrics + /traces
89
- pip install "switchback[browser]" && patchright install chromium # + Tier 3 stealth Chromium
90
- pip install "switchback[camoufox]" && camoufox fetch # + Tier 3b Firefox stealth
91
- pip install "switchback[firecrawl]" # + Tier 4 paid API (needs FIRECRAWL_API_KEY)
89
+ pip install "switchback[browser]" && patchright install chromium # + tier_4 stealth Chromium
90
+ pip install "switchback[camoufox]" && camoufox fetch # + tier_5 Firefox stealth
91
+ pip install "switchback[firecrawl]" # + tier_7 paid API (needs FIRECRAWL_API_KEY)
92
92
  pip install "switchback[tracing]" # + OpenTelemetry -> any OTLP backend
93
93
  pip install "switchback[all]" # everything
94
94
  ```
95
95
 
96
- For Tier 2's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
96
+ For tier_3's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
97
97
  3.x fork (PyPI's `cloudscraper` is the older v1/v2 — PyPI forbids pinning a
98
98
  git-URL dep inside a published package, so install it alongside):
99
99
 
@@ -111,19 +111,20 @@ and land *after* boot (e.g. an async install thread on Azure). Until they're
111
111
  ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
112
112
  fix) and the cascade falls through — they are never silently skipped. Checklist:
113
113
 
114
- - **Tier 3 is the real workhorse for Cloudflare/JS sites** — make sure its browser
114
+ - **tier_4 is the real workhorse for Cloudflare/JS sites** — make sure its browser
115
115
  is installed: `patchright install chromium` (note: **patchright**, not vanilla
116
116
  `playwright`). On a cold start, run this in your post-boot install step/thread;
117
- Tier 3 flips to ready once it finishes.
118
- - **Tier 2 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
117
+ tier_4 flips to ready once it finishes.
118
+ - **tier_3 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
119
119
  frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
120
- solve budget) instead of erroring mid-cascade. Tier 2 is a *weak* solver for
120
+ solve budget) instead of erroring mid-cascade. tier_3 is a *weak* solver for
121
121
  modern Cloudflare — treat it as a cheap try before the browser, not the primary.
122
- - **Install Node.js** for Tier 2's v3 JS-VM challenges — faster and thread-safe
122
+ - **Install Node.js** for tier_3's v3 JS-VM challenges — faster and thread-safe
123
123
  vs. the pure-Python js2py fallback (relevant under concurrent load).
124
- - **Bound Tier 2's solve budget** with `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` (default
125
- `25`) so an unsolvable challenge can't eat the per-URL deadline before the
126
- browser tier runs. Lower it (e.g. `12`) if Tier 2 rarely wins on your hosts.
124
+ - **Bound tier_3's solve budget** with `SCRAPER_TIER_3_TIMEOUT_S` (default `25`;
125
+ the old `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` is still honored) so an unsolvable
126
+ challenge can't eat the per-URL deadline before the browser tier runs. Lower it
127
+ (e.g. `12`) if tier_3 rarely wins on your hosts.
127
128
 
128
129
  **Verify readiness on the box** with the preflight check (doubles as a healthcheck
129
130
  — exit 0 when the capable tiers are ready):
@@ -223,16 +224,16 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
223
224
  <details>
224
225
  <summary><b>Env gates</b> — enable/disable tiers and integrations</summary>
225
226
 
226
- - `SCRAPER_DISABLE_FIRECRAWL` — skip Tier 4
227
- - `FIRECRAWL_API_KEY` — enable Tier 4
228
- - `SCRAPER_DISABLE_CAMOUFOX` — turn off Tier 3b (on by default; needs `pip install camoufox` + `camoufox fetch`)
229
- - `BU_CDP_URL` — enable Tier 3c residential browser by pointing at a CDP endpoint
227
+ - `SCRAPER_DISABLE_FIRECRAWL` — skip tier_7
228
+ - `FIRECRAWL_API_KEY` — enable tier_7
229
+ - `SCRAPER_DISABLE_CAMOUFOX` — turn off tier_5 (on by default; needs `pip install camoufox` + `camoufox fetch`)
230
+ - `BU_CDP_URL` — enable tier_6 residential browser by pointing at a CDP endpoint
230
231
  - `SCRAPER_PROXY` — route *all* tiers/URLs through a proxy
231
232
  - `SCRAPER_EGRESS_PROXY` — route only walled hosts through a proxy (see [Cost-scoped residential egress](#cost-scoped-residential-egress))
232
233
  - `SEARXNG_URL` — defaults to `http://localhost:8888`
233
234
  - `SCRAPER_STATE_DIR` — where the botwall DB/event log + session cache live
234
235
  - `SCRAPER_COOKIES_FILE` — Netscape `cookies.txt` to scrape login-gated hosts (injected into the HTTP and browser tiers)
235
- - `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into Tier 2 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
236
+ - `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into tier_3 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
236
237
  </details>
237
238
 
238
239
  <details>
@@ -241,7 +242,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
241
242
  - `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
242
243
  - `SCRAPER_DEADLINE_S` — per-URL budget (45s)
243
244
  - `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
244
- - `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
245
+ - `SCRAPER_TIER_<N>_TIMEOUT_S` — per-tier timeout in seconds, `N` = 1–7 (tier_1 apis · tier_2 http · tier_3 cloudscraper · tier_4 browser · tier_5 camoufox · tier_6 residential · tier_7 firecrawl). Defaults: 15/15/**25**/15/**45**/**30**/15 (the three bold ones keep their prior budgets; everything else is 15s). The pre-0.5.0 `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` / `SCRAPER_CAMOUFOX_TIMEOUT_MS` / `SCRAPER_RESIDENTIAL_TIMEOUT_MS` are still honored when the new var is unset. Note: `tier_7` (Firecrawl) was previously unbounded — its 15s default now bounds the paid tier, so raise `SCRAPER_TIER_7_TIMEOUT_S` if slow hosts get cut off
245
246
  - `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
246
247
  - `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
247
248
  - `SCRAPER_BOTWALL_EGRESS_AFTER` — local-tier failures before a host escalates to the residential tier (default 2)
@@ -249,7 +250,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
249
250
  - `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
250
251
  - `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
251
252
  - `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
252
- - `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER3_BROWSER=2`)
253
+ - `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER_4=2`)
253
254
  - `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
254
255
  - `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
255
256
  - `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "switchback"
7
- version = "0.4.0"
7
+ version = "0.5.0"
8
8
  description = "One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -13,7 +13,7 @@ from __future__ import annotations
13
13
  import os
14
14
  import shutil
15
15
 
16
- from .tiers import tier2_cloudscraper, tier3_browser
16
+ from .tiers import tier_3, tier_4
17
17
 
18
18
 
19
19
  def _camoufox() -> tuple[bool, str]:
@@ -28,16 +28,16 @@ def _camoufox() -> tuple[bool, str]:
28
28
 
29
29
  def probe() -> list[tuple[str, bool, str]]:
30
30
  """(label, ok, detail) for each tier/dependency that matters at runtime."""
31
- cs_ok, cs_detail = tier2_cloudscraper.available()
32
- br_ok, br_detail = tier3_browser.available()
31
+ cs_ok, cs_detail = tier_3.available()
32
+ br_ok, br_detail = tier_4.available()
33
33
  node = shutil.which("node")
34
34
  return [
35
- ("tier2_cloudscraper", cs_ok, cs_detail),
36
- ("tier3_browser", br_ok, br_detail),
37
- ("tier3b_camoufox", *_camoufox()),
38
- ("node (tier2 v3 concurrency)", bool(node),
39
- node or "not on PATH — Tier 2 falls back to slower, thread-fragile js2py"),
40
- ("tier4_firecrawl", bool(os.getenv("FIRECRAWL_API_KEY")),
35
+ ("tier_3 (cloudscraper)", cs_ok, cs_detail),
36
+ ("tier_4 (browser)", br_ok, br_detail),
37
+ ("tier_5 (camoufox)", *_camoufox()),
38
+ ("node (tier_3 v3 concurrency)", bool(node),
39
+ node or "not on PATH — tier_3 falls back to slower, thread-fragile js2py"),
40
+ ("tier_7 (firecrawl)", bool(os.getenv("FIRECRAWL_API_KEY")),
41
41
  "FIRECRAWL_API_KEY set" if os.getenv("FIRECRAWL_API_KEY")
42
42
  else "off (no FIRECRAWL_API_KEY)"),
43
43
  ]
@@ -10,7 +10,7 @@ Run it from cron / the /loop skill / any scheduler:
10
10
  python -m switchback.flags --json # machine-readable digest
11
11
 
12
12
  What it flags:
13
- • domains still landing on paid Firecrawl (winning_tier == tier4_firecrawl)
13
+ • domains still landing on paid Firecrawl (winning_tier == tier_7)
14
14
  • domains escalated to residential egress (needs_egress)
15
15
  • domains throwing the most bot-wall challenges (by vendor)
16
16
  • low coverage / negative cost savings in the window
@@ -29,7 +29,7 @@ logger = logging.getLogger(__name__)
29
29
 
30
30
  # A domain is "stuck" if its winning tier is the paid one — these are the hosts
31
31
  # that still cost money and are the prime targets for a new tier / cookie / rule.
32
- _PAID_TIER = "tier4_firecrawl"
32
+ _PAID_TIER = "tier_7"
33
33
 
34
34
 
35
35
  def build_digest(minutes: int | None = None) -> dict:
@@ -172,7 +172,7 @@ def _start_index(url: str, db: dict) -> int:
172
172
  if botwall.needs_egress(host, db):
173
173
  if egress.has_egress_proxy():
174
174
  return 0
175
- res_i = INDEX.get("tier_residential")
175
+ res_i = INDEX.get("tier_6")
176
176
  if res_i is not None:
177
177
  disabled_fn = getattr(TIERS[res_i], "disabled", None)
178
178
  if not (disabled_fn and disabled_fn()):
@@ -298,7 +298,7 @@ def _run_cascade(url, host, db, root, t0, deadline) -> ScrapeOutcome:
298
298
  # solve attempt and go straight to the browser/egress tiers. A real CF
299
299
  # challenge surfaces as `botwall`, not `http_block`, so this never skips a
300
300
  # host cloudscraper could actually clear.
301
- if tier.NAME == "tier2_cloudscraper" and any(
301
+ if tier.NAME == "tier_3" and any(
302
302
  a.outcome == "http_block" for a in attempts):
303
303
  logger.info(f"{tier.NAME} skipped (prior hard IP block): {url}")
304
304
  attempts.append(TierAttempt(tier.NAME, "not_applicable"))
@@ -149,6 +149,40 @@ def _parse_skip_urls_file(path: str) -> dict[str, str]:
149
149
  return out
150
150
 
151
151
 
152
# Tiers were renamed to plain tier_1..tier_7 in 0.5.0. A pre-rename DB persists a
# host's learned winning_tier / tier_stats under the old names; migrate them on
# load so routing survives the upgrade instead of re-probing the cascade fresh.
_TIER_RENAME = {
    "tier0_apis": "tier_1",
    "tier1_http": "tier_2",
    "tier2_cloudscraper": "tier_3",
    "tier3_browser": "tier_4",
    "tier3b_camoufox": "tier_5",
    "tier_residential": "tier_6",
    "tier4_firecrawl": "tier_7",
}


def _migrate_tier_names(hosts: dict) -> bool:
    """Remap pre-rename tier names in persisted host records, in place.

    For every host record this rewrites ``winning_tier`` and the keys of
    ``tier_stats`` from the old tier names to their ``tier_N`` equivalents
    (see ``_TIER_RENAME``). Idempotent: a second call finds no old names and
    returns ``False``.

    The persisted DB is treated as untrusted input: records or stats entries
    that are not dicts, and entries missing an ``ok`` / ``miss`` counter, are
    tolerated instead of raising, so a slightly malformed DB cannot abort the
    load.

    Args:
        hosts: mapping of host -> persisted record; mutated in place.

    Returns:
        True if anything was rewritten, else False.
    """
    counters = ("ok", "miss")
    changed = False
    for rec in hosts.values():
        if not isinstance(rec, dict):
            continue  # malformed record: nothing to migrate

        winner = rec.get("winning_tier")
        # isinstance guard: an unhashable value would make `in` raise TypeError.
        if isinstance(winner, str) and winner in _TIER_RENAME:
            rec["winning_tier"] = _TIER_RENAME[winner]
            changed = True

        stats = rec.get("tier_stats")
        if not isinstance(stats, dict):
            continue

        for old, new in _TIER_RENAME.items():
            if old not in stats:
                continue
            src = stats.pop(old)
            changed = True  # the old key is gone either way
            if not isinstance(src, dict):
                src = {}
            dst = stats.get(new)
            if not isinstance(dst, dict):
                dst = stats[new] = {}
            # Merge into the new key if it somehow already exists, else move.
            # .get() keeps a partially-populated entry from raising KeyError.
            for key in counters:
                dst[key] = dst.get(key, 0) + src.get(key, 0)
            # Carry over any non-counter fields rather than dropping them;
            # values already stored under the new name win.
            for key, value in src.items():
                if key not in counters:
                    dst.setdefault(key, value)
    return changed
184
+
185
+
152
186
  def load_db() -> dict:
153
187
  db = {"version": 2, "updated_at": "", "hosts": {}, "urls": {}}
154
188
  if os.path.exists(DB_PATH):
@@ -159,7 +193,7 @@ def load_db() -> dict:
159
193
  logger.error(f"botwall: load failed ({e}); starting fresh")
160
194
  hosts = db.setdefault("hosts", {})
161
195
  urls = db.setdefault("urls", {})
162
- changed = False
196
+ changed = _migrate_tier_names(hosts)
163
197
  for host, reason in SEED_HOSTS.items():
164
198
  if host not in hosts:
165
199
  hosts[host] = _new_record(reason=reason, status="skip")
@@ -305,7 +339,7 @@ def _track_egress(host: str, tier: str, outcome: str, db: dict) -> None:
305
339
  unescalated. We don't count the residential tier's own misses (circular)."""
306
340
  if not PROMOTE_EGRESS_AFTER or outcome not in _EGRESS_OUTCOMES:
307
341
  return
308
- if tier == "tier_residential":
342
+ if tier == "tier_6": # residential egress — don't count its own misses
309
343
  return
310
344
  rec = db.get("hosts", {}).get(host)
311
345
  if rec is None or rec.get("needs_egress"):
@@ -39,7 +39,7 @@ FIRECRAWL_USD = float(os.getenv("BENCH_FIRECRAWL_USD", "0.001"))
39
39
  HARD_MULT = float(os.getenv("BENCH_FIRECRAWL_HARD_MULT", "5"))
40
40
 
41
41
  # Tiers whose win means Firecrawl would have billed the hard (stealth) rate.
42
- _HARD_TIERS = {"tier3_browser", "tier3b_camoufox", "tier_residential", "tier4_firecrawl"}
42
+ _HARD_TIERS = {"tier_4", "tier_5", "tier_6", "tier_7"}
43
43
 
44
44
 
45
45
  def _parse_ts(ts: str) -> datetime | None:
@@ -0,0 +1,24 @@
1
+ """The cost-ordered cascade. Each tier exposes:
2
+
3
+ NAME : str
4
+ PAID : bool # gated/audited if True
5
+ fetch(url) -> str | None # markdown on success; None if not
6
+ # applicable; raises on failure.
7
+
8
+ Order matters — cheapest/cleanest first, paid last.
9
+ """
10
+ from . import (tier_1, tier_2, tier_3, tier_4, tier_5, tier_6, tier_7)
11
+
12
+ # Cost-ordered, cheapest first. Plain names; role noted inline.
13
+ TIERS = [
14
+ tier_1, # direct APIs / open mirrors
15
+ tier_2, # plain HTTP with TLS impersonation
16
+ tier_3, # cloudscraper (Cloudflare/anti-bot solver)
17
+ tier_4, # stealth headless browser (patchright)
18
+ tier_5, # camoufox (Firefox stealth; on by default, orthogonal to tier_4)
19
+ tier_6, # residential-IP CDP browser (off unless BU_CDP_URL set)
20
+ tier_7, # Firecrawl (paid, last resort)
21
+ ]
22
+
23
+ # tier name -> index, for botwall winning-tier routing.
24
+ INDEX = {t.NAME: i for i, t in enumerate(TIERS)}
@@ -10,6 +10,7 @@ Web *search* (query → URLs) is a different shape and lives in switchback/searc
10
10
  """
11
11
  from __future__ import annotations
12
12
 
13
+ import os
13
14
  import re
14
15
  from urllib.parse import unquote
15
16
  from xml.etree import ElementTree as ET
@@ -17,9 +18,12 @@ from xml.etree import ElementTree as ET
17
18
  from ..normalize import html_to_markdown, UA
18
19
  from ..policy.gates import check
19
20
 
20
- NAME = "tier0_apis"
21
+ NAME = "tier_1"
21
22
  PAID = False
22
23
 
24
+ # Per-tier request timeout (seconds); override with SCRAPER_TIER_1_TIMEOUT_S.
25
+ _TIMEOUT_S = float(os.getenv("SCRAPER_TIER_1_TIMEOUT_S", "15"))
26
+
23
27
  ARXIV_RE = re.compile(r"arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})(?:v\d+)?(?:\.pdf)?", re.I)
24
28
  WIKI_RE = re.compile(r"en\.wikipedia\.org/wiki/([^?#]+)", re.I)
25
29
  PMC_RE = re.compile(r"pmc\.ncbi\.nlm\.nih\.gov/articles/(PMC\d+)", re.I)
@@ -42,7 +46,7 @@ def _arxiv(arxiv_id: str, url: str) -> str:
42
46
  # impersonating Chrome triggers aggressive 429s from their Akamai front-end.
43
47
  import requests
44
48
  r = requests.get(f"https://export.arxiv.org/api/query?id_list={arxiv_id}",
45
- timeout=15,
49
+ timeout=_TIMEOUT_S,
46
50
  headers={"User-Agent": "switchback/1.0 (mailto:akash@theaklabs.com)"})
47
51
  r.raise_for_status()
48
52
  ns = {"atom": "http://www.w3.org/2005/Atom"}
@@ -60,7 +64,7 @@ def _arxiv(arxiv_id: str, url: str) -> str:
60
64
  def _wikipedia(title: str, url: str) -> str:
61
65
  from curl_cffi import requests as cffi
62
66
  r = cffi.get(f"https://en.wikipedia.org/api/rest_v1/page/html/{unquote(title)}",
63
- timeout=15, impersonate="chrome")
67
+ timeout=_TIMEOUT_S, impersonate="chrome")
64
68
  r.raise_for_status()
65
69
  return check(url, html_to_markdown(r.text, base_url=url))
66
70
 
@@ -70,7 +74,7 @@ def _europepmc(url: str) -> str:
70
74
  import requests
71
75
  pmcid = PMC_RE.search(url).group(1)
72
76
  api = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
73
- r = requests.get(api, timeout=20, headers={"User-Agent": UA})
77
+ r = requests.get(api, timeout=_TIMEOUT_S, headers={"User-Agent": UA})
74
78
  r.raise_for_status()
75
79
  if len(r.text) < 1000:
76
80
  raise RuntimeError(f"europepmc empty: {len(r.text)}")
@@ -15,6 +15,7 @@ says one Chrome version, the header says another).
15
15
  from __future__ import annotations
16
16
 
17
17
  import hashlib
18
+ import os
18
19
  from urllib.parse import urlsplit
19
20
 
20
21
  from .. import session_cache
@@ -22,9 +23,12 @@ from ..egress import requests_proxies, add_wire_bytes
22
23
  from ..normalize import html_to_markdown, pdf_bytes_to_text
23
24
  from ..policy.gates import BotWall, check, is_cf_challenge
24
25
 
25
- NAME = "tier1_http"
26
+ NAME = "tier_2"
26
27
  PAID = False
27
28
 
29
+ # Per-tier request timeout (seconds); override with SCRAPER_TIER_2_TIMEOUT_S.
30
+ _TIMEOUT_S = float(os.getenv("SCRAPER_TIER_2_TIMEOUT_S", "15"))
31
+
28
32
  # Recent Chrome JA3 targets available in curl_cffi 0.15.x. A small spread of real
29
33
  # versions mirrors how live traffic is distributed across Chrome releases.
30
34
  _IMPERSONATE_TARGETS = ("chrome131", "chrome136", "chrome142")
@@ -43,7 +47,7 @@ def fetch(url: str) -> str:
43
47
  # against tier_2's distinct impersonate UA would be a mismatch tell.
44
48
  cookie = session_cache.cookie_header(url, include_cache=False)
45
49
  headers = {"Cookie": cookie} if cookie else None
46
- r = cffi.get(url, timeout=15, allow_redirects=True,
50
+ r = cffi.get(url, timeout=_TIMEOUT_S, allow_redirects=True,
47
51
  impersonate=_impersonate_for(url),
48
52
  proxies=requests_proxies(), headers=headers)
49
53
  add_wire_bytes(len(r.content)) # count even on a block — failed fetches burn bandwidth too
@@ -26,7 +26,7 @@ from ..policy.gates import Unavailable, check
26
26
 
27
27
  logger = logging.getLogger(__name__)
28
28
 
29
- NAME = "tier2_cloudscraper"
29
+ NAME = "tier_3"
30
30
  PAID = False
31
31
 
32
32
  # Install hint surfaced when cloudscraper is missing or the frozen PyPI 1.2.71
@@ -61,7 +61,10 @@ def available() -> tuple[bool, str]:
61
61
  # per-request socket timeout. Capping it here lets the cascade fall through to the
62
62
  # stealth browser (which can handle interactive challenges) instead of burning the
63
63
  # per-URL deadline. ~25s comfortably covers a real JS/v3 solve (~5-15s).
64
- _TIMEOUT_S = float(os.getenv("SCRAPER_CLOUDSCRAPER_TIMEOUT_S", "25"))
64
+ # Back-compat: honor the pre-0.5.0 SCRAPER_CLOUDSCRAPER_TIMEOUT_S if the new var
65
+ # is unset, so a tuned prod value survives the tier rename.
66
+ _TIMEOUT_S = float(os.getenv("SCRAPER_TIER_3_TIMEOUT_S",
67
+ os.getenv("SCRAPER_CLOUDSCRAPER_TIMEOUT_S", "25")))
65
68
 
66
69
  # Stealth pacing. Kept modest: tier_3 only fires on CF-suspected hosts, and the
67
70
  # real latency win comes from skipping the solve entirely on repeat hits (session
@@ -153,7 +156,7 @@ def fetch(url: str) -> str:
153
156
  except BaseException as e: # noqa: BLE001 — propagated to caller below
154
157
  box["err"] = e
155
158
 
156
- t = threading.Thread(target=work, name="tier2-cloudscraper", daemon=True)
159
+ t = threading.Thread(target=work, name="tier_3-cloudscraper", daemon=True)
157
160
  t.start()
158
161
  t.join(_TIMEOUT_S)
159
162
  if t.is_alive():
@@ -19,9 +19,12 @@ from ..egress import playwright_proxy, add_wire_bytes
19
19
  from ..normalize import html_to_markdown
20
20
  from ..policy.gates import Unavailable, check
21
21
 
22
- NAME = "tier3_browser"
22
+ NAME = "tier_4"
23
23
  PAID = False
24
24
 
25
+ # Per-tier navigation timeout (seconds); override with SCRAPER_TIER_4_TIMEOUT_S.
26
+ _TIMEOUT_S = float(os.getenv("SCRAPER_TIER_4_TIMEOUT_S", "15"))
27
+
25
28
  # Install hint surfaced when patchright or its Chromium isn't ready — notably
26
29
  # during an async cold-start install (the browser binary lands after boot).
27
30
  _INSTALL_HINT = 'pip install "switchback[browser]" && patchright install chromium'
@@ -47,7 +50,7 @@ def available() -> tuple[bool, str]:
47
50
  return True, "patchright + Chromium ready"
48
51
 
49
52
 
50
- def fetch(url: str, timeout_ms: int = 15000) -> str:
53
+ def fetch(url: str, timeout_ms: int = int(_TIMEOUT_S * 1000)) -> str:
51
54
  try:
52
55
  from patchright.sync_api import sync_playwright
53
56
  except ImportError:
@@ -25,10 +25,16 @@ from ..policy.gates import check
25
25
 
26
26
  logger = logging.getLogger(__name__)
27
27
 
28
- NAME = "tier3b_camoufox"
28
+ NAME = "tier_5"
29
29
  PAID = False
30
30
 
31
- _TIMEOUT_MS = int(os.getenv("SCRAPER_CAMOUFOX_TIMEOUT_MS", "45000"))
31
# Per-tier navigation timeout (seconds); override with SCRAPER_TIER_5_TIMEOUT_S.
# Default kept at 45s — camoufox is the slowest rung (~40s on a hard CF solve).
# Back-compat: honor the pre-0.5.0 SCRAPER_CAMOUFOX_TIMEOUT_MS (ms) if unset.
# Resolution order: new var (seconds) -> legacy var (ms) -> default. The legacy
# value is only parsed when it is actually used, so a stale/malformed legacy
# setting cannot break import once the new var is set. An empty value counts as
# unset for both variables.
_legacy_ms = os.getenv("SCRAPER_CAMOUFOX_TIMEOUT_MS")
_override_s = os.getenv("SCRAPER_TIER_5_TIMEOUT_S")
if _override_s:
    _TIMEOUT_S = float(_override_s)
elif _legacy_ms:
    _TIMEOUT_S = float(_legacy_ms) / 1000
else:
    _TIMEOUT_S = 45.0
# round(), not int(): the ms -> s -> ms float round trip can land just below the
# integer (1001 ms -> 1.001 s -> 1000.9999999999999), which int() would truncate.
_TIMEOUT_MS = round(_TIMEOUT_S * 1000)
32
38
 
33
39
 
34
40
  def disabled() -> bool:
@@ -22,10 +22,16 @@ from ..concurrency import browser_slot
22
22
  from ..normalize import html_to_markdown
23
23
  from ..policy.gates import check
24
24
 
25
- NAME = "tier_residential"
25
+ NAME = "tier_6"
26
26
  PAID = False
27
27
 
28
- _TIMEOUT_MS = int(os.getenv("SCRAPER_RESIDENTIAL_TIMEOUT_MS", "30000"))
28
# Per-tier navigation timeout (seconds); override with SCRAPER_TIER_6_TIMEOUT_S.
# Default kept at 30s — remote CDP over a residential proxy is slow to first paint.
# Back-compat: honor the pre-0.5.0 SCRAPER_RESIDENTIAL_TIMEOUT_MS (ms) if unset.
# Resolution order: new var (seconds) -> legacy var (ms) -> default. The legacy
# value is only parsed when it is actually used, so a stale/malformed legacy
# setting cannot break import once the new var is set. An empty value counts as
# unset for both variables.
_legacy_ms = os.getenv("SCRAPER_RESIDENTIAL_TIMEOUT_MS")
_override_s = os.getenv("SCRAPER_TIER_6_TIMEOUT_S")
if _override_s:
    _TIMEOUT_S = float(_override_s)
elif _legacy_ms:
    _TIMEOUT_S = float(_legacy_ms) / 1000
else:
    _TIMEOUT_S = 30.0
# round(), not int(): the ms -> s -> ms float round trip can land just below the
# integer (1001 ms -> 1.001 s -> 1000.9999999999999), which int() would truncate.
_TIMEOUT_MS = round(_TIMEOUT_S * 1000)
29
35
 
30
36
 
31
37
  def disabled() -> bool:
@@ -12,9 +12,15 @@ import threading
12
12
  from ..normalize import active_format, render
13
13
  from ..policy.gates import check
14
14
 
15
- NAME = "tier4_firecrawl"
15
+ NAME = "tier_7"
16
16
  PAID = True
17
17
 
18
+ # Per-tier wall-clock cap (seconds); override with SCRAPER_TIER_7_TIMEOUT_S.
19
+ # This paid last resort was previously unbounded; 15s bounds it like the rest, but
20
+ # Firecrawl scrapes can legitimately run longer — raise this if hard hosts get cut
21
+ # off at the finish line (you may still be billed for a scrape killed here).
22
+ _TIMEOUT_S = float(os.getenv("SCRAPER_TIER_7_TIMEOUT_S", "15"))
23
+
18
24
 
19
25
  def disabled() -> bool:
20
26
  return bool(os.getenv("SCRAPER_DISABLE_FIRECRAWL"))
@@ -49,9 +55,13 @@ def fetch(url: str) -> str:
49
55
  except BaseException as e: # noqa: BLE001 — re-raised to the caller below
50
56
  box["err"] = e
51
57
 
52
- t = threading.Thread(target=work, name="tier4-firecrawl", daemon=True)
58
+ t = threading.Thread(target=work, name="tier_7-firecrawl", daemon=True)
53
59
  t.start()
54
- t.join()
60
+ t.join(_TIMEOUT_S)
61
+ if t.is_alive():
62
+ raise TimeoutError(
63
+ f"firecrawl exceeded {_TIMEOUT_S}s "
64
+ "(raise SCRAPER_TIER_7_TIMEOUT_S for slow hosts)")
55
65
  if "err" in box:
56
66
  raise box["err"]
57
67
  return box["md"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: switchback
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
5
5
  Author-email: Akash Kodavuru <akash@theaklabs.com>
6
6
  License: MIT
@@ -121,13 +121,13 @@ That's the whole loop. Add tiers as you need them (see [Install](#install)).
121
121
 
122
122
  | Tier | Strategy | Cost |
123
123
  |---|---|---|
124
- | 0 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
125
- | 1 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
126
- | 2 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
127
- | 3 | Stealth headless browser (`patchright`, Chromium) | heavy |
128
- | 3b | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
129
- | 3c | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
130
- | 4 | Firecrawl (paid, env-gated, audited) | paid, last resort |
124
+ | tier_1 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
125
+ | tier_2 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
126
+ | tier_3 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
127
+ | tier_4 | Stealth headless browser (`patchright`, Chromium) | heavy |
128
+ | tier_5 | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
129
+ | tier_6 | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
130
+ | tier_7 | Firecrawl (paid, env-gated, audited) | paid, last resort |
131
131
 
132
132
  Every URL has a wall-clock budget (`SCRAPER_DEADLINE_S`, default 45s) checked between
133
133
  tiers so one URL can't run the whole cascade of timeouts. Each tier attempt records
@@ -142,17 +142,17 @@ Search (query → URLs) is separate from the scrape cascade: `switchback.search(
142
142
  ## Install
143
143
 
144
144
  ```bash
145
- pip install switchback # core: normalization + cheap tiers (0/1) + search
146
- pip install "switchback[cloudflare]" # + Tier 2 Cloudflare/anti-bot solver (cloudscraper)
145
+ pip install switchback # core: normalization + cheap tiers (tier_1/tier_2) + search
146
+ pip install "switchback[cloudflare]" # + tier_3 Cloudflare/anti-bot solver (cloudscraper)
147
147
  pip install "switchback[server]" # + HTTP service (fastapi, uvicorn) incl. /metrics + /traces
148
- pip install "switchback[browser]" && patchright install chromium # + Tier 3 stealth Chromium
149
- pip install "switchback[camoufox]" && camoufox fetch # + Tier 3b Firefox stealth
150
- pip install "switchback[firecrawl]" # + Tier 4 paid API (needs FIRECRAWL_API_KEY)
148
+ pip install "switchback[browser]" && patchright install chromium # + tier_4 stealth Chromium
149
+ pip install "switchback[camoufox]" && camoufox fetch # + tier_5 Firefox stealth
150
+ pip install "switchback[firecrawl]" # + tier_7 paid API (needs FIRECRAWL_API_KEY)
151
151
  pip install "switchback[tracing]" # + OpenTelemetry -> any OTLP backend
152
152
  pip install "switchback[all]" # everything
153
153
  ```
154
154
 
155
- For Tier 2's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
155
+ For tier_3's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
156
156
  3.x fork (PyPI's `cloudscraper` is the older v1/v2 — PyPI forbids pinning a
157
157
  git-URL dep inside a published package, so install it alongside):
158
158
 
@@ -170,19 +170,20 @@ and land *after* boot (e.g. an async install thread on Azure). Until they're
170
170
  ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
171
171
  fix) and the cascade falls through — they are never silently skipped. Checklist:
172
172
 
173
- - **Tier 3 is the real workhorse for Cloudflare/JS sites** — make sure its browser
173
+ - **tier_4 is the real workhorse for Cloudflare/JS sites** — make sure its browser
174
174
  is installed: `patchright install chromium` (note: **patchright**, not vanilla
175
175
  `playwright`). On a cold start, run this in your post-boot install step/thread;
176
- Tier 3 flips to ready once it finishes.
177
- - **Tier 2 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
176
+ tier_4 flips to ready once it finishes.
177
+ - **tier_3 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
178
178
  frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
179
- solve budget) instead of erroring mid-cascade. Tier 2 is a *weak* solver for
179
+ solve budget) instead of erroring mid-cascade. tier_3 is a *weak* solver for
180
180
  modern Cloudflare — treat it as a cheap try before the browser, not the primary.
181
- - **Install Node.js** for Tier 2's v3 JS-VM challenges — faster and thread-safe
181
+ - **Install Node.js** for tier_3's v3 JS-VM challenges — faster and thread-safe
182
182
  vs. the pure-Python js2py fallback (relevant under concurrent load).
183
- - **Bound Tier 2's solve budget** with `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` (default
184
- `25`) so an unsolvable challenge can't eat the per-URL deadline before the
185
- browser tier runs. Lower it (e.g. `12`) if Tier 2 rarely wins on your hosts.
183
+ - **Bound tier_3's solve budget** with `SCRAPER_TIER_3_TIMEOUT_S` (default `25`;
184
+ the old `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` is still honored) so an unsolvable
185
+ challenge can't eat the per-URL deadline before the browser tier runs. Lower it
186
+ (e.g. `12`) if tier_3 rarely wins on your hosts.
186
187
 
187
188
  **Verify readiness on the box** with the preflight check (doubles as a healthcheck
188
189
  — exit 0 when the capable tiers are ready):
@@ -282,16 +283,16 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
282
283
  <details>
283
284
  <summary><b>Env gates</b> — enable/disable tiers and integrations</summary>
284
285
 
285
- - `SCRAPER_DISABLE_FIRECRAWL` — skip Tier 4
286
- - `FIRECRAWL_API_KEY` — enable Tier 4
287
- - `SCRAPER_DISABLE_CAMOUFOX` — turn off Tier 3b (on by default; needs `pip install camoufox` + `camoufox fetch`)
288
- - `BU_CDP_URL` — enable Tier 3c residential browser by pointing at a CDP endpoint
286
+ - `SCRAPER_DISABLE_FIRECRAWL` — skip tier_7
287
+ - `FIRECRAWL_API_KEY` — enable tier_7
288
+ - `SCRAPER_DISABLE_CAMOUFOX` — turn off tier_5 (on by default; needs `pip install camoufox` + `camoufox fetch`)
289
+ - `BU_CDP_URL` — enable tier_6 residential browser by pointing at a CDP endpoint
289
290
  - `SCRAPER_PROXY` — route *all* tiers/URLs through a proxy
290
291
  - `SCRAPER_EGRESS_PROXY` — route only walled hosts through a proxy (see [Cost-scoped residential egress](#cost-scoped-residential-egress))
291
292
  - `SEARXNG_URL` — defaults to `http://localhost:8888`
292
293
  - `SCRAPER_STATE_DIR` — where the botwall DB/event log + session cache live
293
294
  - `SCRAPER_COOKIES_FILE` — Netscape `cookies.txt` to scrape login-gated hosts (injected into the HTTP and browser tiers)
294
- - `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into Tier 2 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
295
+ - `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into tier_3 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
295
296
  </details>
296
297
 
297
298
  <details>
@@ -300,7 +301,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
300
301
  - `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
301
302
  - `SCRAPER_DEADLINE_S` — per-URL budget (45s)
302
303
  - `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
303
- - `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
304
+ - `SCRAPER_TIER_<N>_TIMEOUT_S` — per-tier timeout in seconds, `N` = 1–7 (tier_1 apis · tier_2 http · tier_3 cloudscraper · tier_4 browser · tier_5 camoufox · tier_6 residential · tier_7 firecrawl). Defaults: 15/15/**25**/15/**45**/**30**/15 (the three bold ones keep their prior budgets; everything else is 15s). The pre-0.5.0 `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` / `SCRAPER_CAMOUFOX_TIMEOUT_MS` / `SCRAPER_RESIDENTIAL_TIMEOUT_MS` are still honored when the new var is unset. Note: `tier_7` (Firecrawl) was previously unbounded — its 15s default now bounds the paid tier, so raise `SCRAPER_TIER_7_TIMEOUT_S` if slow hosts get cut off
304
305
  - `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
305
306
  - `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
306
307
  - `SCRAPER_BOTWALL_EGRESS_AFTER` — local-tier failures before a host escalates to the residential tier (default 2)
@@ -308,7 +309,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
308
309
  - `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
309
310
  - `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
310
311
  - `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
311
- - `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER3_BROWSER=2`)
312
+ - `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER_4=2`)
312
313
  - `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
313
314
  - `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
314
315
  - `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
@@ -40,10 +40,10 @@ switchback/policy/botwall.py
40
40
  switchback/policy/gates.py
41
41
  switchback/tiers/__init__.py
42
42
  switchback/tiers/_browser.py
43
- switchback/tiers/tier0_apis.py
44
- switchback/tiers/tier1_http.py
45
- switchback/tiers/tier2_cloudscraper.py
46
- switchback/tiers/tier3_browser.py
47
- switchback/tiers/tier3b_camoufox.py
48
- switchback/tiers/tier4_firecrawl.py
49
- switchback/tiers/tier_residential.py
43
+ switchback/tiers/tier_1.py
44
+ switchback/tiers/tier_2.py
45
+ switchback/tiers/tier_3.py
46
+ switchback/tiers/tier_4.py
47
+ switchback/tiers/tier_5.py
48
+ switchback/tiers/tier_6.py
49
+ switchback/tiers/tier_7.py
@@ -1,24 +0,0 @@
1
- """The cost-ordered cascade. Each tier exposes:
2
-
3
- NAME : str
4
- PAID : bool # gated/audited if True
5
- fetch(url) -> str | None # markdown on success; None if not
6
- # applicable; raises on failure.
7
-
8
- Order matters — cheapest/cleanest first, paid last.
9
- """
10
- from . import (tier0_apis, tier1_http, tier2_cloudscraper,
11
- tier3_browser, tier3b_camoufox, tier_residential, tier4_firecrawl)
12
-
13
- TIERS = [
14
- tier0_apis,
15
- tier1_http,
16
- tier2_cloudscraper,
17
- tier3_browser,
18
- tier3b_camoufox, # env-gated Firefox stealth (off by default; orthogonal to T3)
19
- tier_residential, # residential-IP CDP browser (off unless BU_CDP_URL set)
20
- tier4_firecrawl,
21
- ]
22
-
23
- # tier name -> index, for botwall winning-tier routing.
24
- INDEX = {t.NAME: i for i, t in enumerate(TIERS)}
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes