switchback 0.4.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {switchback-0.4.0 → switchback-0.5.0}/.env.example +21 -12
- {switchback-0.4.0 → switchback-0.5.0}/CHANGELOG.md +25 -0
- {switchback-0.4.0 → switchback-0.5.0}/PKG-INFO +30 -29
- {switchback-0.4.0 → switchback-0.5.0}/README.md +29 -28
- {switchback-0.4.0 → switchback-0.5.0}/pyproject.toml +1 -1
- {switchback-0.4.0 → switchback-0.5.0}/switchback/doctor.py +9 -9
- {switchback-0.4.0 → switchback-0.5.0}/switchback/flags.py +2 -2
- {switchback-0.4.0 → switchback-0.5.0}/switchback/orchestrator.py +2 -2
- {switchback-0.4.0 → switchback-0.5.0}/switchback/policy/botwall.py +36 -2
- {switchback-0.4.0 → switchback-0.5.0}/switchback/reporting.py +1 -1
- switchback-0.5.0/switchback/tiers/__init__.py +24 -0
- switchback-0.4.0/switchback/tiers/tier0_apis.py → switchback-0.5.0/switchback/tiers/tier_1.py +8 -4
- switchback-0.4.0/switchback/tiers/tier1_http.py → switchback-0.5.0/switchback/tiers/tier_2.py +6 -2
- switchback-0.4.0/switchback/tiers/tier2_cloudscraper.py → switchback-0.5.0/switchback/tiers/tier_3.py +6 -3
- switchback-0.4.0/switchback/tiers/tier3_browser.py → switchback-0.5.0/switchback/tiers/tier_4.py +5 -2
- switchback-0.4.0/switchback/tiers/tier3b_camoufox.py → switchback-0.5.0/switchback/tiers/tier_5.py +8 -2
- switchback-0.4.0/switchback/tiers/tier_residential.py → switchback-0.5.0/switchback/tiers/tier_6.py +8 -2
- switchback-0.4.0/switchback/tiers/tier4_firecrawl.py → switchback-0.5.0/switchback/tiers/tier_7.py +13 -3
- {switchback-0.4.0 → switchback-0.5.0}/switchback.egg-info/PKG-INFO +30 -29
- {switchback-0.4.0 → switchback-0.5.0}/switchback.egg-info/SOURCES.txt +7 -7
- switchback-0.4.0/switchback/tiers/__init__.py +0 -24
- {switchback-0.4.0 → switchback-0.5.0}/CONTRIBUTING.md +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/LICENSE +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/MANIFEST.in +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/NOTICE +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/SECURITY.md +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/clients/node_bridge.md +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/clients/python_client.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/config/botwall_skip_urls.txt +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/config/extraction.example.json +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/setup.cfg +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/__init__.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/__main__.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/api.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/concurrency.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/content_cache.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/egress.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/extract.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/normalize.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/policy/__init__.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/policy/gates.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/py.typed +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/search.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/server.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/session_cache.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/session_trace.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/tiers/_browser.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback/tracing.py +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback.egg-info/dependency_links.txt +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback.egg-info/entry_points.txt +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback.egg-info/requires.txt +0 -0
- {switchback-0.4.0 → switchback-0.5.0}/switchback.egg-info/top_level.txt +0 -0
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
OTEL_SERVICE_NAME=switchback
|
|
12
12
|
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
13
13
|
|
|
14
|
-
# ── Search (
|
|
14
|
+
# ── Search (SearXNG, query → URLs — separate from the fetch cascade) ─────────
|
|
15
15
|
SEARXNG_URL=http://localhost:8888
|
|
16
16
|
|
|
17
17
|
# ── Output format ───────────────────────────────────────────────────────────
|
|
@@ -27,29 +27,38 @@ SEARXNG_URL=http://localhost:8888
|
|
|
27
27
|
# fall back to their text for those sources.
|
|
28
28
|
SCRAPER_OUTPUT_FORMAT=markdown
|
|
29
29
|
|
|
30
|
-
# ──
|
|
30
|
+
# ── tier_3 · Cloudflare solver (cloudscraper) ───────────────────────────────
|
|
31
31
|
# Needs the 3.x Enhanced Edition fork (see README); with the frozen PyPI build
|
|
32
32
|
# the tier reports `unavailable` and fails fast. Wall-clock cap on a single solve
|
|
33
33
|
# so an unsolvable challenge can't eat the per-URL deadline before the browser
|
|
34
|
-
# tier runs. Lower (e.g. 12) if
|
|
35
|
-
|
|
34
|
+
# tier runs. Lower (e.g. 12) if tier_3 rarely wins on your hosts.
|
|
35
|
+
SCRAPER_TIER_3_TIMEOUT_S=25
|
|
36
36
|
|
|
37
|
-
# ──
|
|
38
|
-
# Optional: keyless works at 20 RPM. A key gives 500 RPM + a 10M-token grant.
|
|
39
|
-
JINA_API_KEY=
|
|
40
|
-
SCRAPER_JINA_TIMEOUT_S=20
|
|
41
|
-
|
|
42
|
-
# ── Tier 3b · Camoufox (Firefox stealth) ────────────────────────────────────
|
|
37
|
+
# ── tier_5 · Camoufox (Firefox stealth) ─────────────────────────────────────
|
|
43
38
|
# ON by default. Needs: pip install camoufox && camoufox fetch
|
|
44
39
|
# Set to 1 to turn the tier off entirely.
|
|
45
40
|
SCRAPER_DISABLE_CAMOUFOX=
|
|
46
|
-
|
|
41
|
+
SCRAPER_TIER_5_TIMEOUT_S=45
|
|
47
42
|
|
|
48
|
-
# ──
|
|
43
|
+
# ── tier_7 · Firecrawl (paid, last resort) ──────────────────────────────────
|
|
49
44
|
# Required only if this tier runs. Set SCRAPER_DISABLE_FIRECRAWL=1 to skip it.
|
|
50
45
|
FIRECRAWL_API_KEY=
|
|
51
46
|
SCRAPER_DISABLE_FIRECRAWL=
|
|
52
47
|
|
|
48
|
+
# ── Per-tier timeouts (seconds) ─────────────────────────────────────────────
|
|
49
|
+
# Each tier's wall-clock/socket cap; override any of them. Defaults shown below.
|
|
50
|
+
# tier_3 (=25) and tier_5 (=45) are set live in their sections above; the rest
|
|
51
|
+
# fall back to these defaults. The pre-0.5.0 names are still honored when the new
|
|
52
|
+
# var is unset: SCRAPER_CLOUDSCRAPER_TIMEOUT_S → tier_3,
|
|
53
|
+
# SCRAPER_CAMOUFOX_TIMEOUT_MS → tier_5, SCRAPER_RESIDENTIAL_TIMEOUT_MS → tier_6.
|
|
54
|
+
#SCRAPER_TIER_1_TIMEOUT_S=15 # direct APIs / open mirrors
|
|
55
|
+
#SCRAPER_TIER_2_TIMEOUT_S=15 # plain HTTP + TLS impersonation
|
|
56
|
+
#SCRAPER_TIER_3_TIMEOUT_S=25 # cloudscraper (Cloudflare solver)
|
|
57
|
+
#SCRAPER_TIER_4_TIMEOUT_S=15 # stealth headless browser (patchright)
|
|
58
|
+
#SCRAPER_TIER_5_TIMEOUT_S=45 # camoufox (slowest rung; hard CF solves ~40s)
|
|
59
|
+
#SCRAPER_TIER_6_TIMEOUT_S=30 # residential-IP CDP browser
|
|
60
|
+
#SCRAPER_TIER_7_TIMEOUT_S=15 # Firecrawl (paid) — was unbounded; raise if scrapes get cut off
|
|
61
|
+
|
|
53
62
|
# ── Orchestrator ────────────────────────────────────────────────────────────
|
|
54
63
|
# Per-URL wall-clock budget (s), checked between tiers. 45s balances latency vs
|
|
55
64
|
# coverage — roughly fits a Camoufox solve (~40s) that starts after the cheaper
|
|
@@ -6,6 +6,31 @@ versioning while pre-1.0.
|
|
|
6
6
|
|
|
7
7
|
## [Unreleased]
|
|
8
8
|
|
|
9
|
+
## [0.5.0] - 2026-06-30
|
|
10
|
+
|
|
11
|
+
### Changed
|
|
12
|
+
- **Tiers renamed to plain `tier_1`…`tier_7`** (cost-ordered, contiguous) in place
|
|
13
|
+
of the old mixed scheme (`tier0_apis`, `tier1_http`, `tier2_cloudscraper`,
|
|
14
|
+
`tier3_browser`, `tier3b_camoufox`, `tier_residential`, `tier4_firecrawl`). The
|
|
15
|
+
mapping is positional: `tier_1`=apis, `tier_2`=http, `tier_3`=cloudscraper,
|
|
16
|
+
`tier_4`=browser, `tier_5`=camoufox, `tier_6`=residential, `tier_7`=firecrawl.
|
|
17
|
+
**Backwards-compatible:** an existing `state/botwall_db.json` is migrated on load
|
|
18
|
+
(a host's learned `winning_tier` / `tier_stats` keys are remapped to the new
|
|
19
|
+
names), so routing survives the upgrade instead of re-probing from scratch.
|
|
20
|
+
|
|
21
|
+
### Added
|
|
22
|
+
- **Per-tier timeout knobs** — every tier now reads `SCRAPER_TIER_<N>_TIMEOUT_S`
|
|
23
|
+
(seconds, `N` = 1–7). Defaults: `15` for tiers without a prior budget
|
|
24
|
+
(apis/http/browser/firecrawl), and the existing budgets are preserved — `tier_3`=25,
|
|
25
|
+
`tier_5`=45, `tier_6`=30. The previously-unconfigurable/unbounded tiers (apis,
|
|
26
|
+
http, browser, **firecrawl**) are now bounded and overridable. The pre-rename
|
|
27
|
+
`SCRAPER_CLOUDSCRAPER_TIMEOUT_S` / `SCRAPER_CAMOUFOX_TIMEOUT_MS` /
|
|
28
|
+
`SCRAPER_RESIDENTIAL_TIMEOUT_MS` are still honored when the new var is unset.
|
|
29
|
+
Note: `tier_7` (paid Firecrawl) was previously unbounded; its new `15`s default
|
|
30
|
+
bounds it — raise `SCRAPER_TIER_7_TIMEOUT_S` if hard hosts get cut off (a scrape
|
|
31
|
+
killed at the cap may still be billed). `SCRAPER_TIER_RETRIES_<TIER>` overrides
|
|
32
|
+
follow the new names (e.g. `SCRAPER_TIER_RETRIES_TIER_4`).
|
|
33
|
+
|
|
9
34
|
## [0.4.0] - 2026-06-29
|
|
10
35
|
|
|
11
36
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: switchback
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
|
|
5
5
|
Author-email: Akash Kodavuru <akash@theaklabs.com>
|
|
6
6
|
License: MIT
|
|
@@ -121,13 +121,13 @@ That's the whole loop. Add tiers as you need them (see [Install](#install)).
|
|
|
121
121
|
|
|
122
122
|
| Tier | Strategy | Cost |
|
|
123
123
|
|---|---|---|
|
|
124
|
-
|
|
|
125
|
-
|
|
|
126
|
-
|
|
|
127
|
-
|
|
|
128
|
-
|
|
|
129
|
-
|
|
|
130
|
-
|
|
|
124
|
+
| tier_1 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
|
|
125
|
+
| tier_2 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
|
|
126
|
+
| tier_3 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
|
|
127
|
+
| tier_4 | Stealth headless browser (`patchright`, Chromium) | heavy |
|
|
128
|
+
| tier_5 | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
|
|
129
|
+
| tier_6 | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
|
|
130
|
+
| tier_7 | Firecrawl (paid, env-gated, audited) | paid, last resort |
|
|
131
131
|
|
|
132
132
|
Every URL has a wall-clock budget (`SCRAPER_DEADLINE_S`, default 45s) checked between
|
|
133
133
|
tiers so one URL can't run the whole cascade of timeouts. Each tier attempt records
|
|
@@ -142,17 +142,17 @@ Search (query → URLs) is separate from the scrape cascade: `switchback.search(
|
|
|
142
142
|
## Install
|
|
143
143
|
|
|
144
144
|
```bash
|
|
145
|
-
pip install switchback # core: normalization + cheap tiers (
|
|
146
|
-
pip install "switchback[cloudflare]" # +
|
|
145
|
+
pip install switchback # core: normalization + cheap tiers (tier_1/tier_2) + search
|
|
146
|
+
pip install "switchback[cloudflare]" # + tier_3 Cloudflare/anti-bot solver (cloudscraper)
|
|
147
147
|
pip install "switchback[server]" # + HTTP service (fastapi, uvicorn) incl. /metrics + /traces
|
|
148
|
-
pip install "switchback[browser]" && patchright install chromium # +
|
|
149
|
-
pip install "switchback[camoufox]" && camoufox fetch # +
|
|
150
|
-
pip install "switchback[firecrawl]" # +
|
|
148
|
+
pip install "switchback[browser]" && patchright install chromium # + tier_4 stealth Chromium
|
|
149
|
+
pip install "switchback[camoufox]" && camoufox fetch # + tier_5 Firefox stealth
|
|
150
|
+
pip install "switchback[firecrawl]" # + tier_7 paid API (needs FIRECRAWL_API_KEY)
|
|
151
151
|
pip install "switchback[tracing]" # + OpenTelemetry -> any OTLP backend
|
|
152
152
|
pip install "switchback[all]" # everything
|
|
153
153
|
```
|
|
154
154
|
|
|
155
|
-
For
|
|
155
|
+
For tier_3's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
|
|
156
156
|
3.x fork (PyPI's `cloudscraper` is the older v1/v2 — PyPI forbids pinning a
|
|
157
157
|
git-URL dep inside a published package, so install it alongside):
|
|
158
158
|
|
|
@@ -170,19 +170,20 @@ and land *after* boot (e.g. an async install thread on Azure). Until they're
|
|
|
170
170
|
ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
|
|
171
171
|
fix) and the cascade falls through — they are never silently skipped. Checklist:
|
|
172
172
|
|
|
173
|
-
- **
|
|
173
|
+
- **tier_4 is the real workhorse for Cloudflare/JS sites** — make sure its browser
|
|
174
174
|
is installed: `patchright install chromium` (note: **patchright**, not vanilla
|
|
175
175
|
`playwright`). On a cold start, run this in your post-boot install step/thread;
|
|
176
|
-
|
|
177
|
-
- **
|
|
176
|
+
tier_4 flips to ready once it finishes.
|
|
177
|
+
- **tier_3 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
|
|
178
178
|
frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
|
|
179
|
-
solve budget) instead of erroring mid-cascade.
|
|
179
|
+
solve budget) instead of erroring mid-cascade. tier_3 is a *weak* solver for
|
|
180
180
|
modern Cloudflare — treat it as a cheap try before the browser, not the primary.
|
|
181
|
-
- **Install Node.js** for
|
|
181
|
+
- **Install Node.js** for tier_3's v3 JS-VM challenges — faster and thread-safe
|
|
182
182
|
vs. the pure-Python js2py fallback (relevant under concurrent load).
|
|
183
|
-
- **Bound
|
|
184
|
-
`
|
|
185
|
-
|
|
183
|
+
- **Bound tier_3's solve budget** with `SCRAPER_TIER_3_TIMEOUT_S` (default `25`;
|
|
184
|
+
the old `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` is still honored) so an unsolvable
|
|
185
|
+
challenge can't eat the per-URL deadline before the browser tier runs. Lower it
|
|
186
|
+
(e.g. `12`) if tier_3 rarely wins on your hosts.
|
|
186
187
|
|
|
187
188
|
**Verify readiness on the box** with the preflight check (doubles as a healthcheck
|
|
188
189
|
— exit 0 when the capable tiers are ready):
|
|
@@ -282,16 +283,16 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
282
283
|
<details>
|
|
283
284
|
<summary><b>Env gates</b> — enable/disable tiers and integrations</summary>
|
|
284
285
|
|
|
285
|
-
- `SCRAPER_DISABLE_FIRECRAWL` — skip
|
|
286
|
-
- `FIRECRAWL_API_KEY` — enable
|
|
287
|
-
- `SCRAPER_DISABLE_CAMOUFOX` — turn off
|
|
288
|
-
- `BU_CDP_URL` — enable
|
|
286
|
+
- `SCRAPER_DISABLE_FIRECRAWL` — skip tier_7
|
|
287
|
+
- `FIRECRAWL_API_KEY` — enable tier_7
|
|
288
|
+
- `SCRAPER_DISABLE_CAMOUFOX` — turn off tier_5 (on by default; needs `pip install camoufox` + `camoufox fetch`)
|
|
289
|
+
- `BU_CDP_URL` — enable tier_6 residential browser by pointing at a CDP endpoint
|
|
289
290
|
- `SCRAPER_PROXY` — route *all* tiers/URLs through a proxy
|
|
290
291
|
- `SCRAPER_EGRESS_PROXY` — route only walled hosts through a proxy (see [Cost-scoped residential egress](#cost-scoped-residential-egress))
|
|
291
292
|
- `SEARXNG_URL` — defaults to `http://localhost:8888`
|
|
292
293
|
- `SCRAPER_STATE_DIR` — where the botwall DB/event log + session cache live
|
|
293
294
|
- `SCRAPER_COOKIES_FILE` — Netscape `cookies.txt` to scrape login-gated hosts (injected into the HTTP and browser tiers)
|
|
294
|
-
- `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into
|
|
295
|
+
- `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into tier_3 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
|
|
295
296
|
</details>
|
|
296
297
|
|
|
297
298
|
<details>
|
|
@@ -300,7 +301,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
300
301
|
- `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
|
|
301
302
|
- `SCRAPER_DEADLINE_S` — per-URL budget (45s)
|
|
302
303
|
- `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
|
|
303
|
-
- `
|
|
304
|
+
- `SCRAPER_TIER_<N>_TIMEOUT_S` — per-tier timeout in seconds, `N` = 1–7 (tier_1 apis · tier_2 http · tier_3 cloudscraper · tier_4 browser · tier_5 camoufox · tier_6 residential · tier_7 firecrawl). Defaults: 15/15/**25**/15/**45**/**30**/15 (the three bold ones keep their prior budgets; everything else is 15s). The pre-0.5.0 `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` / `SCRAPER_CAMOUFOX_TIMEOUT_MS` / `SCRAPER_RESIDENTIAL_TIMEOUT_MS` are still honored when the new var is unset. Note: `tier_7` (Firecrawl) was previously unbounded — its 15s default now bounds the paid tier, so raise `SCRAPER_TIER_7_TIMEOUT_S` if slow hosts get cut off
|
|
304
305
|
- `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
|
|
305
306
|
- `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
|
|
306
307
|
- `SCRAPER_BOTWALL_EGRESS_AFTER` — local-tier failures before a host escalates to the residential tier (default 2)
|
|
@@ -308,7 +309,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
308
309
|
- `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
|
|
309
310
|
- `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
|
|
310
311
|
- `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
|
|
311
|
-
- `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `
|
|
312
|
+
- `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER_4=2`)
|
|
312
313
|
- `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
|
|
313
314
|
- `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
|
|
314
315
|
- `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
|
|
@@ -62,13 +62,13 @@ That's the whole loop. Add tiers as you need them (see [Install](#install)).
|
|
|
62
62
|
|
|
63
63
|
| Tier | Strategy | Cost |
|
|
64
64
|
|---|---|---|
|
|
65
|
-
|
|
|
66
|
-
|
|
|
67
|
-
|
|
|
68
|
-
|
|
|
69
|
-
|
|
|
70
|
-
|
|
|
71
|
-
|
|
|
65
|
+
| tier_1 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
|
|
66
|
+
| tier_2 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
|
|
67
|
+
| tier_3 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
|
|
68
|
+
| tier_4 | Stealth headless browser (`patchright`, Chromium) | heavy |
|
|
69
|
+
| tier_5 | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
|
|
70
|
+
| tier_6 | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
|
|
71
|
+
| tier_7 | Firecrawl (paid, env-gated, audited) | paid, last resort |
|
|
72
72
|
|
|
73
73
|
Every URL has a wall-clock budget (`SCRAPER_DEADLINE_S`, default 45s) checked between
|
|
74
74
|
tiers so one URL can't run the whole cascade of timeouts. Each tier attempt records
|
|
@@ -83,17 +83,17 @@ Search (query → URLs) is separate from the scrape cascade: `switchback.search(
|
|
|
83
83
|
## Install
|
|
84
84
|
|
|
85
85
|
```bash
|
|
86
|
-
pip install switchback # core: normalization + cheap tiers (
|
|
87
|
-
pip install "switchback[cloudflare]" # +
|
|
86
|
+
pip install switchback # core: normalization + cheap tiers (tier_1/tier_2) + search
|
|
87
|
+
pip install "switchback[cloudflare]" # + tier_3 Cloudflare/anti-bot solver (cloudscraper)
|
|
88
88
|
pip install "switchback[server]" # + HTTP service (fastapi, uvicorn) incl. /metrics + /traces
|
|
89
|
-
pip install "switchback[browser]" && patchright install chromium # +
|
|
90
|
-
pip install "switchback[camoufox]" && camoufox fetch # +
|
|
91
|
-
pip install "switchback[firecrawl]" # +
|
|
89
|
+
pip install "switchback[browser]" && patchright install chromium # + tier_4 stealth Chromium
|
|
90
|
+
pip install "switchback[camoufox]" && camoufox fetch # + tier_5 Firefox stealth
|
|
91
|
+
pip install "switchback[firecrawl]" # + tier_7 paid API (needs FIRECRAWL_API_KEY)
|
|
92
92
|
pip install "switchback[tracing]" # + OpenTelemetry -> any OTLP backend
|
|
93
93
|
pip install "switchback[all]" # everything
|
|
94
94
|
```
|
|
95
95
|
|
|
96
|
-
For
|
|
96
|
+
For tier_3's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
|
|
97
97
|
3.x fork (PyPI's `cloudscraper` is the older v1/v2 — PyPI forbids pinning a
|
|
98
98
|
git-URL dep inside a published package, so install it alongside):
|
|
99
99
|
|
|
@@ -111,19 +111,20 @@ and land *after* boot (e.g. an async install thread on Azure). Until they're
|
|
|
111
111
|
ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
|
|
112
112
|
fix) and the cascade falls through — they are never silently skipped. Checklist:
|
|
113
113
|
|
|
114
|
-
- **
|
|
114
|
+
- **tier_4 is the real workhorse for Cloudflare/JS sites** — make sure its browser
|
|
115
115
|
is installed: `patchright install chromium` (note: **patchright**, not vanilla
|
|
116
116
|
`playwright`). On a cold start, run this in your post-boot install step/thread;
|
|
117
|
-
|
|
118
|
-
- **
|
|
117
|
+
tier_4 flips to ready once it finishes.
|
|
118
|
+
- **tier_3 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
|
|
119
119
|
frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
|
|
120
|
-
solve budget) instead of erroring mid-cascade.
|
|
120
|
+
solve budget) instead of erroring mid-cascade. tier_3 is a *weak* solver for
|
|
121
121
|
modern Cloudflare — treat it as a cheap try before the browser, not the primary.
|
|
122
|
-
- **Install Node.js** for
|
|
122
|
+
- **Install Node.js** for tier_3's v3 JS-VM challenges — faster and thread-safe
|
|
123
123
|
vs. the pure-Python js2py fallback (relevant under concurrent load).
|
|
124
|
-
- **Bound
|
|
125
|
-
`
|
|
126
|
-
|
|
124
|
+
- **Bound tier_3's solve budget** with `SCRAPER_TIER_3_TIMEOUT_S` (default `25`;
|
|
125
|
+
the old `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` is still honored) so an unsolvable
|
|
126
|
+
challenge can't eat the per-URL deadline before the browser tier runs. Lower it
|
|
127
|
+
(e.g. `12`) if tier_3 rarely wins on your hosts.
|
|
127
128
|
|
|
128
129
|
**Verify readiness on the box** with the preflight check (doubles as a healthcheck
|
|
129
130
|
— exit 0 when the capable tiers are ready):
|
|
@@ -223,16 +224,16 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
223
224
|
<details>
|
|
224
225
|
<summary><b>Env gates</b> — enable/disable tiers and integrations</summary>
|
|
225
226
|
|
|
226
|
-
- `SCRAPER_DISABLE_FIRECRAWL` — skip
|
|
227
|
-
- `FIRECRAWL_API_KEY` — enable
|
|
228
|
-
- `SCRAPER_DISABLE_CAMOUFOX` — turn off
|
|
229
|
-
- `BU_CDP_URL` — enable
|
|
227
|
+
- `SCRAPER_DISABLE_FIRECRAWL` — skip tier_7
|
|
228
|
+
- `FIRECRAWL_API_KEY` — enable tier_7
|
|
229
|
+
- `SCRAPER_DISABLE_CAMOUFOX` — turn off tier_5 (on by default; needs `pip install camoufox` + `camoufox fetch`)
|
|
230
|
+
- `BU_CDP_URL` — enable tier_6 residential browser by pointing at a CDP endpoint
|
|
230
231
|
- `SCRAPER_PROXY` — route *all* tiers/URLs through a proxy
|
|
231
232
|
- `SCRAPER_EGRESS_PROXY` — route only walled hosts through a proxy (see [Cost-scoped residential egress](#cost-scoped-residential-egress))
|
|
232
233
|
- `SEARXNG_URL` — defaults to `http://localhost:8888`
|
|
233
234
|
- `SCRAPER_STATE_DIR` — where the botwall DB/event log + session cache live
|
|
234
235
|
- `SCRAPER_COOKIES_FILE` — Netscape `cookies.txt` to scrape login-gated hosts (injected into the HTTP and browser tiers)
|
|
235
|
-
- `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into
|
|
236
|
+
- `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into tier_3 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
|
|
236
237
|
</details>
|
|
237
238
|
|
|
238
239
|
<details>
|
|
@@ -241,7 +242,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
241
242
|
- `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
|
|
242
243
|
- `SCRAPER_DEADLINE_S` — per-URL budget (45s)
|
|
243
244
|
- `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
|
|
244
|
-
- `
|
|
245
|
+
- `SCRAPER_TIER_<N>_TIMEOUT_S` — per-tier timeout in seconds, `N` = 1–7 (tier_1 apis · tier_2 http · tier_3 cloudscraper · tier_4 browser · tier_5 camoufox · tier_6 residential · tier_7 firecrawl). Defaults: 15/15/**25**/15/**45**/**30**/15 (the three bold ones keep their prior budgets; everything else is 15s). The pre-0.5.0 `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` / `SCRAPER_CAMOUFOX_TIMEOUT_MS` / `SCRAPER_RESIDENTIAL_TIMEOUT_MS` are still honored when the new var is unset. Note: `tier_7` (Firecrawl) was previously unbounded — its 15s default now bounds the paid tier, so raise `SCRAPER_TIER_7_TIMEOUT_S` if slow hosts get cut off
|
|
245
246
|
- `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
|
|
246
247
|
- `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
|
|
247
248
|
- `SCRAPER_BOTWALL_EGRESS_AFTER` — local-tier failures before a host escalates to the residential tier (default 2)
|
|
@@ -249,7 +250,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
249
250
|
- `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
|
|
250
251
|
- `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
|
|
251
252
|
- `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
|
|
252
|
-
- `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `
|
|
253
|
+
- `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER_4=2`)
|
|
253
254
|
- `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
|
|
254
255
|
- `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
|
|
255
256
|
- `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "switchback"
|
|
7
|
-
version = "0.
|
|
7
|
+
version = "0.5.0"
|
|
8
8
|
description = "One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -13,7 +13,7 @@ from __future__ import annotations
|
|
|
13
13
|
import os
|
|
14
14
|
import shutil
|
|
15
15
|
|
|
16
|
-
from .tiers import
|
|
16
|
+
from .tiers import tier_3, tier_4
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
def _camoufox() -> tuple[bool, str]:
|
|
@@ -28,16 +28,16 @@ def _camoufox() -> tuple[bool, str]:
|
|
|
28
28
|
|
|
29
29
|
def probe() -> list[tuple[str, bool, str]]:
|
|
30
30
|
"""(label, ok, detail) for each tier/dependency that matters at runtime."""
|
|
31
|
-
cs_ok, cs_detail =
|
|
32
|
-
br_ok, br_detail =
|
|
31
|
+
cs_ok, cs_detail = tier_3.available()
|
|
32
|
+
br_ok, br_detail = tier_4.available()
|
|
33
33
|
node = shutil.which("node")
|
|
34
34
|
return [
|
|
35
|
-
("
|
|
36
|
-
("
|
|
37
|
-
("
|
|
38
|
-
("node (
|
|
39
|
-
node or "not on PATH —
|
|
40
|
-
("
|
|
35
|
+
("tier_3 (cloudscraper)", cs_ok, cs_detail),
|
|
36
|
+
("tier_4 (browser)", br_ok, br_detail),
|
|
37
|
+
("tier_5 (camoufox)", *_camoufox()),
|
|
38
|
+
("node (tier_3 v3 concurrency)", bool(node),
|
|
39
|
+
node or "not on PATH — tier_3 falls back to slower, thread-fragile js2py"),
|
|
40
|
+
("tier_7 (firecrawl)", bool(os.getenv("FIRECRAWL_API_KEY")),
|
|
41
41
|
"FIRECRAWL_API_KEY set" if os.getenv("FIRECRAWL_API_KEY")
|
|
42
42
|
else "off (no FIRECRAWL_API_KEY)"),
|
|
43
43
|
]
|
|
@@ -10,7 +10,7 @@ Run it from cron / the /loop skill / any scheduler:
|
|
|
10
10
|
python -m switchback.flags --json # machine-readable digest
|
|
11
11
|
|
|
12
12
|
What it flags:
|
|
13
|
-
• domains still landing on paid Firecrawl (winning_tier ==
|
|
13
|
+
• domains still landing on paid Firecrawl (winning_tier == tier_7)
|
|
14
14
|
• domains escalated to residential egress (needs_egress)
|
|
15
15
|
• domains throwing the most bot-wall challenges (by vendor)
|
|
16
16
|
• low coverage / negative cost savings in the window
|
|
@@ -29,7 +29,7 @@ logger = logging.getLogger(__name__)
|
|
|
29
29
|
|
|
30
30
|
# A domain is "stuck" if its winning tier is the paid one — these are the hosts
|
|
31
31
|
# that still cost money and are the prime targets for a new tier / cookie / rule.
|
|
32
|
-
_PAID_TIER = "
|
|
32
|
+
_PAID_TIER = "tier_7"
|
|
33
33
|
|
|
34
34
|
|
|
35
35
|
def build_digest(minutes: int | None = None) -> dict:
|
|
@@ -172,7 +172,7 @@ def _start_index(url: str, db: dict) -> int:
|
|
|
172
172
|
if botwall.needs_egress(host, db):
|
|
173
173
|
if egress.has_egress_proxy():
|
|
174
174
|
return 0
|
|
175
|
-
res_i = INDEX.get("
|
|
175
|
+
res_i = INDEX.get("tier_6")
|
|
176
176
|
if res_i is not None:
|
|
177
177
|
disabled_fn = getattr(TIERS[res_i], "disabled", None)
|
|
178
178
|
if not (disabled_fn and disabled_fn()):
|
|
@@ -298,7 +298,7 @@ def _run_cascade(url, host, db, root, t0, deadline) -> ScrapeOutcome:
|
|
|
298
298
|
# solve attempt and go straight to the browser/egress tiers. A real CF
|
|
299
299
|
# challenge surfaces as `botwall`, not `http_block`, so this never skips a
|
|
300
300
|
# host cloudscraper could actually clear.
|
|
301
|
-
if tier.NAME == "tier2_cloudscraper" and any(
|
|
301
|
+
if tier.NAME == "tier_3" and any(
|
|
302
302
|
a.outcome == "http_block" for a in attempts):
|
|
303
303
|
logger.info(f"{tier.NAME} skipped (prior hard IP block): {url}")
|
|
304
304
|
attempts.append(TierAttempt(tier.NAME, "not_applicable"))
|
|
@@ -149,6 +149,40 @@ def _parse_skip_urls_file(path: str) -> dict[str, str]:
|
|
|
149
149
|
return out
|
|
150
150
|
|
|
151
151
|
|
|
152
|
+
# Tiers were renamed to plain tier_1..tier_7 in 0.5.0. A pre-rename DB persists a
|
|
153
|
+
# host's learned winning_tier / tier_stats under the old names; migrate them on
|
|
154
|
+
# load so routing survives the upgrade instead of re-probing the cascade fresh.
|
|
155
|
+
_TIER_RENAME = {
|
|
156
|
+
"tier0_apis": "tier_1",
|
|
157
|
+
"tier1_http": "tier_2",
|
|
158
|
+
"tier2_cloudscraper": "tier_3",
|
|
159
|
+
"tier3_browser": "tier_4",
|
|
160
|
+
"tier3b_camoufox": "tier_5",
|
|
161
|
+
"tier_residential": "tier_6",
|
|
162
|
+
"tier4_firecrawl": "tier_7",
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _migrate_tier_names(hosts: dict) -> bool:
|
|
167
|
+
"""Remap pre-rename tier names in persisted host records. Idempotent."""
|
|
168
|
+
changed = False
|
|
169
|
+
for rec in hosts.values():
|
|
170
|
+
if rec.get("winning_tier") in _TIER_RENAME:
|
|
171
|
+
rec["winning_tier"] = _TIER_RENAME[rec["winning_tier"]]
|
|
172
|
+
changed = True
|
|
173
|
+
stats = rec.get("tier_stats")
|
|
174
|
+
if isinstance(stats, dict):
|
|
175
|
+
for old, new in _TIER_RENAME.items():
|
|
176
|
+
if old in stats:
|
|
177
|
+
# Merge into the new key if it somehow already exists, else move.
|
|
178
|
+
dst = stats.setdefault(new, {"ok": 0, "miss": 0})
|
|
179
|
+
src = stats.pop(old)
|
|
180
|
+
dst["ok"] += src.get("ok", 0)
|
|
181
|
+
dst["miss"] += src.get("miss", 0)
|
|
182
|
+
changed = True
|
|
183
|
+
return changed
|
|
184
|
+
|
|
185
|
+
|
|
152
186
|
def load_db() -> dict:
|
|
153
187
|
db = {"version": 2, "updated_at": "", "hosts": {}, "urls": {}}
|
|
154
188
|
if os.path.exists(DB_PATH):
|
|
@@ -159,7 +193,7 @@ def load_db() -> dict:
|
|
|
159
193
|
logger.error(f"botwall: load failed ({e}); starting fresh")
|
|
160
194
|
hosts = db.setdefault("hosts", {})
|
|
161
195
|
urls = db.setdefault("urls", {})
|
|
162
|
-
changed =
|
|
196
|
+
changed = _migrate_tier_names(hosts)
|
|
163
197
|
for host, reason in SEED_HOSTS.items():
|
|
164
198
|
if host not in hosts:
|
|
165
199
|
hosts[host] = _new_record(reason=reason, status="skip")
|
|
@@ -305,7 +339,7 @@ def _track_egress(host: str, tier: str, outcome: str, db: dict) -> None:
|
|
|
305
339
|
unescalated. We don't count the residential tier's own misses (circular)."""
|
|
306
340
|
if not PROMOTE_EGRESS_AFTER or outcome not in _EGRESS_OUTCOMES:
|
|
307
341
|
return
|
|
308
|
-
if tier == "
|
|
342
|
+
if tier == "tier_6": # residential egress — don't count its own misses
|
|
309
343
|
return
|
|
310
344
|
rec = db.get("hosts", {}).get(host)
|
|
311
345
|
if rec is None or rec.get("needs_egress"):
|
|
@@ -39,7 +39,7 @@ FIRECRAWL_USD = float(os.getenv("BENCH_FIRECRAWL_USD", "0.001"))
|
|
|
39
39
|
HARD_MULT = float(os.getenv("BENCH_FIRECRAWL_HARD_MULT", "5"))
|
|
40
40
|
|
|
41
41
|
# Tiers whose win means Firecrawl would have billed the hard (stealth) rate.
|
|
42
|
-
_HARD_TIERS = {"
|
|
42
|
+
_HARD_TIERS = {"tier_4", "tier_5", "tier_6", "tier_7"}
|
|
43
43
|
|
|
44
44
|
|
|
45
45
|
def _parse_ts(ts: str) -> datetime | None:
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""The cost-ordered cascade. Each tier exposes:
|
|
2
|
+
|
|
3
|
+
NAME : str
|
|
4
|
+
PAID : bool # gated/audited if True
|
|
5
|
+
fetch(url) -> str | None # markdown on success; None if not
|
|
6
|
+
# applicable; raises on failure.
|
|
7
|
+
|
|
8
|
+
Order matters — cheapest/cleanest first, paid last.
|
|
9
|
+
"""
|
|
10
|
+
from . import (tier_1, tier_2, tier_3, tier_4, tier_5, tier_6, tier_7)
|
|
11
|
+
|
|
12
|
+
# Cost-ordered, cheapest first. Plain names; role noted inline.
|
|
13
|
+
TIERS = [
|
|
14
|
+
tier_1, # direct APIs / open mirrors
|
|
15
|
+
tier_2, # plain HTTP with TLS impersonation
|
|
16
|
+
tier_3, # cloudscraper (Cloudflare/anti-bot solver)
|
|
17
|
+
tier_4, # stealth headless browser (patchright)
|
|
18
|
+
tier_5, # camoufox (Firefox stealth; on by default, orthogonal to tier_4)
|
|
19
|
+
tier_6, # residential-IP CDP browser (off unless BU_CDP_URL set)
|
|
20
|
+
tier_7, # Firecrawl (paid, last resort)
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
# tier name -> index, for botwall winning-tier routing.
|
|
24
|
+
INDEX = {t.NAME: i for i, t in enumerate(TIERS)}
|
switchback-0.4.0/switchback/tiers/tier0_apis.py → switchback-0.5.0/switchback/tiers/tier_1.py
RENAMED
|
@@ -10,6 +10,7 @@ Web *search* (query → URLs) is a different shape and lives in switchback/searc
|
|
|
10
10
|
"""
|
|
11
11
|
from __future__ import annotations
|
|
12
12
|
|
|
13
|
+
import os
|
|
13
14
|
import re
|
|
14
15
|
from urllib.parse import unquote
|
|
15
16
|
from xml.etree import ElementTree as ET
|
|
@@ -17,9 +18,12 @@ from xml.etree import ElementTree as ET
|
|
|
17
18
|
from ..normalize import html_to_markdown, UA
|
|
18
19
|
from ..policy.gates import check
|
|
19
20
|
|
|
20
|
-
NAME = "tier0_apis"
|
|
21
|
+
NAME = "tier_1"
|
|
21
22
|
PAID = False
|
|
22
23
|
|
|
24
|
+
# Per-tier request timeout (seconds); override with SCRAPER_TIER_1_TIMEOUT_S.
|
|
25
|
+
_TIMEOUT_S = float(os.getenv("SCRAPER_TIER_1_TIMEOUT_S", "15"))
|
|
26
|
+
|
|
23
27
|
ARXIV_RE = re.compile(r"arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})(?:v\d+)?(?:\.pdf)?", re.I)
|
|
24
28
|
WIKI_RE = re.compile(r"en\.wikipedia\.org/wiki/([^?#]+)", re.I)
|
|
25
29
|
PMC_RE = re.compile(r"pmc\.ncbi\.nlm\.nih\.gov/articles/(PMC\d+)", re.I)
|
|
@@ -42,7 +46,7 @@ def _arxiv(arxiv_id: str, url: str) -> str:
|
|
|
42
46
|
# impersonating Chrome triggers aggressive 429s from their Akamai front-end.
|
|
43
47
|
import requests
|
|
44
48
|
r = requests.get(f"https://export.arxiv.org/api/query?id_list={arxiv_id}",
|
|
45
|
-
timeout=
|
|
49
|
+
timeout=_TIMEOUT_S,
|
|
46
50
|
headers={"User-Agent": "switchback/1.0 (mailto:akash@theaklabs.com)"})
|
|
47
51
|
r.raise_for_status()
|
|
48
52
|
ns = {"atom": "http://www.w3.org/2005/Atom"}
|
|
@@ -60,7 +64,7 @@ def _arxiv(arxiv_id: str, url: str) -> str:
|
|
|
60
64
|
def _wikipedia(title: str, url: str) -> str:
|
|
61
65
|
from curl_cffi import requests as cffi
|
|
62
66
|
r = cffi.get(f"https://en.wikipedia.org/api/rest_v1/page/html/{unquote(title)}",
|
|
63
|
-
timeout=
|
|
67
|
+
timeout=_TIMEOUT_S, impersonate="chrome")
|
|
64
68
|
r.raise_for_status()
|
|
65
69
|
return check(url, html_to_markdown(r.text, base_url=url))
|
|
66
70
|
|
|
@@ -70,7 +74,7 @@ def _europepmc(url: str) -> str:
|
|
|
70
74
|
import requests
|
|
71
75
|
pmcid = PMC_RE.search(url).group(1)
|
|
72
76
|
api = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
|
|
73
|
-
r = requests.get(api, timeout=
|
|
77
|
+
r = requests.get(api, timeout=_TIMEOUT_S, headers={"User-Agent": UA})
|
|
74
78
|
r.raise_for_status()
|
|
75
79
|
if len(r.text) < 1000:
|
|
76
80
|
raise RuntimeError(f"europepmc empty: {len(r.text)}")
|
switchback-0.4.0/switchback/tiers/tier1_http.py → switchback-0.5.0/switchback/tiers/tier_2.py
RENAMED
|
@@ -15,6 +15,7 @@ says one Chrome version, the header says another).
|
|
|
15
15
|
from __future__ import annotations
|
|
16
16
|
|
|
17
17
|
import hashlib
|
|
18
|
+
import os
|
|
18
19
|
from urllib.parse import urlsplit
|
|
19
20
|
|
|
20
21
|
from .. import session_cache
|
|
@@ -22,9 +23,12 @@ from ..egress import requests_proxies, add_wire_bytes
|
|
|
22
23
|
from ..normalize import html_to_markdown, pdf_bytes_to_text
|
|
23
24
|
from ..policy.gates import BotWall, check, is_cf_challenge
|
|
24
25
|
|
|
25
|
-
NAME = "tier1_http"
|
|
26
|
+
NAME = "tier_2"
|
|
26
27
|
PAID = False
|
|
27
28
|
|
|
29
|
+
# Per-tier request timeout (seconds); override with SCRAPER_TIER_2_TIMEOUT_S.
|
|
30
|
+
_TIMEOUT_S = float(os.getenv("SCRAPER_TIER_2_TIMEOUT_S", "15"))
|
|
31
|
+
|
|
28
32
|
# Recent Chrome JA3 targets available in curl_cffi 0.15.x. A small spread of real
|
|
29
33
|
# versions mirrors how live traffic is distributed across Chrome releases.
|
|
30
34
|
_IMPERSONATE_TARGETS = ("chrome131", "chrome136", "chrome142")
|
|
@@ -43,7 +47,7 @@ def fetch(url: str) -> str:
|
|
|
43
47
|
# against Tier 1's distinct impersonate UA would be a mismatch tell.
|
|
44
48
|
cookie = session_cache.cookie_header(url, include_cache=False)
|
|
45
49
|
headers = {"Cookie": cookie} if cookie else None
|
|
46
|
-
r = cffi.get(url, timeout=
|
|
50
|
+
r = cffi.get(url, timeout=_TIMEOUT_S, allow_redirects=True,
|
|
47
51
|
impersonate=_impersonate_for(url),
|
|
48
52
|
proxies=requests_proxies(), headers=headers)
|
|
49
53
|
add_wire_bytes(len(r.content)) # count even on a block — failed fetches burn bandwidth too
|
|
@@ -26,7 +26,7 @@ from ..policy.gates import Unavailable, check
|
|
|
26
26
|
|
|
27
27
|
logger = logging.getLogger(__name__)
|
|
28
28
|
|
|
29
|
-
NAME = "tier2_cloudscraper"
|
|
29
|
+
NAME = "tier_3"
|
|
30
30
|
PAID = False
|
|
31
31
|
|
|
32
32
|
# Install hint surfaced when cloudscraper is missing or the frozen PyPI 1.2.71
|
|
@@ -61,7 +61,10 @@ def available() -> tuple[bool, str]:
|
|
|
61
61
|
# per-request socket timeout. Capping it here lets the cascade fall through to the
|
|
62
62
|
# stealth browser (which can handle interactive challenges) instead of burning the
|
|
63
63
|
# per-URL deadline. ~25s comfortably covers a real JS/v3 solve (~5-15s).
|
|
64
|
-
|
|
64
|
+
# Back-compat: honor the pre-0.5.0 SCRAPER_CLOUDSCRAPER_TIMEOUT_S if the new var
|
|
65
|
+
# is unset, so a tuned prod value survives the tier rename.
|
|
66
|
+
_TIMEOUT_S = float(os.getenv("SCRAPER_TIER_3_TIMEOUT_S",
|
|
67
|
+
os.getenv("SCRAPER_CLOUDSCRAPER_TIMEOUT_S", "25")))
|
|
65
68
|
|
|
66
69
|
# Stealth pacing. Kept modest: Tier 2 only fires on CF-suspected hosts, and the
|
|
67
70
|
# real latency win comes from skipping the solve entirely on repeat hits (session
|
|
@@ -153,7 +156,7 @@ def fetch(url: str) -> str:
|
|
|
153
156
|
except BaseException as e: # noqa: BLE001 — propagated to caller below
|
|
154
157
|
box["err"] = e
|
|
155
158
|
|
|
156
|
-
t = threading.Thread(target=work, name="
|
|
159
|
+
t = threading.Thread(target=work, name="tier_3-cloudscraper", daemon=True)
|
|
157
160
|
t.start()
|
|
158
161
|
t.join(_TIMEOUT_S)
|
|
159
162
|
if t.is_alive():
|
switchback-0.4.0/switchback/tiers/tier3_browser.py → switchback-0.5.0/switchback/tiers/tier_4.py
RENAMED
|
@@ -19,9 +19,12 @@ from ..egress import playwright_proxy, add_wire_bytes
|
|
|
19
19
|
from ..normalize import html_to_markdown
|
|
20
20
|
from ..policy.gates import Unavailable, check
|
|
21
21
|
|
|
22
|
-
NAME = "tier3_browser"
|
|
22
|
+
NAME = "tier_4"
|
|
23
23
|
PAID = False
|
|
24
24
|
|
|
25
|
+
# Per-tier navigation timeout (seconds); override with SCRAPER_TIER_4_TIMEOUT_S.
|
|
26
|
+
_TIMEOUT_S = float(os.getenv("SCRAPER_TIER_4_TIMEOUT_S", "15"))
|
|
27
|
+
|
|
25
28
|
# Install hint surfaced when patchright or its Chromium isn't ready — notably
|
|
26
29
|
# during an async cold-start install (the browser binary lands after boot).
|
|
27
30
|
_INSTALL_HINT = 'pip install "switchback[browser]" && patchright install chromium'
|
|
@@ -47,7 +50,7 @@ def available() -> tuple[bool, str]:
|
|
|
47
50
|
return True, "patchright + Chromium ready"
|
|
48
51
|
|
|
49
52
|
|
|
50
|
-
def fetch(url: str, timeout_ms: int =
|
|
53
|
+
def fetch(url: str, timeout_ms: int = int(_TIMEOUT_S * 1000)) -> str:
|
|
51
54
|
try:
|
|
52
55
|
from patchright.sync_api import sync_playwright
|
|
53
56
|
except ImportError:
|
switchback-0.4.0/switchback/tiers/tier3b_camoufox.py → switchback-0.5.0/switchback/tiers/tier_5.py
RENAMED
|
@@ -25,10 +25,16 @@ from ..policy.gates import check
|
|
|
25
25
|
|
|
26
26
|
logger = logging.getLogger(__name__)
|
|
27
27
|
|
|
28
|
-
NAME = "tier3b_camoufox"
|
|
28
|
+
NAME = "tier_5"
|
|
29
29
|
PAID = False
|
|
30
30
|
|
|
31
|
-
|
|
31
|
+
# Per-tier navigation timeout (seconds); override with SCRAPER_TIER_5_TIMEOUT_S.
|
|
32
|
+
# Default kept at 45s — camoufox is the slowest rung (~40s on a hard CF solve).
|
|
33
|
+
# Back-compat: honor the pre-0.5.0 SCRAPER_CAMOUFOX_TIMEOUT_MS (ms) if unset.
|
|
34
|
+
_legacy_ms = os.getenv("SCRAPER_CAMOUFOX_TIMEOUT_MS")
|
|
35
|
+
_TIMEOUT_S = float(os.getenv("SCRAPER_TIER_5_TIMEOUT_S",
|
|
36
|
+
str(float(_legacy_ms) / 1000) if _legacy_ms else "45"))
|
|
37
|
+
_TIMEOUT_MS = int(_TIMEOUT_S * 1000)
|
|
32
38
|
|
|
33
39
|
|
|
34
40
|
def disabled() -> bool:
|
switchback-0.4.0/switchback/tiers/tier_residential.py → switchback-0.5.0/switchback/tiers/tier_6.py
RENAMED
|
@@ -22,10 +22,16 @@ from ..concurrency import browser_slot
|
|
|
22
22
|
from ..normalize import html_to_markdown
|
|
23
23
|
from ..policy.gates import check
|
|
24
24
|
|
|
25
|
-
NAME = "tier_residential"
|
|
25
|
+
NAME = "tier_6"
|
|
26
26
|
PAID = False
|
|
27
27
|
|
|
28
|
-
|
|
28
|
+
# Per-tier navigation timeout (seconds); override with SCRAPER_TIER_6_TIMEOUT_S.
|
|
29
|
+
# Default kept at 30s — remote CDP over a residential proxy is slow to first paint.
|
|
30
|
+
# Back-compat: honor the pre-0.5.0 SCRAPER_RESIDENTIAL_TIMEOUT_MS (ms) if unset.
|
|
31
|
+
_legacy_ms = os.getenv("SCRAPER_RESIDENTIAL_TIMEOUT_MS")
|
|
32
|
+
_TIMEOUT_S = float(os.getenv("SCRAPER_TIER_6_TIMEOUT_S",
|
|
33
|
+
str(float(_legacy_ms) / 1000) if _legacy_ms else "30"))
|
|
34
|
+
_TIMEOUT_MS = int(_TIMEOUT_S * 1000)
|
|
29
35
|
|
|
30
36
|
|
|
31
37
|
def disabled() -> bool:
|
switchback-0.4.0/switchback/tiers/tier4_firecrawl.py → switchback-0.5.0/switchback/tiers/tier_7.py
RENAMED
|
@@ -12,9 +12,15 @@ import threading
|
|
|
12
12
|
from ..normalize import active_format, render
|
|
13
13
|
from ..policy.gates import check
|
|
14
14
|
|
|
15
|
-
NAME = "tier4_firecrawl"
|
|
15
|
+
NAME = "tier_7"
|
|
16
16
|
PAID = True
|
|
17
17
|
|
|
18
|
+
# Per-tier wall-clock cap (seconds); override with SCRAPER_TIER_7_TIMEOUT_S.
|
|
19
|
+
# This paid last resort was previously unbounded; 15s bounds it like the rest, but
|
|
20
|
+
# Firecrawl scrapes can legitimately run longer — raise this if hard hosts get cut
|
|
21
|
+
# off at the finish line (you may still be billed for a scrape killed here).
|
|
22
|
+
_TIMEOUT_S = float(os.getenv("SCRAPER_TIER_7_TIMEOUT_S", "15"))
|
|
23
|
+
|
|
18
24
|
|
|
19
25
|
def disabled() -> bool:
|
|
20
26
|
return bool(os.getenv("SCRAPER_DISABLE_FIRECRAWL"))
|
|
@@ -49,9 +55,13 @@ def fetch(url: str) -> str:
|
|
|
49
55
|
except BaseException as e: # noqa: BLE001 — re-raised to the caller below
|
|
50
56
|
box["err"] = e
|
|
51
57
|
|
|
52
|
-
t = threading.Thread(target=work, name="
|
|
58
|
+
t = threading.Thread(target=work, name="tier_7-firecrawl", daemon=True)
|
|
53
59
|
t.start()
|
|
54
|
-
t.join()
|
|
60
|
+
t.join(_TIMEOUT_S)
|
|
61
|
+
if t.is_alive():
|
|
62
|
+
raise TimeoutError(
|
|
63
|
+
f"firecrawl exceeded {_TIMEOUT_S}s "
|
|
64
|
+
"(raise SCRAPER_TIER_7_TIMEOUT_S for slow hosts)")
|
|
55
65
|
if "err" in box:
|
|
56
66
|
raise box["err"]
|
|
57
67
|
return box["md"]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: switchback
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
|
|
5
5
|
Author-email: Akash Kodavuru <akash@theaklabs.com>
|
|
6
6
|
License: MIT
|
|
@@ -121,13 +121,13 @@ That's the whole loop. Add tiers as you need them (see [Install](#install)).
|
|
|
121
121
|
|
|
122
122
|
| Tier | Strategy | Cost |
|
|
123
123
|
|---|---|---|
|
|
124
|
-
|
|
|
125
|
-
|
|
|
126
|
-
|
|
|
127
|
-
|
|
|
128
|
-
|
|
|
129
|
-
|
|
|
130
|
-
|
|
|
124
|
+
| tier_1 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
|
|
125
|
+
| tier_2 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
|
|
126
|
+
| tier_3 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
|
|
127
|
+
| tier_4 | Stealth headless browser (`patchright`, Chromium) | heavy |
|
|
128
|
+
| tier_5 | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
|
|
129
|
+
| tier_6 | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
|
|
130
|
+
| tier_7 | Firecrawl (paid, env-gated, audited) | paid, last resort |
|
|
131
131
|
|
|
132
132
|
Every URL has a wall-clock budget (`SCRAPER_DEADLINE_S`, default 45s) checked between
|
|
133
133
|
tiers so one URL can't run the whole cascade of timeouts. Each tier attempt records
|
|
@@ -142,17 +142,17 @@ Search (query → URLs) is separate from the scrape cascade: `switchback.search(
|
|
|
142
142
|
## Install
|
|
143
143
|
|
|
144
144
|
```bash
|
|
145
|
-
pip install switchback # core: normalization + cheap tiers (
|
|
146
|
-
pip install "switchback[cloudflare]" # +
|
|
145
|
+
pip install switchback # core: normalization + cheap tiers (tier_1/tier_2) + search
|
|
146
|
+
pip install "switchback[cloudflare]" # + tier_3 Cloudflare/anti-bot solver (cloudscraper)
|
|
147
147
|
pip install "switchback[server]" # + HTTP service (fastapi, uvicorn) incl. /metrics + /traces
|
|
148
|
-
pip install "switchback[browser]" && patchright install chromium # +
|
|
149
|
-
pip install "switchback[camoufox]" && camoufox fetch # +
|
|
150
|
-
pip install "switchback[firecrawl]" # +
|
|
148
|
+
pip install "switchback[browser]" && patchright install chromium # + tier_4 stealth Chromium
|
|
149
|
+
pip install "switchback[camoufox]" && camoufox fetch # + tier_5 Firefox stealth
|
|
150
|
+
pip install "switchback[firecrawl]" # + tier_7 paid API (needs FIRECRAWL_API_KEY)
|
|
151
151
|
pip install "switchback[tracing]" # + OpenTelemetry -> any OTLP backend
|
|
152
152
|
pip install "switchback[all]" # everything
|
|
153
153
|
```
|
|
154
154
|
|
|
155
|
-
For
|
|
155
|
+
For tier_3's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
|
|
156
156
|
3.x fork (PyPI's `cloudscraper` is the older v1/v2 — PyPI forbids pinning a
|
|
157
157
|
git-URL dep inside a published package, so install it alongside):
|
|
158
158
|
|
|
@@ -170,19 +170,20 @@ and land *after* boot (e.g. an async install thread on Azure). Until they're
|
|
|
170
170
|
ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
|
|
171
171
|
fix) and the cascade falls through — they are never silently skipped. Checklist:
|
|
172
172
|
|
|
173
|
-
- **
|
|
173
|
+
- **tier_4 is the real workhorse for Cloudflare/JS sites** — make sure its browser
|
|
174
174
|
is installed: `patchright install chromium` (note: **patchright**, not vanilla
|
|
175
175
|
`playwright`). On a cold start, run this in your post-boot install step/thread;
|
|
176
|
-
|
|
177
|
-
- **
|
|
176
|
+
tier_4 flips to ready once it finishes.
|
|
177
|
+
- **tier_3 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
|
|
178
178
|
frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
|
|
179
|
-
solve budget) instead of erroring mid-cascade.
|
|
179
|
+
solve budget) instead of erroring mid-cascade. tier_3 is a *weak* solver for
|
|
180
180
|
modern Cloudflare — treat it as a cheap try before the browser, not the primary.
|
|
181
|
-
- **Install Node.js** for
|
|
181
|
+
- **Install Node.js** for tier_3's v3 JS-VM challenges — faster and thread-safe
|
|
182
182
|
vs. the pure-Python js2py fallback (relevant under concurrent load).
|
|
183
|
-
- **Bound
|
|
184
|
-
`
|
|
185
|
-
|
|
183
|
+
- **Bound tier_3's solve budget** with `SCRAPER_TIER_3_TIMEOUT_S` (default `25`;
|
|
184
|
+
the old `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` is still honored) so an unsolvable
|
|
185
|
+
challenge can't eat the per-URL deadline before the browser tier runs. Lower it
|
|
186
|
+
(e.g. `12`) if tier_3 rarely wins on your hosts.
|
|
186
187
|
|
|
187
188
|
**Verify readiness on the box** with the preflight check (doubles as a healthcheck
|
|
188
189
|
— exit 0 when the capable tiers are ready):
|
|
@@ -282,16 +283,16 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
282
283
|
<details>
|
|
283
284
|
<summary><b>Env gates</b> — enable/disable tiers and integrations</summary>
|
|
284
285
|
|
|
285
|
-
- `SCRAPER_DISABLE_FIRECRAWL` — skip
|
|
286
|
-
- `FIRECRAWL_API_KEY` — enable
|
|
287
|
-
- `SCRAPER_DISABLE_CAMOUFOX` — turn off
|
|
288
|
-
- `BU_CDP_URL` — enable
|
|
286
|
+
- `SCRAPER_DISABLE_FIRECRAWL` — skip tier_7
|
|
287
|
+
- `FIRECRAWL_API_KEY` — enable tier_7
|
|
288
|
+
- `SCRAPER_DISABLE_CAMOUFOX` — turn off tier_5 (on by default; needs `pip install camoufox` + `camoufox fetch`)
|
|
289
|
+
- `BU_CDP_URL` — enable tier_6 residential browser by pointing at a CDP endpoint
|
|
289
290
|
- `SCRAPER_PROXY` — route *all* tiers/URLs through a proxy
|
|
290
291
|
- `SCRAPER_EGRESS_PROXY` — route only walled hosts through a proxy (see [Cost-scoped residential egress](#cost-scoped-residential-egress))
|
|
291
292
|
- `SEARXNG_URL` — defaults to `http://localhost:8888`
|
|
292
293
|
- `SCRAPER_STATE_DIR` — where the botwall DB/event log + session cache live
|
|
293
294
|
- `SCRAPER_COOKIES_FILE` — Netscape `cookies.txt` to scrape login-gated hosts (injected into the HTTP and browser tiers)
|
|
294
|
-
- `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into
|
|
295
|
+
- `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into tier_3 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
|
|
295
296
|
</details>
|
|
296
297
|
|
|
297
298
|
<details>
|
|
@@ -300,7 +301,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
300
301
|
- `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
|
|
301
302
|
- `SCRAPER_DEADLINE_S` — per-URL budget (45s)
|
|
302
303
|
- `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
|
|
303
|
-
- `
|
|
304
|
+
- `SCRAPER_TIER_<N>_TIMEOUT_S` — per-tier timeout in seconds, `N` = 1–7 (tier_1 apis · tier_2 http · tier_3 cloudscraper · tier_4 browser · tier_5 camoufox · tier_6 residential · tier_7 firecrawl). Defaults: 15/15/**25**/15/**45**/**30**/15 (the three bold ones keep their prior budgets; everything else is 15s). The pre-0.5.0 `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` / `SCRAPER_CAMOUFOX_TIMEOUT_MS` / `SCRAPER_RESIDENTIAL_TIMEOUT_MS` are still honored when the new var is unset. Note: `tier_7` (Firecrawl) was previously unbounded — its 15s default now bounds the paid tier, so raise `SCRAPER_TIER_7_TIMEOUT_S` if slow hosts get cut off
|
|
304
305
|
- `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
|
|
305
306
|
- `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
|
|
306
307
|
- `SCRAPER_BOTWALL_EGRESS_AFTER` — local-tier failures before a host escalates to the residential tier (default 2)
|
|
@@ -308,7 +309,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
308
309
|
- `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
|
|
309
310
|
- `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
|
|
310
311
|
- `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
|
|
311
|
-
- `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `
|
|
312
|
+
- `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER_4=2`)
|
|
312
313
|
- `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
|
|
313
314
|
- `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
|
|
314
315
|
- `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
|
|
@@ -40,10 +40,10 @@ switchback/policy/botwall.py
|
|
|
40
40
|
switchback/policy/gates.py
|
|
41
41
|
switchback/tiers/__init__.py
|
|
42
42
|
switchback/tiers/_browser.py
|
|
43
|
-
switchback/tiers/
|
|
44
|
-
switchback/tiers/
|
|
45
|
-
switchback/tiers/
|
|
46
|
-
switchback/tiers/
|
|
47
|
-
switchback/tiers/
|
|
48
|
-
switchback/tiers/
|
|
49
|
-
switchback/tiers/
|
|
43
|
+
switchback/tiers/tier_1.py
|
|
44
|
+
switchback/tiers/tier_2.py
|
|
45
|
+
switchback/tiers/tier_3.py
|
|
46
|
+
switchback/tiers/tier_4.py
|
|
47
|
+
switchback/tiers/tier_5.py
|
|
48
|
+
switchback/tiers/tier_6.py
|
|
49
|
+
switchback/tiers/tier_7.py
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
"""The cost-ordered cascade. Each tier exposes:
|
|
2
|
-
|
|
3
|
-
NAME : str
|
|
4
|
-
PAID : bool # gated/audited if True
|
|
5
|
-
fetch(url) -> str | None # markdown on success; None if not
|
|
6
|
-
# applicable; raises on failure.
|
|
7
|
-
|
|
8
|
-
Order matters — cheapest/cleanest first, paid last.
|
|
9
|
-
"""
|
|
10
|
-
from . import (tier0_apis, tier1_http, tier2_cloudscraper,
|
|
11
|
-
tier3_browser, tier3b_camoufox, tier_residential, tier4_firecrawl)
|
|
12
|
-
|
|
13
|
-
TIERS = [
|
|
14
|
-
tier0_apis,
|
|
15
|
-
tier1_http,
|
|
16
|
-
tier2_cloudscraper,
|
|
17
|
-
tier3_browser,
|
|
18
|
-
tier3b_camoufox, # env-gated Firefox stealth (off by default; orthogonal to T3)
|
|
19
|
-
tier_residential, # residential-IP CDP browser (off unless BU_CDP_URL set)
|
|
20
|
-
tier4_firecrawl,
|
|
21
|
-
]
|
|
22
|
-
|
|
23
|
-
# tier name -> index, for botwall winning-tier routing.
|
|
24
|
-
INDEX = {t.NAME: i for i, t in enumerate(TIERS)}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|