switchback 0.2.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {switchback-0.2.0 → switchback-0.5.0}/.env.example +24 -8
- switchback-0.5.0/CHANGELOG.md +119 -0
- {switchback-0.2.0 → switchback-0.5.0}/PKG-INFO +52 -20
- {switchback-0.2.0 → switchback-0.5.0}/README.md +51 -19
- {switchback-0.2.0 → switchback-0.5.0}/pyproject.toml +1 -1
- {switchback-0.2.0 → switchback-0.5.0}/switchback/api.py +5 -0
- switchback-0.5.0/switchback/doctor.py +59 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/flags.py +2 -2
- {switchback-0.2.0 → switchback-0.5.0}/switchback/orchestrator.py +172 -63
- {switchback-0.2.0 → switchback-0.5.0}/switchback/policy/botwall.py +36 -2
- {switchback-0.2.0 → switchback-0.5.0}/switchback/policy/gates.py +59 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/reporting.py +1 -1
- switchback-0.5.0/switchback/tiers/__init__.py +24 -0
- switchback-0.2.0/switchback/tiers/tier0_apis.py → switchback-0.5.0/switchback/tiers/tier_1.py +8 -4
- switchback-0.2.0/switchback/tiers/tier1_http.py → switchback-0.5.0/switchback/tiers/tier_2.py +6 -2
- switchback-0.2.0/switchback/tiers/tier2_cloudscraper.py → switchback-0.5.0/switchback/tiers/tier_3.py +39 -4
- switchback-0.2.0/switchback/tiers/tier3_browser.py → switchback-0.5.0/switchback/tiers/tier_4.py +46 -5
- switchback-0.2.0/switchback/tiers/tier3b_camoufox.py → switchback-0.5.0/switchback/tiers/tier_5.py +8 -2
- switchback-0.2.0/switchback/tiers/tier_residential.py → switchback-0.5.0/switchback/tiers/tier_6.py +8 -2
- switchback-0.2.0/switchback/tiers/tier4_firecrawl.py → switchback-0.5.0/switchback/tiers/tier_7.py +13 -3
- {switchback-0.2.0 → switchback-0.5.0}/switchback.egg-info/PKG-INFO +52 -20
- {switchback-0.2.0 → switchback-0.5.0}/switchback.egg-info/SOURCES.txt +8 -7
- switchback-0.2.0/CHANGELOG.md +0 -46
- switchback-0.2.0/switchback/tiers/__init__.py +0 -24
- {switchback-0.2.0 → switchback-0.5.0}/CONTRIBUTING.md +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/LICENSE +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/MANIFEST.in +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/NOTICE +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/SECURITY.md +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/clients/node_bridge.md +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/clients/python_client.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/config/botwall_skip_urls.txt +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/config/extraction.example.json +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/setup.cfg +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/__init__.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/__main__.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/concurrency.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/content_cache.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/egress.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/extract.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/normalize.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/policy/__init__.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/py.typed +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/search.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/server.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/session_cache.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/session_trace.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/tiers/_browser.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback/tracing.py +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback.egg-info/dependency_links.txt +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback.egg-info/entry_points.txt +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback.egg-info/requires.txt +0 -0
- {switchback-0.2.0 → switchback-0.5.0}/switchback.egg-info/top_level.txt +0 -0
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
OTEL_SERVICE_NAME=switchback
|
|
12
12
|
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
13
13
|
|
|
14
|
-
# ── Search (
|
|
14
|
+
# ── Search (SearXNG, query → URLs — separate from the fetch cascade) ─────────
|
|
15
15
|
SEARXNG_URL=http://localhost:8888
|
|
16
16
|
|
|
17
17
|
# ── Output format ───────────────────────────────────────────────────────────
|
|
@@ -27,22 +27,38 @@ SEARXNG_URL=http://localhost:8888
|
|
|
27
27
|
# fall back to their text for those sources.
|
|
28
28
|
SCRAPER_OUTPUT_FORMAT=markdown
|
|
29
29
|
|
|
30
|
-
# ──
|
|
31
|
-
#
|
|
32
|
-
|
|
33
|
-
|
|
30
|
+
# ── tier_3 · Cloudflare solver (cloudscraper) ───────────────────────────────
|
|
31
|
+
# Needs the 3.x Enhanced Edition fork (see README); with the frozen PyPI build
|
|
32
|
+
# the tier reports `unavailable` and fails fast. The timeout below caps a single solve
|
|
33
|
+
# so an unsolvable challenge can't eat the per-URL deadline before the browser
|
|
34
|
+
# tier runs. Lower (e.g. 12) if tier_3 rarely wins on your hosts.
|
|
35
|
+
SCRAPER_TIER_3_TIMEOUT_S=25
|
|
34
36
|
|
|
35
|
-
# ──
|
|
37
|
+
# ── tier_5 · Camoufox (Firefox stealth) ─────────────────────────────────────
|
|
36
38
|
# ON by default. Needs: pip install camoufox && camoufox fetch
|
|
37
39
|
# Set to 1 to turn the tier off entirely.
|
|
38
40
|
SCRAPER_DISABLE_CAMOUFOX=
|
|
39
|
-
|
|
41
|
+
SCRAPER_TIER_5_TIMEOUT_S=45
|
|
40
42
|
|
|
41
|
-
# ──
|
|
43
|
+
# ── tier_7 · Firecrawl (paid, last resort) ──────────────────────────────────
|
|
42
44
|
# Required only if this tier runs. Set SCRAPER_DISABLE_FIRECRAWL=1 to skip it.
|
|
43
45
|
FIRECRAWL_API_KEY=
|
|
44
46
|
SCRAPER_DISABLE_FIRECRAWL=
|
|
45
47
|
|
|
48
|
+
# ── Per-tier timeouts (seconds) ─────────────────────────────────────────────
|
|
49
|
+
# Each tier's wall-clock/socket cap; override any of them. Defaults shown below.
|
|
50
|
+
# tier_3 (=25) and tier_5 (=45) are set live in their sections above; the rest
|
|
51
|
+
# fall back to these defaults. The pre-0.5.0 names are still honored when the new
|
|
52
|
+
# var is unset: SCRAPER_CLOUDSCRAPER_TIMEOUT_S → tier_3,
|
|
53
|
+
# SCRAPER_CAMOUFOX_TIMEOUT_MS → tier_5, SCRAPER_RESIDENTIAL_TIMEOUT_MS → tier_6.
|
|
54
|
+
#SCRAPER_TIER_1_TIMEOUT_S=15 # direct APIs / open mirrors
|
|
55
|
+
#SCRAPER_TIER_2_TIMEOUT_S=15 # plain HTTP + TLS impersonation
|
|
56
|
+
#SCRAPER_TIER_3_TIMEOUT_S=25 # cloudscraper (Cloudflare solver)
|
|
57
|
+
#SCRAPER_TIER_4_TIMEOUT_S=15 # stealth headless browser (patchright)
|
|
58
|
+
#SCRAPER_TIER_5_TIMEOUT_S=45 # camoufox (slowest rung; hard CF solves ~40s)
|
|
59
|
+
#SCRAPER_TIER_6_TIMEOUT_S=30 # residential-IP CDP browser
|
|
60
|
+
#SCRAPER_TIER_7_TIMEOUT_S=15 # Firecrawl (paid) — was unbounded; raise if scrapes get cut off
|
|
61
|
+
|
|
46
62
|
# ── Orchestrator ────────────────────────────────────────────────────────────
|
|
47
63
|
# Per-URL wall-clock budget (s), checked between tiers. 45s balances latency vs
|
|
48
64
|
# coverage — roughly fits a Camoufox solve (~40s) that starts after the cheaper
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. Format loosely follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/); this project uses semantic-ish
|
|
5
|
+
versioning while pre-1.0.
|
|
6
|
+
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
## [0.5.0] - 2026-06-30
|
|
10
|
+
|
|
11
|
+
### Changed
|
|
12
|
+
- **Tiers renamed to plain `tier_1`…`tier_7`** (cost-ordered, contiguous) in place
|
|
13
|
+
of the old mixed scheme (`tier0_apis`, `tier1_http`, `tier2_cloudscraper`,
|
|
14
|
+
`tier3_browser`, `tier3b_camoufox`, `tier_residential`, `tier4_firecrawl`). The
|
|
15
|
+
mapping is positional: `tier_1`=apis, `tier_2`=http, `tier_3`=cloudscraper,
|
|
16
|
+
`tier_4`=browser, `tier_5`=camoufox, `tier_6`=residential, `tier_7`=firecrawl.
|
|
17
|
+
**Backwards-compatible:** an existing `state/botwall_db.json` is migrated on load
|
|
18
|
+
(a host's learned `winning_tier` / `tier_stats` keys are remapped to the new
|
|
19
|
+
names), so routing survives the upgrade instead of re-probing from scratch.
|
|
20
|
+
|
|
21
|
+
### Added
|
|
22
|
+
- **Per-tier timeout knobs** — every tier now reads `SCRAPER_TIER_<N>_TIMEOUT_S`
|
|
23
|
+
(seconds, `N` = 1–7). Defaults: `15` for tiers without a prior budget
|
|
24
|
+
(apis/http/browser/firecrawl), and the existing budgets are preserved — `tier_3`=25,
|
|
25
|
+
`tier_5`=45, `tier_6`=30. The previously-unconfigurable/unbounded tiers (apis,
|
|
26
|
+
http, browser, **firecrawl**) are now bounded and overridable. The pre-rename
|
|
27
|
+
`SCRAPER_CLOUDSCRAPER_TIMEOUT_S` / `SCRAPER_CAMOUFOX_TIMEOUT_MS` /
|
|
28
|
+
`SCRAPER_RESIDENTIAL_TIMEOUT_MS` are still honored when the new var is unset.
|
|
29
|
+
Note: `tier_7` (paid Firecrawl) was previously unbounded; its new `15`s default
|
|
30
|
+
bounds it — raise `SCRAPER_TIER_7_TIMEOUT_S` if hard hosts get cut off (a scrape
|
|
31
|
+
killed at the cap may still be billed). `SCRAPER_TIER_RETRIES_<TIER>` overrides
|
|
32
|
+
follow the new names (e.g. `SCRAPER_TIER_RETRIES_TIER_4`).
|
|
33
|
+
|
|
34
|
+
## [0.4.0] - 2026-06-29
|
|
35
|
+
|
|
36
|
+
### Added
|
|
37
|
+
- **Configurable per-tier retries** — a tier can now re-attempt before falling
|
|
38
|
+
through to the next, more capable one. `SCRAPER_TIER_RETRIES` (global, default
|
|
39
|
+
`0` = off; `N` → up to `1+N` tries per tier), per-tier overrides
|
|
40
|
+
`SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER3_BROWSER=2`), and
|
|
41
|
+
`SCRAPER_TIER_RETRY_ON` (retryable failure classes; default
|
|
42
|
+
`timeout,rate_limited,connection` — widen to include `botwall,http_block` behind
|
|
43
|
+
a rotating residential proxy, where each retry gets a fresh IP). Retries stay
|
|
44
|
+
bounded by `SCRAPER_DEADLINE_S`, and intermediate retries are traced/logged but
|
|
45
|
+
**not** persisted to the botwall policy DB, so they never inflate the
|
|
46
|
+
self-healing skip / `needs_egress` counters. Default `0` keeps behaviour
|
|
47
|
+
unchanged. Enabling retries on the paid Firecrawl tier bills per attempt.
|
|
48
|
+
|
|
49
|
+
### Fixed
|
|
50
|
+
- **Quality gate rejects content shells** — the gate no longer passes a page just
|
|
51
|
+
because it clears the length floor; thin "shell" pages (nav/boilerplate with no
|
|
52
|
+
real article body) are now treated as a tier miss so the cascade falls through.
|
|
53
|
+
- **Paid last-resort budget reserve** — `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S`
|
|
54
|
+
(default 25s) stops starting local tiers once enough of the per-URL deadline has
|
|
55
|
+
elapsed and an enabled paid tier is still ahead, so a hard host can't burn the
|
|
56
|
+
whole budget before Firecrawl gets a turn.
|
|
57
|
+
|
|
58
|
+
## [0.3.0] - 2026-06-27
|
|
59
|
+
|
|
60
|
+
### Added
|
|
61
|
+
- **`unavailable` tier outcome** — when a tier's optional dependency is missing,
|
|
62
|
+
the wrong version, or not installed yet (frozen PyPI `cloudscraper` instead of
|
|
63
|
+
the 3.x stealth fork; patchright's Chromium not downloaded during an async
|
|
64
|
+
cold-start install), the tier now fails fast (~0ms) with a distinct
|
|
65
|
+
`unavailable` outcome carrying the exact install command, logged once per tier.
|
|
66
|
+
It ranks above bot-wall in the verdict, so an environment problem is no longer
|
|
67
|
+
masked as `botwall` — and a missing Tier 2 (now `tier_3`) dependency no longer burns the
|
|
68
|
+
per-URL solve budget before the browser tier runs.
|
|
69
|
+
- **`switchback --doctor`** — preflight tier-readiness check (doubles as a
|
|
70
|
+
healthcheck: exit 0 when the capable tiers are ready). Reports whether
|
|
71
|
+
cloudscraper is the stealth-capable 3.x fork, patchright + Chromium are
|
|
72
|
+
installed, Camoufox/Node are present, and Firecrawl is configured. Built for
|
|
73
|
+
cold-start deploys where the browser is installed by a background thread after
|
|
74
|
+
boot.
|
|
75
|
+
|
|
76
|
+
### Docs
|
|
77
|
+
- README **Production / cold-start deployment** section and a `.env.example`
|
|
78
|
+
Tier 2 block: install `patchright install chromium` in the post-boot step, the
|
|
79
|
+
cloudscraper 3.x fork requirement, Node.js for Tier 2 concurrency, and the
|
|
80
|
+
`SCRAPER_CLOUDSCRAPER_TIMEOUT_S` budget knob.
|
|
81
|
+
|
|
82
|
+
## [0.2.0] - 2026-06-25
|
|
83
|
+
|
|
84
|
+
### Added
|
|
85
|
+
- **Selectable output formats** — `SCRAPER_OUTPUT_FORMAT` (or per-call
|
|
86
|
+
`scrape(fmt=...)`, CLI `--format`, `/scrape` `{"format": ...}`) selects the
|
|
87
|
+
content shape: `markdown` (default, unchanged), `markdown_trimmed` (extra
|
|
88
|
+
ad/nav/boilerplate removed), `html` (raw), or `html_selectors` (cleaned HTML
|
|
89
|
+
with per-domain `drop`/`selector` applied). Default output is byte-identical;
|
|
90
|
+
html-family results use an `html` JSON key instead of `markdown`.
|
|
91
|
+
|
|
92
|
+
## [0.1.0] - 2026-06-23
|
|
93
|
+
|
|
94
|
+
### Added
|
|
95
|
+
- **Challenge-type learning** — bot-walls are classified by vendor (Cloudflare,
|
|
96
|
+
DataDome, Akamai, PerimeterX, Incapsula, Google) and counted per host in the
|
|
97
|
+
botwall DB; the vendor is attached to each event and OTel span (`scrape.challenge`).
|
|
98
|
+
- **Metrics & reporting** — `switchback.reporting` rolls the event log + botwall DB
|
|
99
|
+
into cost-savings-vs-Firecrawl, coverage, overall/per-tier/per-domain latency
|
|
100
|
+
(mean/median/min/max/p50/p95), outcomes, error codes by domain, and challenges
|
|
101
|
+
by domain. Exposed via `GET /metrics` and `GET /metrics/domains` (both accept
|
|
102
|
+
`?minutes=N`).
|
|
103
|
+
- **Periodic flagging** — `python -m switchback.flags` emits a cron-friendly digest
|
|
104
|
+
(domains stuck on Firecrawl, escalated to egress, most-challenged) to logs/OTel.
|
|
105
|
+
- **Content cache** — optional URL→result cache (`SCRAPER_CONTENT_TTL_S`, sqlite,
|
|
106
|
+
off by default) short-circuits re-scrapes before any tier runs.
|
|
107
|
+
- **Login-session refresh** — `SCRAPER_LOGIN_HOOK` (`pkg.module:func`) refreshes a
|
|
108
|
+
dead logged-in session on demand; cookies overlay every tier and persist.
|
|
109
|
+
- **Exponential backoff** — between-tier backoff with jitter after rate-limit /
|
|
110
|
+
timeout (`SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS`, off by default).
|
|
111
|
+
- **Per-domain extraction prefs** — `config/extraction.json` (CSS scope selector +
|
|
112
|
+
extra drops) applied automatically in the normalize step for every tier.
|
|
113
|
+
- **Session traces** — opt-in Playwright trace capture (`SCRAPER_TRACE_SESSION=1`)
|
|
114
|
+
for browser tiers, with `GET/DELETE /traces` management endpoints.
|
|
115
|
+
|
|
116
|
+
### Changed
|
|
117
|
+
- Tier 2's (now `tier_3`) `cloudscraper` moved from a core dependency (which pinned a git-URL
|
|
118
|
+
fork PyPI can't publish) to the `cloudflare` extra; see the README for installing
|
|
119
|
+
the 3.x Enhanced Edition fork for full stealth.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: switchback
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
|
|
5
5
|
Author-email: Akash Kodavuru <akash@theaklabs.com>
|
|
6
6
|
License: MIT
|
|
@@ -121,13 +121,13 @@ That's the whole loop. Add tiers as you need them (see [Install](#install)).
|
|
|
121
121
|
|
|
122
122
|
| Tier | Strategy | Cost |
|
|
123
123
|
|---|---|---|
|
|
124
|
-
|
|
|
125
|
-
|
|
|
126
|
-
|
|
|
127
|
-
|
|
|
128
|
-
|
|
|
129
|
-
|
|
|
130
|
-
|
|
|
124
|
+
| tier_1 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
|
|
125
|
+
| tier_2 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
|
|
126
|
+
| tier_3 | Cloudflare / anti-bot solver (`cloudscraper`, install `switchback[cloudflare]`) | cheap-ish (~5s/host) |
|
|
127
|
+
| tier_4 | Stealth headless browser (`patchright`, Chromium) | heavy |
|
|
128
|
+
| tier_5 | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
|
|
129
|
+
| tier_6 | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
|
|
130
|
+
| tier_7 | Firecrawl (paid, env-gated, audited) | paid, last resort |
|
|
131
131
|
|
|
132
132
|
Every URL has a wall-clock budget (`SCRAPER_DEADLINE_S`, default 45s) checked between
|
|
133
133
|
tiers so one URL can't run the whole cascade of timeouts. Each tier attempt records
|
|
@@ -142,17 +142,17 @@ Search (query → URLs) is separate from the scrape cascade: `switchback.search(
|
|
|
142
142
|
## Install
|
|
143
143
|
|
|
144
144
|
```bash
|
|
145
|
-
pip install switchback # core: normalization + cheap tiers (
|
|
146
|
-
pip install "switchback[cloudflare]" # +
|
|
145
|
+
pip install switchback # core: normalization + cheap tiers (tier_1/tier_2) + search
|
|
146
|
+
pip install "switchback[cloudflare]" # + tier_3 Cloudflare/anti-bot solver (cloudscraper)
|
|
147
147
|
pip install "switchback[server]" # + HTTP service (fastapi, uvicorn) incl. /metrics + /traces
|
|
148
|
-
pip install "switchback[browser]" && patchright install chromium # +
|
|
149
|
-
pip install "switchback[camoufox]" && camoufox fetch # +
|
|
150
|
-
pip install "switchback[firecrawl]" # +
|
|
148
|
+
pip install "switchback[browser]" && patchright install chromium # + tier_4 stealth Chromium
|
|
149
|
+
pip install "switchback[camoufox]" && camoufox fetch # + tier_5 Firefox stealth
|
|
150
|
+
pip install "switchback[firecrawl]" # + tier_7 paid API (needs FIRECRAWL_API_KEY)
|
|
151
151
|
pip install "switchback[tracing]" # + OpenTelemetry -> any OTLP backend
|
|
152
152
|
pip install "switchback[all]" # everything
|
|
153
153
|
```
|
|
154
154
|
|
|
155
|
-
For
|
|
155
|
+
For tier_3's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
|
|
156
156
|
3.x fork (PyPI's `cloudscraper` is the older v1/v2 — PyPI forbids pinning a
|
|
157
157
|
git-URL dep inside a published package, so install it alongside):
|
|
158
158
|
|
|
@@ -163,6 +163,35 @@ pip install "cloudscraper @ git+https://github.com/VeNoMouS/cloudscraper@3.0.0"
|
|
|
163
163
|
Or run the whole thing as a container:
|
|
164
164
|
`docker build -t switchback . && docker run -p 8799:8799 switchback`.
|
|
165
165
|
|
|
166
|
+
### Production / cold-start deployment
|
|
167
|
+
|
|
168
|
+
The two heavy tiers pull dependencies that often can't be baked into a base image
|
|
169
|
+
and land *after* boot (e.g. an async install thread on Azure). Until they're
|
|
170
|
+
ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
|
|
171
|
+
fix) and the cascade falls through — they are never silently skipped. Checklist:
|
|
172
|
+
|
|
173
|
+
- **tier_4 is the real workhorse for Cloudflare/JS sites** — make sure its browser
|
|
174
|
+
is installed: `patchright install chromium` (note: **patchright**, not vanilla
|
|
175
|
+
`playwright`). On a cold start, run this in your post-boot install step/thread;
|
|
176
|
+
tier_4 flips to ready once it finishes.
|
|
177
|
+
- **tier_3 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
|
|
178
|
+
frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
|
|
179
|
+
solve budget) instead of erroring mid-cascade. tier_3 is a *weak* solver for
|
|
180
|
+
modern Cloudflare — treat it as a cheap try before the browser, not the primary.
|
|
181
|
+
- **Install Node.js** for tier_3's v3 JS-VM challenges — faster and thread-safe
|
|
182
|
+
vs. the pure-Python js2py fallback (relevant under concurrent load).
|
|
183
|
+
- **Bound tier_3's solve budget** with `SCRAPER_TIER_3_TIMEOUT_S` (default `25`;
|
|
184
|
+
the old `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` is still honored) so an unsolvable
|
|
185
|
+
challenge can't eat the per-URL deadline before the browser tier runs. Lower it
|
|
186
|
+
(e.g. `12`) if tier_3 rarely wins on your hosts.
|
|
187
|
+
|
|
188
|
+
**Verify readiness on the box** with the preflight check (doubles as a healthcheck
|
|
189
|
+
— exit 0 when the capable tiers are ready):
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
switchback --doctor # or: python -m switchback --doctor
|
|
193
|
+
```
|
|
194
|
+
|
|
166
195
|
## Use it from your app
|
|
167
196
|
|
|
168
197
|
Three interchangeable entry points — all return the same shape
|
|
@@ -254,16 +283,16 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
254
283
|
<details>
|
|
255
284
|
<summary><b>Env gates</b> — enable/disable tiers and integrations</summary>
|
|
256
285
|
|
|
257
|
-
- `SCRAPER_DISABLE_FIRECRAWL` — skip
|
|
258
|
-
- `FIRECRAWL_API_KEY` — enable
|
|
259
|
-
- `SCRAPER_DISABLE_CAMOUFOX` — turn off
|
|
260
|
-
- `BU_CDP_URL` — enable
|
|
286
|
+
- `SCRAPER_DISABLE_FIRECRAWL` — skip tier_7
|
|
287
|
+
- `FIRECRAWL_API_KEY` — enable tier_7
|
|
288
|
+
- `SCRAPER_DISABLE_CAMOUFOX` — turn off tier_5 (on by default; needs `pip install camoufox` + `camoufox fetch`)
|
|
289
|
+
- `BU_CDP_URL` — enable tier_6 residential browser by pointing at a CDP endpoint
|
|
261
290
|
- `SCRAPER_PROXY` — route *all* tiers/URLs through a proxy
|
|
262
291
|
- `SCRAPER_EGRESS_PROXY` — route only walled hosts through a proxy (see [Cost-scoped residential egress](#cost-scoped-residential-egress))
|
|
263
292
|
- `SEARXNG_URL` — defaults to `http://localhost:8888`
|
|
264
293
|
- `SCRAPER_STATE_DIR` — where the botwall DB/event log + session cache live
|
|
265
294
|
- `SCRAPER_COOKIES_FILE` — Netscape `cookies.txt` to scrape login-gated hosts (injected into the HTTP and browser tiers)
|
|
266
|
-
- `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into
|
|
295
|
+
- `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into tier_3 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
|
|
267
296
|
</details>
|
|
268
297
|
|
|
269
298
|
<details>
|
|
@@ -271,7 +300,8 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
271
300
|
|
|
272
301
|
- `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
|
|
273
302
|
- `SCRAPER_DEADLINE_S` — per-URL budget (45s)
|
|
274
|
-
- `
|
|
303
|
+
- `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
|
|
304
|
+
- `SCRAPER_TIER_<N>_TIMEOUT_S` — per-tier timeout in seconds, `N` = 1–7 (tier_1 apis · tier_2 http · tier_3 cloudscraper · tier_4 browser · tier_5 camoufox · tier_6 residential · tier_7 firecrawl). Defaults: 15/15/**25**/15/**45**/**30**/15 (the three bold ones keep their prior budgets; everything else is 15s). The pre-0.5.0 `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` / `SCRAPER_CAMOUFOX_TIMEOUT_MS` / `SCRAPER_RESIDENTIAL_TIMEOUT_MS` are still honored when the new var is unset. Note: `tier_7` (Firecrawl) was previously unbounded — its 15s default now bounds the paid tier, so raise `SCRAPER_TIER_7_TIMEOUT_S` if slow hosts get cut off
|
|
275
305
|
- `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
|
|
276
306
|
- `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
|
|
277
307
|
- `SCRAPER_BOTWALL_EGRESS_AFTER` — local-tier failures before a host escalates to the residential tier (default 2)
|
|
@@ -279,6 +309,8 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
279
309
|
- `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
|
|
280
310
|
- `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
|
|
281
311
|
- `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
|
|
312
|
+
- `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER_4=2`)
|
|
313
|
+
- `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
|
|
282
314
|
- `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
|
|
283
315
|
- `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
|
|
284
316
|
- `SCRAPER_TRACE_SESSION` — opt-in: capture a Playwright trace (screenshots + DOM + network) per browser-tier attempt, written to `state/traces/`
|
|
@@ -62,13 +62,13 @@ That's the whole loop. Add tiers as you need them (see [Install](#install)).
|
|
|
62
62
|
|
|
63
63
|
| Tier | Strategy | Cost |
|
|
64
64
|
|---|---|---|
|
|
65
|
-
|
|
|
66
|
-
|
|
|
67
|
-
|
|
|
68
|
-
|
|
|
69
|
-
|
|
|
70
|
-
|
|
|
71
|
-
|
|
|
65
|
+
| tier_1 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
|
|
66
|
+
| tier_2 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
|
|
67
|
+
| tier_3 | Cloudflare / anti-bot solver (`cloudscraper`, install `switchback[cloudflare]`) | cheap-ish (~5s/host) |
|
|
68
|
+
| tier_4 | Stealth headless browser (`patchright`, Chromium) | heavy |
|
|
69
|
+
| tier_5 | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
|
|
70
|
+
| tier_6 | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
|
|
71
|
+
| tier_7 | Firecrawl (paid, env-gated, audited) | paid, last resort |
|
|
72
72
|
|
|
73
73
|
Every URL has a wall-clock budget (`SCRAPER_DEADLINE_S`, default 45s) checked between
|
|
74
74
|
tiers so one URL can't run the whole cascade of timeouts. Each tier attempt records
|
|
@@ -83,17 +83,17 @@ Search (query → URLs) is separate from the scrape cascade: `switchback.search(
|
|
|
83
83
|
## Install
|
|
84
84
|
|
|
85
85
|
```bash
|
|
86
|
-
pip install switchback # core: normalization + cheap tiers (
|
|
87
|
-
pip install "switchback[cloudflare]" # +
|
|
86
|
+
pip install switchback # core: normalization + cheap tiers (tier_1/tier_2) + search
|
|
87
|
+
pip install "switchback[cloudflare]" # + tier_3 Cloudflare/anti-bot solver (cloudscraper)
|
|
88
88
|
pip install "switchback[server]" # + HTTP service (fastapi, uvicorn) incl. /metrics + /traces
|
|
89
|
-
pip install "switchback[browser]" && patchright install chromium # +
|
|
90
|
-
pip install "switchback[camoufox]" && camoufox fetch # +
|
|
91
|
-
pip install "switchback[firecrawl]" # +
|
|
89
|
+
pip install "switchback[browser]" && patchright install chromium # + tier_4 stealth Chromium
|
|
90
|
+
pip install "switchback[camoufox]" && camoufox fetch # + tier_5 Firefox stealth
|
|
91
|
+
pip install "switchback[firecrawl]" # + tier_7 paid API (needs FIRECRAWL_API_KEY)
|
|
92
92
|
pip install "switchback[tracing]" # + OpenTelemetry -> any OTLP backend
|
|
93
93
|
pip install "switchback[all]" # everything
|
|
94
94
|
```
|
|
95
95
|
|
|
96
|
-
For
|
|
96
|
+
For tier_3's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
|
|
97
97
|
3.x fork (PyPI's `cloudscraper` is the older v1/v2 — PyPI forbids pinning a
|
|
98
98
|
git-URL dep inside a published package, so install it alongside):
|
|
99
99
|
|
|
@@ -104,6 +104,35 @@ pip install "cloudscraper @ git+https://github.com/VeNoMouS/cloudscraper@3.0.0"
|
|
|
104
104
|
Or run the whole thing as a container:
|
|
105
105
|
`docker build -t switchback . && docker run -p 8799:8799 switchback`.
|
|
106
106
|
|
|
107
|
+
### Production / cold-start deployment
|
|
108
|
+
|
|
109
|
+
The two heavy tiers pull dependencies that often can't be baked into a base image
|
|
110
|
+
and land *after* boot (e.g. an async install thread on Azure). Until they're
|
|
111
|
+
ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
|
|
112
|
+
fix) and the cascade falls through — they are never silently skipped. Checklist:
|
|
113
|
+
|
|
114
|
+
- **tier_4 is the real workhorse for Cloudflare/JS sites** — make sure its browser
|
|
115
|
+
is installed: `patchright install chromium` (note: **patchright**, not vanilla
|
|
116
|
+
`playwright`). On a cold start, run this in your post-boot install step/thread;
|
|
117
|
+
tier_4 flips to ready once it finishes.
|
|
118
|
+
- **tier_3 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
|
|
119
|
+
frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
|
|
120
|
+
solve budget) instead of erroring mid-cascade. tier_3 is a *weak* solver for
|
|
121
|
+
modern Cloudflare — treat it as a cheap try before the browser, not the primary.
|
|
122
|
+
- **Install Node.js** for tier_3's v3 JS-VM challenges — faster and thread-safe
|
|
123
|
+
vs. the pure-Python js2py fallback (relevant under concurrent load).
|
|
124
|
+
- **Bound tier_3's solve budget** with `SCRAPER_TIER_3_TIMEOUT_S` (default `25`;
|
|
125
|
+
the old `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` is still honored) so an unsolvable
|
|
126
|
+
challenge can't eat the per-URL deadline before the browser tier runs. Lower it
|
|
127
|
+
(e.g. `12`) if tier_3 rarely wins on your hosts.
|
|
128
|
+
|
|
129
|
+
**Verify readiness on the box** with the preflight check (doubles as a healthcheck
|
|
130
|
+
— exit 0 when the capable tiers are ready):
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
switchback --doctor # or: python -m switchback --doctor
|
|
134
|
+
```
|
|
135
|
+
|
|
107
136
|
## Use it from your app
|
|
108
137
|
|
|
109
138
|
Three interchangeable entry points — all return the same shape
|
|
@@ -195,16 +224,16 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
195
224
|
<details>
|
|
196
225
|
<summary><b>Env gates</b> — enable/disable tiers and integrations</summary>
|
|
197
226
|
|
|
198
|
-
- `SCRAPER_DISABLE_FIRECRAWL` — skip
|
|
199
|
-
- `FIRECRAWL_API_KEY` — enable
|
|
200
|
-
- `SCRAPER_DISABLE_CAMOUFOX` — turn off
|
|
201
|
-
- `BU_CDP_URL` — enable
|
|
227
|
+
- `SCRAPER_DISABLE_FIRECRAWL` — skip tier_7
|
|
228
|
+
- `FIRECRAWL_API_KEY` — enable tier_7
|
|
229
|
+
- `SCRAPER_DISABLE_CAMOUFOX` — turn off tier_5 (on by default; needs `pip install camoufox` + `camoufox fetch`)
|
|
230
|
+
- `BU_CDP_URL` — enable tier_6 residential browser by pointing at a CDP endpoint
|
|
202
231
|
- `SCRAPER_PROXY` — route *all* tiers/URLs through a proxy
|
|
203
232
|
- `SCRAPER_EGRESS_PROXY` — route only walled hosts through a proxy (see [Cost-scoped residential egress](#cost-scoped-residential-egress))
|
|
204
233
|
- `SEARXNG_URL` — defaults to `http://localhost:8888`
|
|
205
234
|
- `SCRAPER_STATE_DIR` — where the botwall DB/event log + session cache live
|
|
206
235
|
- `SCRAPER_COOKIES_FILE` — Netscape `cookies.txt` to scrape login-gated hosts (injected into the HTTP and browser tiers)
|
|
207
|
-
- `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into
|
|
236
|
+
- `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into tier_3 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
|
|
208
237
|
</details>
|
|
209
238
|
|
|
210
239
|
<details>
|
|
@@ -212,7 +241,8 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
212
241
|
|
|
213
242
|
- `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
|
|
214
243
|
- `SCRAPER_DEADLINE_S` — per-URL budget (45s)
|
|
215
|
-
- `
|
|
244
|
+
- `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
|
|
245
|
+
- `SCRAPER_TIER_<N>_TIMEOUT_S` — per-tier timeout in seconds, `N` = 1–7 (tier_1 apis · tier_2 http · tier_3 cloudscraper · tier_4 browser · tier_5 camoufox · tier_6 residential · tier_7 firecrawl). Defaults: 15/15/**25**/15/**45**/**30**/15 (the three bold ones keep their prior budgets; everything else is 15s). The pre-0.5.0 `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` / `SCRAPER_CAMOUFOX_TIMEOUT_MS` / `SCRAPER_RESIDENTIAL_TIMEOUT_MS` are still honored when the new var is unset. Note: `tier_7` (Firecrawl) was previously unbounded — its 15s default now bounds the paid tier, so raise `SCRAPER_TIER_7_TIMEOUT_S` if slow hosts get cut off
|
|
216
246
|
- `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
|
|
217
247
|
- `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
|
|
218
248
|
- `SCRAPER_BOTWALL_EGRESS_AFTER` — local-tier failures before a host escalates to the residential tier (default 2)
|
|
@@ -220,6 +250,8 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
220
250
|
- `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
|
|
221
251
|
- `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
|
|
222
252
|
- `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
|
|
253
|
+
- `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER_4=2`)
|
|
254
|
+
- `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
|
|
223
255
|
- `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
|
|
224
256
|
- `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
|
|
225
257
|
- `SCRAPER_TRACE_SESSION` — opt-in: capture a Playwright trace (screenshots + DOM + network) per browser-tier attempt, written to `state/traces/`
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "switchback"
|
|
7
|
-
version = "0.
|
|
7
|
+
version = "0.5.0"
|
|
8
8
|
description = "One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -55,6 +55,7 @@ def _main() -> int:
|
|
|
55
55
|
_os.environ[_k] = _v.strip()
|
|
56
56
|
usage = ("usage: switchback [--format FMT] <url> [<url> ...]\n"
|
|
57
57
|
" switchback --search <query ...>\n"
|
|
58
|
+
" switchback --doctor\n"
|
|
58
59
|
" (or: python -m switchback <url> ...)\n"
|
|
59
60
|
" FMT: markdown (default) | markdown_trimmed | html | html_selectors")
|
|
60
61
|
# --help/-h is an explicit request: usage to stdout, exit 0 (don't treat it
|
|
@@ -62,6 +63,10 @@ def _main() -> int:
|
|
|
62
63
|
if any(a in ("--help", "-h") for a in sys.argv[1:]):
|
|
63
64
|
print(usage)
|
|
64
65
|
return 0
|
|
66
|
+
# --doctor: preflight tier-readiness report (no scrape). Side-effect-free.
|
|
67
|
+
if "--doctor" in sys.argv[1:]:
|
|
68
|
+
from .doctor import report
|
|
69
|
+
return report()
|
|
65
70
|
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
|
66
71
|
setup_logs() # also ship logs to the OTLP backend when configured
|
|
67
72
|
if len(sys.argv) < 2:
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Preflight readiness check — `switchback --doctor`.
|
|
2
|
+
|
|
3
|
+
Reports which tiers can actually run on this box and, when one can't, the exact
|
|
4
|
+
fix. Built for cold-start deploys (e.g. Azure) where the stealth browser is
|
|
5
|
+
installed by a background thread *after* boot: run this to confirm the tiers are
|
|
6
|
+
live before sending traffic, or to see why tier_3/tier_4 aren't catching anything.
|
|
7
|
+
|
|
8
|
+
Exit code: 0 if both capable local tiers (cloudscraper + browser) are ready,
|
|
9
|
+
else 1 — so it doubles as a healthcheck.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
import shutil
|
|
15
|
+
|
|
16
|
+
from .tiers import tier_3, tier_4
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _camoufox() -> tuple[bool, str]:
|
|
20
|
+
if os.getenv("SCRAPER_DISABLE_CAMOUFOX"):
|
|
21
|
+
return False, "off (SCRAPER_DISABLE_CAMOUFOX set)"
|
|
22
|
+
try:
|
|
23
|
+
import camoufox # noqa: F401
|
|
24
|
+
except ImportError:
|
|
25
|
+
return False, 'not installed — pip install "switchback[camoufox]" && camoufox fetch'
|
|
26
|
+
return True, "camoufox installed"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def probe() -> list[tuple[str, bool, str]]:
|
|
30
|
+
"""(label, ok, detail) for each tier/dependency that matters at runtime."""
|
|
31
|
+
cs_ok, cs_detail = tier_3.available()
|
|
32
|
+
br_ok, br_detail = tier_4.available()
|
|
33
|
+
node = shutil.which("node")
|
|
34
|
+
return [
|
|
35
|
+
("tier_3 (cloudscraper)", cs_ok, cs_detail),
|
|
36
|
+
("tier_4 (browser)", br_ok, br_detail),
|
|
37
|
+
("tier_5 (camoufox)", *_camoufox()),
|
|
38
|
+
("node (tier_3 v3 concurrency)", bool(node),
|
|
39
|
+
node or "not on PATH — tier_3 falls back to slower, thread-fragile js2py"),
|
|
40
|
+
("tier_7 (firecrawl)", bool(os.getenv("FIRECRAWL_API_KEY")),
|
|
41
|
+
"FIRECRAWL_API_KEY set" if os.getenv("FIRECRAWL_API_KEY")
|
|
42
|
+
else "off (no FIRECRAWL_API_KEY)"),
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def report() -> int:
|
|
47
|
+
rows = probe()
|
|
48
|
+
print("switchback doctor — tier readiness\n")
|
|
49
|
+
for label, ok, detail in rows:
|
|
50
|
+
mark = "OK " if ok else "MISS"
|
|
51
|
+
print(f" [{mark}] {label:30} {detail}")
|
|
52
|
+
cs_ok = rows[0][1]
|
|
53
|
+
br_ok = rows[1][1]
|
|
54
|
+
if cs_ok and br_ok:
|
|
55
|
+
print("\nCapable tiers ready.")
|
|
56
|
+
return 0
|
|
57
|
+
print("\nOne or more capable tiers are unavailable (see above). On a cold "
|
|
58
|
+
"start this may resolve once the async install thread finishes.")
|
|
59
|
+
return 1
|
|
@@ -10,7 +10,7 @@ Run it from cron / the /loop skill / any scheduler:
|
|
|
10
10
|
python -m switchback.flags --json # machine-readable digest
|
|
11
11
|
|
|
12
12
|
What it flags:
|
|
13
|
-
• domains still landing on paid Firecrawl (winning_tier ==
|
|
13
|
+
• domains still landing on paid Firecrawl (winning_tier == tier_7)
|
|
14
14
|
• domains escalated to residential egress (needs_egress)
|
|
15
15
|
• domains throwing the most bot-wall challenges (by vendor)
|
|
16
16
|
• low coverage / negative cost savings in the window
|
|
@@ -29,7 +29,7 @@ logger = logging.getLogger(__name__)
|
|
|
29
29
|
|
|
30
30
|
# A domain is "stuck" if its winning tier is the paid one — these are the hosts
|
|
31
31
|
# that still cost money and are the prime targets for a new tier / cookie / rule.
|
|
32
|
-
_PAID_TIER = "
|
|
32
|
+
_PAID_TIER = "tier_7"
|
|
33
33
|
|
|
34
34
|
|
|
35
35
|
def build_digest(minutes: int | None = None) -> dict:
|