switchback 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- switchback-0.1.0/.env.example +70 -0
- switchback-0.1.0/CHANGELOG.md +34 -0
- switchback-0.1.0/CONTRIBUTING.md +35 -0
- switchback-0.1.0/LICENSE +21 -0
- switchback-0.1.0/MANIFEST.in +12 -0
- switchback-0.1.0/NOTICE +34 -0
- switchback-0.1.0/PKG-INFO +325 -0
- switchback-0.1.0/README.md +266 -0
- switchback-0.1.0/SECURITY.md +27 -0
- switchback-0.1.0/clients/node_bridge.md +44 -0
- switchback-0.1.0/clients/python_client.py +87 -0
- switchback-0.1.0/config/botwall_skip_urls.txt +11 -0
- switchback-0.1.0/config/extraction.example.json +10 -0
- switchback-0.1.0/pyproject.toml +77 -0
- switchback-0.1.0/setup.cfg +4 -0
- switchback-0.1.0/switchback/__init__.py +12 -0
- switchback-0.1.0/switchback/__main__.py +4 -0
- switchback-0.1.0/switchback/api.py +81 -0
- switchback-0.1.0/switchback/concurrency.py +37 -0
- switchback-0.1.0/switchback/content_cache.py +94 -0
- switchback-0.1.0/switchback/egress.py +108 -0
- switchback-0.1.0/switchback/extract.py +56 -0
- switchback-0.1.0/switchback/flags.py +96 -0
- switchback-0.1.0/switchback/normalize.py +81 -0
- switchback-0.1.0/switchback/orchestrator.py +343 -0
- switchback-0.1.0/switchback/policy/__init__.py +0 -0
- switchback-0.1.0/switchback/policy/botwall.py +393 -0
- switchback-0.1.0/switchback/policy/gates.py +173 -0
- switchback-0.1.0/switchback/py.typed +0 -0
- switchback-0.1.0/switchback/reporting.py +236 -0
- switchback-0.1.0/switchback/search.py +39 -0
- switchback-0.1.0/switchback/server.py +114 -0
- switchback-0.1.0/switchback/session_cache.py +274 -0
- switchback-0.1.0/switchback/session_trace.py +96 -0
- switchback-0.1.0/switchback/tiers/__init__.py +24 -0
- switchback-0.1.0/switchback/tiers/_browser.py +50 -0
- switchback-0.1.0/switchback/tiers/tier0_apis.py +77 -0
- switchback-0.1.0/switchback/tiers/tier1_http.py +65 -0
- switchback-0.1.0/switchback/tiers/tier2_cloudscraper.py +135 -0
- switchback-0.1.0/switchback/tiers/tier3_browser.py +59 -0
- switchback-0.1.0/switchback/tiers/tier3b_camoufox.py +89 -0
- switchback-0.1.0/switchback/tiers/tier4_firecrawl.py +48 -0
- switchback-0.1.0/switchback/tiers/tier_residential.py +57 -0
- switchback-0.1.0/switchback/tracing.py +152 -0
- switchback-0.1.0/switchback.egg-info/PKG-INFO +325 -0
- switchback-0.1.0/switchback.egg-info/SOURCES.txt +48 -0
- switchback-0.1.0/switchback.egg-info/dependency_links.txt +1 -0
- switchback-0.1.0/switchback.egg-info/entry_points.txt +3 -0
- switchback-0.1.0/switchback.egg-info/requires.txt +38 -0
- switchback-0.1.0/switchback.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# switchback — environment configuration
|
|
2
|
+
#
|
|
3
|
+
# The engine reads these via os.getenv; it does NOT auto-load this file.
|
|
4
|
+
# To use it: run "cp .env.example .env", then load it before running:
|
|
5
|
+
# set -a; source .env; set +a
|
|
6
|
+
# python -m switchback.api <url>
|
|
7
|
+
# Everything here has a sane default or is optional — an empty .env still runs.
|
|
8
|
+
|
|
9
|
+
# ── Tracing → OTLP backend (OTLP/gRPC) ────────────────────────────────────────────
|
|
10
|
+
# Spans/logs export here; if unreachable the engine degrades gracefully.
|
|
11
|
+
OTEL_SERVICE_NAME=switchback
|
|
12
|
+
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
13
|
+
|
|
14
|
+
# ── Search (Tier-0 SearXNG, query → URLs) ───────────────────────────────────
|
|
15
|
+
SEARXNG_URL=http://localhost:8888
|
|
16
|
+
|
|
17
|
+
# ── Tier 2.5 · Jina Reader (r.jina.ai) ──────────────────────────────────────
|
|
18
|
+
# Optional: keyless works at 20 RPM. A key gives 500 RPM + a 10M-token grant.
|
|
19
|
+
JINA_API_KEY=
|
|
20
|
+
SCRAPER_JINA_TIMEOUT_S=20
|
|
21
|
+
|
|
22
|
+
# ── Tier 3b · Camoufox (Firefox stealth) ────────────────────────────────────
|
|
23
|
+
# ON by default. Needs: pip install camoufox && camoufox fetch
|
|
24
|
+
# Set to 1 to turn the tier off entirely.
|
|
25
|
+
SCRAPER_DISABLE_CAMOUFOX=
|
|
26
|
+
SCRAPER_CAMOUFOX_TIMEOUT_MS=45000
|
|
27
|
+
|
|
28
|
+
# ── Tier 4 · Firecrawl (paid, last resort) ──────────────────────────────────
|
|
29
|
+
# Required only if this tier runs. Set SCRAPER_DISABLE_FIRECRAWL=1 to skip it.
|
|
30
|
+
FIRECRAWL_API_KEY=
|
|
31
|
+
SCRAPER_DISABLE_FIRECRAWL=
|
|
32
|
+
|
|
33
|
+
# ── Orchestrator ────────────────────────────────────────────────────────────
|
|
34
|
+
# Per-URL wall-clock budget (s), checked between tiers. 45s balances latency vs
|
|
35
|
+
# coverage — roughly fits a Camoufox solve (~40s) that starts after the cheaper
|
|
36
|
+
# tiers fail fast. Lower toward 30s for tighter latency, raise toward 60s for
|
|
37
|
+
# deeper hard-host coverage.
|
|
38
|
+
SCRAPER_DEADLINE_S=45
|
|
39
|
+
|
|
40
|
+
# Max headless browsers (patchright ~150MB, Camoufox ~600MB) running at once.
|
|
41
|
+
# 1 matches the sequential design; raise only if the box has RAM headroom and
|
|
42
|
+
# you scrape in parallel.
|
|
43
|
+
SCRAPER_BROWSER_CONCURRENCY=1
|
|
44
|
+
|
|
45
|
+
# ── State ───────────────────────────────────────────────────────────────────
|
|
46
|
+
# Where the botwall DB + JSONL event log live (default: ./state).
|
|
47
|
+
SCRAPER_STATE_DIR=
|
|
48
|
+
|
|
49
|
+
# ── Botwall policy ──────────────────────────────────────────────────────────
|
|
50
|
+
# SCRAPER_BOTWALL_URL_SKIP_AFTER
|
|
51
|
+
# Number of hard failures (botwall hit or short content) on the *same URL* before that
|
|
52
|
+
# URL is excluded. Set to 0 to disable URL-level auto-exclusion.
|
|
53
|
+
SCRAPER_BOTWALL_URL_SKIP_AFTER=2
|
|
54
|
+
|
|
55
|
+
# SCRAPER_BOTWALL_DOMAIN_SKIP_AFTER
|
|
56
|
+
# Number of hard failures across any URLs on a domain before the whole domain is
|
|
57
|
+
# skip-listed. 0 (default) = domains are never auto-skipped; only seeded
|
|
58
|
+
# hard-block domains and manual overrides in the DB are domain-level skips.
|
|
59
|
+
# Set to a positive number (e.g. 10) to re-enable domain-level auto-skip.
|
|
60
|
+
SCRAPER_BOTWALL_DOMAIN_SKIP_AFTER=0
|
|
61
|
+
|
|
62
|
+
# SCRAPER_BOTWALL_COUNT_FIRECRAWL
|
|
63
|
+
# When true, each Firecrawl invocation counts toward the domain failure total
|
|
64
|
+
# (original v1 behaviour). No effect when SCRAPER_BOTWALL_DOMAIN_SKIP_AFTER is 0.
|
|
65
|
+
SCRAPER_BOTWALL_COUNT_FIRECRAWL=false
|
|
66
|
+
|
|
67
|
+
# SCRAPER_BOTWALL_SKIP_URLS_FILE
|
|
68
|
+
# Path to the manual URL skip list (default: config/botwall_skip_urls.txt).
|
|
69
|
+
# Format: one URL per line, optional " # reason" suffix.
|
|
70
|
+
SCRAPER_BOTWALL_SKIP_URLS_FILE=
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. Format loosely follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/); this project uses semantic-ish
|
|
5
|
+
versioning while pre-1.0.
|
|
6
|
+
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
- **Challenge-type learning** — bot-walls are classified by vendor (Cloudflare,
|
|
11
|
+
DataDome, Akamai, PerimeterX, Incapsula, Google) and counted per host in the
|
|
12
|
+
botwall DB; the vendor is attached to each event and OTel span (`scrape.challenge`).
|
|
13
|
+
- **Metrics & reporting** — `switchback.reporting` rolls the event log + botwall DB
|
|
14
|
+
into cost-savings-vs-Firecrawl, coverage, overall/per-tier/per-domain latency
|
|
15
|
+
(mean/median/min/max/p50/p95), outcomes, error codes by domain, and challenges
|
|
16
|
+
by domain. Exposed via `GET /metrics` and `GET /metrics/domains` (both accept
|
|
17
|
+
`?minutes=N`).
|
|
18
|
+
- **Periodic flagging** — `python -m switchback.flags` emits a cron-friendly digest
|
|
19
|
+
(domains stuck on Firecrawl, escalated to egress, most-challenged) to logs/OTel.
|
|
20
|
+
- **Content cache** — optional URL→result cache (`SCRAPER_CONTENT_TTL_S`, sqlite,
|
|
21
|
+
off by default) short-circuits re-scrapes before any tier runs.
|
|
22
|
+
- **Login-session refresh** — `SCRAPER_LOGIN_HOOK` (`pkg.module:func`) refreshes a
|
|
23
|
+
dead logged-in session on demand; cookies overlay every tier and persist.
|
|
24
|
+
- **Exponential backoff** — between-tier backoff with jitter after rate-limit /
|
|
25
|
+
timeout (`SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS`, off by default).
|
|
26
|
+
- **Per-domain extraction prefs** — `config/extraction.json` (CSS scope selector +
|
|
27
|
+
extra drops) applied automatically in the normalize step for every tier.
|
|
28
|
+
- **Session traces** — opt-in Playwright trace capture (`SCRAPER_TRACE_SESSION=1`)
|
|
29
|
+
for browser tiers, with `GET/DELETE /traces` management endpoints.
|
|
30
|
+
|
|
31
|
+
### Changed
|
|
32
|
+
- Tier 2's `cloudscraper` moved from a core dependency (which pinned a git-URL
|
|
33
|
+
fork PyPI can't publish) to the `cloudflare` extra; see the README for installing
|
|
34
|
+
the 3.x Enhanced Edition fork for full stealth.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
Thanks for your interest in improving switchback.
|
|
4
|
+
|
|
5
|
+
## Development setup
|
|
6
|
+
```bash
|
|
7
|
+
python -m venv .venv && . .venv/bin/activate
|
|
8
|
+
pip install -e ".[all]"
|
|
9
|
+
patchright install chromium && camoufox fetch # for the browser tiers
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Architecture
|
|
13
|
+
The engine is a cost-ordered cascade (`switchback/tiers/`) governed by a per-host
|
|
14
|
+
policy (`switchback/policy/`). Start with the cascade runner in
|
|
15
|
+
`switchback/orchestrator.py`.
|
|
16
|
+
|
|
17
|
+
## Guidelines
|
|
18
|
+
- **Keep the core small.** Each tier imports its deps lazily and a missing dep is
|
|
19
|
+
just a tier miss — keep heavy/paid/optional pieces behind extras in
|
|
20
|
+
`pyproject.toml`.
|
|
21
|
+
- **Make new behavior configurable and off-safe.** New features should be gated by
|
|
22
|
+
an env var that defaults to current behavior (see the existing `SCRAPER_*` vars).
|
|
23
|
+
- **Match the surrounding style** — terse, comment-the-why, no speculative
|
|
24
|
+
abstractions.
|
|
25
|
+
- **Don't commit secrets or run artifacts.** `.env`, `state/`, and `*.csv` are
|
|
26
|
+
gitignored; keep it that way.
|
|
27
|
+
|
|
28
|
+
## Tests
|
|
29
|
+
`tests/test_suite.py` exercises the cascade across the anti-bot difficulty
|
|
30
|
+
spectrum (needs network + browser tiers): `python tests/test_suite.py --quick`
|
|
31
|
+
for a fast tier-0/1 pass.
|
|
32
|
+
|
|
33
|
+
## Pull requests
|
|
34
|
+
Keep PRs focused; describe what changed and why, and note any new env var or
|
|
35
|
+
endpoint in the README and CHANGELOG.
|
switchback-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Akash Kodavuru
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
include LICENSE
|
|
2
|
+
include NOTICE
|
|
3
|
+
include README.md
|
|
4
|
+
include CHANGELOG.md
|
|
5
|
+
include CONTRIBUTING.md
|
|
6
|
+
include SECURITY.md
|
|
7
|
+
include .env.example
|
|
8
|
+
include switchback/py.typed
|
|
9
|
+
recursive-include config *.txt *.json
|
|
10
|
+
graft clients
|
|
11
|
+
graft deploy
|
|
12
|
+
prune tests
|
switchback-0.1.0/NOTICE
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
switchback
|
|
2
|
+
Copyright (c) 2026 Akash Kodavuru
|
|
3
|
+
Licensed under the MIT License (see LICENSE).
|
|
4
|
+
|
|
5
|
+
This product depends on third-party components, each under its own license.
|
|
6
|
+
They are installed via pip and are NOT bundled or redistributed in this
|
|
7
|
+
repository — no upstream source or full license texts are vendored here. All
|
|
8
|
+
dependencies use permissive licenses (MIT / BSD-3-Clause / Apache-2.0), which
|
|
9
|
+
are compatible with this project's MIT license.
|
|
10
|
+
|
|
11
|
+
Third-party components
|
|
12
|
+
----------------------
|
|
13
|
+
|
|
14
|
+
MIT
|
|
15
|
+
markdownify HTML -> Markdown normalization
|
|
16
|
+
beautifulsoup4 HTML parsing / normalization
|
|
17
|
+
curl_cffi tier1: plain HTTP with TLS impersonation
|
|
18
|
+
cloudscraper tier2: Cloudflare / anti-bot challenge solver
|
|
19
|
+
camoufox tier3b: Firefox stealth (optional)
|
|
20
|
+
firecrawl-py tier4: paid last-resort scrape API (optional)
|
|
21
|
+
fastapi HTTP service (optional)
|
|
22
|
+
|
|
23
|
+
BSD-3-Clause
|
|
24
|
+
pypdf PDF -> text extraction (tier1 PDFs)
|
|
25
|
+
uvicorn ASGI server for the HTTP service (optional)
|
|
26
|
+
|
|
27
|
+
Apache-2.0
|
|
28
|
+
requests HTTP client
|
|
29
|
+
patchright tier3: stealth headless Chromium (optional)
|
|
30
|
+
opentelemetry-sdk tracing -> any OTLP backend (optional)
|
|
31
|
+
opentelemetry-exporter-otlp-proto-grpc OTLP trace/log export (optional)
|
|
32
|
+
|
|
33
|
+
Full license texts are available with each package distribution (e.g. in its
|
|
34
|
+
`*.dist-info` directory after install) and from each project's repository.
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: switchback
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
|
|
5
|
+
Author-email: Akash Kodavuru <akash@theaklabs.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/akash-kr/switchback
|
|
8
|
+
Project-URL: Repository, https://github.com/akash-kr/switchback
|
|
9
|
+
Project-URL: Issues, https://github.com/akash-kr/switchback/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/akash-kr/switchback/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: scraping,crawler,cloudflare,markdown,cascade,anti-bot,stealth
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
22
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
License-File: NOTICE
|
|
28
|
+
Requires-Dist: markdownify
|
|
29
|
+
Requires-Dist: beautifulsoup4
|
|
30
|
+
Requires-Dist: pypdf
|
|
31
|
+
Requires-Dist: requests
|
|
32
|
+
Requires-Dist: curl_cffi
|
|
33
|
+
Provides-Extra: cloudflare
|
|
34
|
+
Requires-Dist: cloudscraper; extra == "cloudflare"
|
|
35
|
+
Provides-Extra: browser
|
|
36
|
+
Requires-Dist: patchright; extra == "browser"
|
|
37
|
+
Provides-Extra: camoufox
|
|
38
|
+
Requires-Dist: camoufox[geoip]; extra == "camoufox"
|
|
39
|
+
Provides-Extra: firecrawl
|
|
40
|
+
Requires-Dist: firecrawl-py; extra == "firecrawl"
|
|
41
|
+
Provides-Extra: server
|
|
42
|
+
Requires-Dist: fastapi; extra == "server"
|
|
43
|
+
Requires-Dist: uvicorn; extra == "server"
|
|
44
|
+
Provides-Extra: tracing
|
|
45
|
+
Requires-Dist: opentelemetry-sdk; extra == "tracing"
|
|
46
|
+
Requires-Dist: opentelemetry-exporter-otlp-proto-grpc; extra == "tracing"
|
|
47
|
+
Provides-Extra: all
|
|
48
|
+
Requires-Dist: cloudscraper; extra == "all"
|
|
49
|
+
Requires-Dist: patchright; extra == "all"
|
|
50
|
+
Requires-Dist: camoufox[geoip]; extra == "all"
|
|
51
|
+
Requires-Dist: firecrawl-py; extra == "all"
|
|
52
|
+
Requires-Dist: fastapi; extra == "all"
|
|
53
|
+
Requires-Dist: uvicorn; extra == "all"
|
|
54
|
+
Requires-Dist: opentelemetry-sdk; extra == "all"
|
|
55
|
+
Requires-Dist: opentelemetry-exporter-otlp-proto-grpc; extra == "all"
|
|
56
|
+
Provides-Extra: dev
|
|
57
|
+
Requires-Dist: pytest; extra == "dev"
|
|
58
|
+
Dynamic: license-file
|
|
59
|
+
|
|
60
|
+
<!-- switchback -->
|
|
61
|
+
|
|
62
|
+
```
|
|
63
|
+
███████╗██╗ ██╗██╗████████╗ ██████╗██╗ ██╗██████╗ █████╗ ██████╗██╗ ██╗
|
|
64
|
+
██╔════╝██║ ██║██║╚══██╔══╝██╔════╝██║ ██║██╔══██╗██╔══██╗██╔════╝██║ ██╔╝
|
|
65
|
+
███████╗██║ █╗ ██║██║ ██║ ██║ ███████║██████╔╝███████║██║ █████╔╝
|
|
66
|
+
╚════██║██║███╗██║██║ ██║ ██║ ██╔══██║██╔══██╗██╔══██║██║ ██╔═██╗
|
|
67
|
+
███████║╚███╔███╔╝██║ ██║ ╚██████╗██║ ██║██████╔╝██║ ██║╚██████╗██║ ██╗
|
|
68
|
+
╚══════╝ ╚══╝╚══╝ ╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
<div align="center">
|
|
72
|
+
|
|
73
|
+
**One cost-ordered scrape cascade — HTTP → stealth browser → paid — shared by every tool.**
|
|
74
|
+
|
|
75
|
+
Give it a URL; it tries the cheapest way to get clean Markdown first and only escalates
|
|
76
|
+
to a heavier (slower, costlier) tier when the cheap one is walled. Stops at the first success.
|
|
77
|
+
|
|
78
|
+
[PyPI](https://pypi.org/project/switchback/)
|
|
79
|
+
[Python versions](https://pypi.org/project/switchback/)
|
|
80
|
+
[License: MIT](LICENSE)
|
|
81
|
+
[CI](https://github.com/akash-kr/switchback/actions/workflows/ci.yml)
|
|
82
|
+
|
|
83
|
+
</div>
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Why
|
|
88
|
+
|
|
89
|
+
Most scrapers either give up on hard pages or send *everything* through an expensive
|
|
90
|
+
headless browser / paid API. **switchback** orders the methods by cost and walks them
|
|
91
|
+
cheapest-first, per host, learning which tier wins where so the next run starts there.
|
|
92
|
+
The easy majority stays free; only genuinely-walled hosts pay for the heavy tiers.
|
|
93
|
+
|
|
94
|
+
- **Cost-ordered cascade** — free APIs → cheap HTTP → anti-bot solver → stealth browser → paid API.
|
|
95
|
+
- **Per-host memory (botwall)** — remembers the winning tier per host, skip-lists hard blockers, auto-skips hosts stuck on the paid tier.
|
|
96
|
+
- **Cost-scoped residential egress** — routes *only* walled hosts through a residential proxy, never the easy majority.
|
|
97
|
+
- **One shape, three entry points** — Python library, CLI (JSON on stdout), or an HTTP service.
|
|
98
|
+
- **Observable** — every attempt is an OpenTelemetry span; logs ship trace-correlated to any OTLP backend (Jaeger, Tempo, SigNoz).
|
|
99
|
+
- **Runs with any subset installed** — each tier imports its deps lazily; a missing one is just a tier miss.
|
|
100
|
+
|
|
101
|
+
## Quickstart
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
pip install switchback # core: cheap tiers (0/1) + search
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from switchback import scrape
|
|
109
|
+
|
|
110
|
+
for r in scrape(["https://arxiv.org/abs/1706.03762"]):
|
|
111
|
+
print(r.source_method, len(r.markdown))
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
python -m switchback https://example.com/article # JSON on stdout — bridge for any language
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
That's the whole loop. Add tiers as you need them (see [Install](#install)).
|
|
119
|
+
|
|
120
|
+
## The cascade (stop at first success)
|
|
121
|
+
|
|
122
|
+
| Tier | Strategy | Cost |
|
|
123
|
+
|---|---|---|
|
|
124
|
+
| 0 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
|
|
125
|
+
| 1 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
|
|
126
|
+
| 2 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
|
|
127
|
+
| 3 | Stealth headless browser (`patchright`, Chromium) | heavy |
|
|
128
|
+
| 3b | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
|
|
129
|
+
| 3c | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
|
|
130
|
+
| 4 | Firecrawl (paid, env-gated, audited) | paid, last resort |
|
|
131
|
+
|
|
132
|
+
Every URL has a wall-clock budget (`SCRAPER_DEADLINE_S`, default 45s) checked between
|
|
133
|
+
tiers so a single URL can't sit through every tier's timeout back to back. Each tier attempt records
|
|
134
|
+
latency + outcome (`ok` / `short_content` / `rate_limited` / `miss` / `not_applicable`)
|
|
135
|
+
to its span and the botwall event log; the root span carries total latency and the final
|
|
136
|
+
outcome (incl. `deadline_exceeded`).
|
|
137
|
+
|
|
138
|
+
Search (query → URLs) is separate from the scrape cascade: `switchback.search()` /
|
|
139
|
+
`python -m switchback.api --search <query>`, backed by a local SearXNG.
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
## Install
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
pip install switchback # core: normalization + cheap tiers (0/1) + search
|
|
146
|
+
pip install "switchback[cloudflare]" # + Tier 2 Cloudflare/anti-bot solver (cloudscraper)
|
|
147
|
+
pip install "switchback[server]" # + HTTP service (fastapi, uvicorn) incl. /metrics + /traces
|
|
148
|
+
pip install "switchback[browser]" && patchright install chromium # + Tier 3 stealth Chromium
|
|
149
|
+
pip install "switchback[camoufox]" && camoufox fetch # + Tier 3b Firefox stealth
|
|
150
|
+
pip install "switchback[firecrawl]" # + Tier 4 paid API (needs FIRECRAWL_API_KEY)
|
|
151
|
+
pip install "switchback[tracing]" # + OpenTelemetry -> any OTLP backend
|
|
152
|
+
pip install "switchback[all]" # everything
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
For Tier 2's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
|
|
156
|
+
3.x fork (PyPI's `cloudscraper` is the older v1/v2 — PyPI forbids pinning a
|
|
157
|
+
git-URL dep inside a published package, so install it alongside):
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
pip install "cloudscraper @ git+https://github.com/VeNoMouS/cloudscraper@3.0.0"
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
Or run the whole thing as a container:
|
|
164
|
+
`docker build -t switchback . && docker run -p 8799:8799 switchback`.
|
|
165
|
+
|
|
166
|
+
## Use it from your app
|
|
167
|
+
|
|
168
|
+
Three interchangeable entry points — all return the same shape
|
|
169
|
+
(`[{url, source_method, markdown}]`, successes only):
|
|
170
|
+
|
|
171
|
+
**Python library**
|
|
172
|
+
```python
|
|
173
|
+
from switchback import scrape
|
|
174
|
+
for r in scrape(["https://arxiv.org/abs/1706.03762"]):
|
|
175
|
+
print(r.source_method, len(r.markdown))
|
|
176
|
+
|
|
177
|
+
# Need failures + reasons too? scrape_detailed returns a ScrapeOutcome per URL
|
|
178
|
+
# (ok, final_outcome, error_class, status_code, and the per-tier attempts):
|
|
179
|
+
from switchback import scrape_detailed
|
|
180
|
+
for o in scrape_detailed(["https://www.pcmag.com/news"]):
|
|
181
|
+
if not o.ok:
|
|
182
|
+
print(o.url, o.final_outcome, o.error_class, o.status_code)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
**CLI** (JSON on stdout — bridge for any language)
|
|
186
|
+
```bash
|
|
187
|
+
python -m switchback https://example.com/article # or: switchback <url>
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
**HTTP service** (language-agnostic; one warm process keeps the browser pool hot)
|
|
191
|
+
```bash
|
|
192
|
+
switchback-server # listens on :8799
|
|
193
|
+
curl -s localhost:8799/scrape -d '{"urls":["https://example.com"]}'
|
|
194
|
+
curl 'localhost:8799/search?q=web+scraping'
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Non-Python callers: see [clients/node_bridge.md](clients/node_bridge.md). Python
|
|
198
|
+
callers that want HTTP-with-CLI-fallback can drop in
|
|
199
|
+
[clients/python_client.py](clients/python_client.py).
|
|
200
|
+
|
|
201
|
+
## Cost-scoped residential egress
|
|
202
|
+
|
|
203
|
+
The dominant reason hard hosts wall you is the **datacenter IP**, not the
|
|
204
|
+
fingerprint. When a host repeatedly walls the local tiers (a 403/429 or a
|
|
205
|
+
bot-wall page, `SCRAPER_BOTWALL_EGRESS_AFTER` times) it's flagged `needs_egress`
|
|
206
|
+
and the cascade reruns through a **residential proxy** — but only for that host:
|
|
207
|
+
|
|
208
|
+
```bash
|
|
209
|
+
export SCRAPER_EGRESS_PROXY="http://user:pass@p.webshare.io:80"
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
The easy majority that already succeeds free at the datacenter IP stays direct,
|
|
213
|
+
so you never spend (often metered) residential bandwidth on it. Escalation tries
|
|
214
|
+
the cheap HTTP tiers through the proxy first (~0.2MB/page) before the heavier
|
|
215
|
+
browser tiers. [Webshare](https://www.webshare.io/)'s free plan includes ~1GB/mo
|
|
216
|
+
of residential bandwidth — enough for low-volume hard-host recovery at $0. Use
|
|
217
|
+
`SCRAPER_PROXY` instead to force *every* request through a proxy.
|
|
218
|
+
|
|
219
|
+
## Metrics & reporting
|
|
220
|
+
|
|
221
|
+
The engine derives all metrics from its own state files (no external store): the
|
|
222
|
+
botwall event log (one row per tier attempt, incl. the detected challenge vendor)
|
|
223
|
+
and the per-host DB (winning tier, per-vendor `challenge_counts`).
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
curl localhost:8799/metrics # cost savings vs Firecrawl, coverage,
|
|
227
|
+
# overall + per-tier latency, outcomes
|
|
228
|
+
curl localhost:8799/metrics/domains # per-domain: error codes, challenges, latency
|
|
229
|
+
python -m switchback.flags # periodic digest: domains stuck on Firecrawl,
|
|
230
|
+
# escalated to egress, top challenged (cron-friendly)
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
Both endpoints accept `?minutes=N` to window the event-derived sections. The
|
|
234
|
+
**savings** figure compares engine spend (Firecrawl invocations only) against a
|
|
235
|
+
Firecrawl-everything baseline, charging the hard-page credit multiplier
|
|
236
|
+
(`BENCH_FIRECRAWL_HARD_MULT`) for URLs that needed a browser/residential tier or
|
|
237
|
+
hit a challenge — i.e. exactly the ones Firecrawl bills more for.
|
|
238
|
+
|
|
239
|
+
## Configuration
|
|
240
|
+
|
|
241
|
+
All configuration is via environment variables. The engine runs with missing
|
|
242
|
+
pieces: each tier imports its deps lazily and a missing one just counts as a tier
|
|
243
|
+
miss. Tracing no-ops if OTel isn't installed/configured.
|
|
244
|
+
|
|
245
|
+
<details>
|
|
246
|
+
<summary><b>Tracing (optional)</b></summary>
|
|
247
|
+
|
|
248
|
+
```bash
|
|
249
|
+
export OTEL_SERVICE_NAME=switchback
|
|
250
|
+
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
251
|
+
```
|
|
252
|
+
</details>
|
|
253
|
+
|
|
254
|
+
<details>
|
|
255
|
+
<summary><b>Env gates</b> — enable/disable tiers and integrations</summary>
|
|
256
|
+
|
|
257
|
+
- `SCRAPER_DISABLE_FIRECRAWL` — skip Tier 4
|
|
258
|
+
- `FIRECRAWL_API_KEY` — enable Tier 4
|
|
259
|
+
- `SCRAPER_DISABLE_CAMOUFOX` — turn off Tier 3b (on by default; needs `pip install camoufox` + `camoufox fetch`)
|
|
260
|
+
- `BU_CDP_URL` — enable Tier 3c residential browser by pointing at a CDP endpoint
|
|
261
|
+
- `SCRAPER_PROXY` — route *all* tiers/URLs through a proxy
|
|
262
|
+
- `SCRAPER_EGRESS_PROXY` — route only walled hosts through a proxy (see [Cost-scoped residential egress](#cost-scoped-residential-egress))
|
|
263
|
+
- `SEARXNG_URL` — defaults to `http://localhost:8888`
|
|
264
|
+
- `SCRAPER_STATE_DIR` — where the botwall DB/event log + session cache live
|
|
265
|
+
- `SCRAPER_COOKIES_FILE` — Netscape `cookies.txt` to scrape login-gated hosts (injected into the HTTP and browser tiers)
|
|
266
|
+
- `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into Tier 2 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
|
|
267
|
+
</details>
|
|
268
|
+
|
|
269
|
+
<details>
|
|
270
|
+
<summary><b>Tunables</b> — budgets, timeouts, caches, backoff</summary>
|
|
271
|
+
|
|
272
|
+
- `SCRAPER_DEADLINE_S` — per-URL budget (45s)
|
|
273
|
+
- `SCRAPER_CAMOUFOX_TIMEOUT_MS` — Tier 3b (Camoufox) timeout in milliseconds (45000)
|
|
274
|
+
- `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
|
|
275
|
+
- `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
|
|
276
|
+
- `SCRAPER_BOTWALL_EGRESS_AFTER` — local-tier failures before a host escalates to the residential tier (default 2)
|
|
277
|
+
- `SCRAPER_SESSION_TTL_S` — cf_clearance reuse window (1800s)
|
|
278
|
+
- `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
|
|
279
|
+
- `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
|
|
280
|
+
- `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
|
|
281
|
+
- `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
|
|
282
|
+
- `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
|
|
283
|
+
- `SCRAPER_TRACE_SESSION` — opt-in: capture a Playwright trace (screenshots + DOM + network) per browser-tier attempt, written to `state/traces/`
|
|
284
|
+
- `BENCH_FIRECRAWL_USD` / `BENCH_FIRECRAWL_HARD_MULT` — cost model for the savings report
|
|
285
|
+
</details>
|
|
286
|
+
|
|
287
|
+
### Logged-in sessions
|
|
288
|
+
Beyond a static `SCRAPER_COOKIES_FILE`, wire `SCRAPER_LOGIN_HOOK` to a callable
|
|
289
|
+
`func(host) -> {cookie: value}`. When an authenticated host trips a login/bot
|
|
290
|
+
wall, the engine calls the hook once, persists the returned cookies per host, and
|
|
291
|
+
overlays them on every tier (and future runs), then re-runs that URL on a fresh
|
|
292
|
+
budget. The hook owns the site-specific login mechanics; the engine stays generic.
|
|
293
|
+
|
|
294
|
+
### Session traces
|
|
295
|
+
With `SCRAPER_TRACE_SESSION=1`, each browser-tier attempt writes a Playwright
|
|
296
|
+
trace zip to `state/traces/`. Manage them over HTTP — `GET /traces` (list),
|
|
297
|
+
`GET /traces/{id}` (download), `DELETE /traces/{id}` — and open one with
|
|
298
|
+
`playwright show-trace <zip>`. Off by default (traces are MBs each).
|
|
299
|
+
|
|
300
|
+
### Per-domain extraction
|
|
301
|
+
Markdown of the whole page is the default. To scope a site to its content node or
|
|
302
|
+
strip site-specific noise, declare prefs per host in `config/extraction.json`
|
|
303
|
+
(see [config/extraction.example.json](config/extraction.example.json)); every
|
|
304
|
+
tier's normalize step picks them up automatically.
|
|
305
|
+
|
|
306
|
+
## Contributing
|
|
307
|
+
|
|
308
|
+
Issues and PRs welcome — see [CONTRIBUTING.md](CONTRIBUTING.md). Start with the
|
|
309
|
+
cascade runner in `switchback/orchestrator.py`.
|
|
310
|
+
|
|
311
|
+
## Responsible use
|
|
312
|
+
|
|
313
|
+
This engine is for lawful data collection. You are responsible for respecting
|
|
314
|
+
each target site's Terms of Service, `robots.txt`, and rate limits, and for
|
|
315
|
+
having the right to access the content you fetch. The stealth / anti-bot tiers
|
|
316
|
+
(`cloudscraper`, `patchright`, `camoufox`) exist to handle legitimate access
|
|
317
|
+
friction (e.g. generic bot interstitials on public pages) — not to evade access
|
|
318
|
+
controls, paywalls, or authentication you aren't authorized to bypass. The
|
|
319
|
+
software is provided "as is", without warranty (see [LICENSE](LICENSE)).
|
|
320
|
+
|
|
321
|
+
## License
|
|
322
|
+
|
|
323
|
+
MIT — see [LICENSE](LICENSE). Third-party dependencies and their licenses are
|
|
324
|
+
listed in [NOTICE](NOTICE); all are permissive (MIT / BSD-3-Clause / Apache-2.0)
|
|
325
|
+
and compatible with this project's MIT license.
|