switchback 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- switchback/__init__.py +12 -0
- switchback/__main__.py +4 -0
- switchback/api.py +81 -0
- switchback/concurrency.py +37 -0
- switchback/content_cache.py +94 -0
- switchback/egress.py +108 -0
- switchback/extract.py +56 -0
- switchback/flags.py +96 -0
- switchback/normalize.py +81 -0
- switchback/orchestrator.py +343 -0
- switchback/policy/__init__.py +0 -0
- switchback/policy/botwall.py +393 -0
- switchback/policy/gates.py +173 -0
- switchback/py.typed +0 -0
- switchback/reporting.py +236 -0
- switchback/search.py +39 -0
- switchback/server.py +114 -0
- switchback/session_cache.py +274 -0
- switchback/session_trace.py +96 -0
- switchback/tiers/__init__.py +24 -0
- switchback/tiers/_browser.py +50 -0
- switchback/tiers/tier0_apis.py +77 -0
- switchback/tiers/tier1_http.py +65 -0
- switchback/tiers/tier2_cloudscraper.py +135 -0
- switchback/tiers/tier3_browser.py +59 -0
- switchback/tiers/tier3b_camoufox.py +89 -0
- switchback/tiers/tier4_firecrawl.py +48 -0
- switchback/tiers/tier_residential.py +57 -0
- switchback/tracing.py +152 -0
- switchback-0.1.0.dist-info/METADATA +325 -0
- switchback-0.1.0.dist-info/RECORD +36 -0
- switchback-0.1.0.dist-info/WHEEL +5 -0
- switchback-0.1.0.dist-info/entry_points.txt +3 -0
- switchback-0.1.0.dist-info/licenses/LICENSE +21 -0
- switchback-0.1.0.dist-info/licenses/NOTICE +34 -0
- switchback-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Tier (residential egress) — drive a remote browser over CDP.
|
|
2
|
+
|
|
3
|
+
The strongest egress lever we own: connect to a browser running on a *residential*
|
|
4
|
+
IP with a real, full Chrome fingerprint, instead of this box's datacenter IP. That
|
|
5
|
+
is the one thing no local fingerprint trick can fake, and it's the actual reason
|
|
6
|
+
hard Cloudflare / DataDome hosts (and datacenter-blocklisted sites) wall us.
|
|
7
|
+
|
|
8
|
+
Operator-provided seam: export BU_CDP_URL to a CDP endpoint and this tier connects
|
|
9
|
+
to it. Start one with browser-harness:
|
|
10
|
+
|
|
11
|
+
start_remote_daemon("scrape", proxyCountryCode="us") # prints a cdpUrl
|
|
12
|
+
|
|
13
|
+
then export that URL as BU_CDP_URL. Unset → tier is disabled and the cascade
|
|
14
|
+
skips it. It sits after the local browser tiers (only walls that beat them pay
|
|
15
|
+
this cost) and before paid Firecrawl.
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import os
|
|
20
|
+
|
|
21
|
+
from ..concurrency import browser_slot
|
|
22
|
+
from ..normalize import html_to_markdown
|
|
23
|
+
from ..policy.gates import check
|
|
24
|
+
|
|
25
|
+
# Tier identity. NAME is the label handed to browser_slot(); PAID is False
# for this tier.
NAME = "tier_residential"
PAID = False

# Navigation timeout passed to page.goto(), in milliseconds. Overridable via
# the SCRAPER_RESIDENTIAL_TIMEOUT_MS environment variable (default 30000).
_TIMEOUT_MS = int(os.environ.get("SCRAPER_RESIDENTIAL_TIMEOUT_MS", "30000"))
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def disabled() -> bool:
    """Report whether this tier is switched off.

    The tier is off whenever ``BU_CDP_URL`` is unset or empty, i.e. no
    operator has wired a residential CDP endpoint.
    """
    endpoint = os.environ.get("BU_CDP_URL")
    if endpoint:
        return False
    return True
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def fetch(url: str) -> str:
    """Fetch ``url`` through the remote browser behind ``BU_CDP_URL``.

    Connects to the CDP endpoint, loads the page in a fresh tab, converts the
    rendered HTML to Markdown and hands the result to the ``check`` gate.

    Args:
        url: The page to load.

    Returns:
        Whatever ``check(url, markdown)`` returns.

    Raises:
        KeyError: ``BU_CDP_URL`` is not set (presumably callers consult
            ``disabled()`` first — verify against the cascade runner).
    """
    # Imported lazily so the module still imports when patchright is missing.
    from patchright.sync_api import sync_playwright

    cdp = os.environ["BU_CDP_URL"]
    with browser_slot(NAME), sync_playwright() as p:
        browser = p.chromium.connect_over_cdp(cdp)
        try:
            # Reuse the remote browser's existing context when it has one.
            ctx = browser.contexts[0] if browser.contexts else browser.new_context()
            page = ctx.new_page()
            try:
                page.goto(url, timeout=_TIMEOUT_MS, wait_until="domcontentloaded")
                # A very small document may still be rendering; give it a
                # bounded chance to settle. Best-effort: a timeout here is
                # not a failure, we just take whatever has rendered.
                if len(page.content()) < 5000:
                    try:
                        page.wait_for_load_state("networkidle", timeout=8000)
                    except Exception:
                        pass
                html = page.content()
                # Capture the post-redirect URL while the page is still open,
                # rather than reading page state after close().
                final_url = page.url or url
            finally:
                page.close()
            md = html_to_markdown(html, base_url=final_url)
        finally:
            browser.close()
    return check(url, md)
|
switchback/tracing.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""OpenTelemetry wiring → any OTLP backend: traces and logs.
|
|
2
|
+
|
|
3
|
+
Degrades gracefully: if the OTel packages aren't installed the `span()` helper
|
|
4
|
+
becomes a no-op context manager and `setup_logs()` is a no-op, so the engine
|
|
5
|
+
still runs without a backend. Point it at an OTLP backend (Jaeger, Tempo, SigNoz) with:
|
|
6
|
+
|
|
7
|
+
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
8
|
+
export OTEL_SERVICE_NAME=switchback
|
|
9
|
+
|
|
10
|
+
Traces: one trace per scraped URL, one span per tier attempt. Span attributes
|
|
11
|
+
use the keys in `Attr` so backend dashboards stay consistent.
|
|
12
|
+
|
|
13
|
+
Logs: `setup_logs()` routes the stdlib `logging` records to an OTLP backend
|
|
14
|
+
(opt-in — call it from the CLI / app entry point, not from library code, so we
|
|
15
|
+
never hijack a host app's logging). Records emitted inside a span are
|
|
16
|
+
auto-correlated with that trace.
|
|
17
|
+
"""
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import logging
|
|
21
|
+
import os
|
|
22
|
+
import threading
|
|
23
|
+
from contextlib import contextmanager
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Attr:
    """Canonical span-attribute keys, shared so dashboards stay consistent."""

    HOST = "scrape.host"
    TIER = "scrape.tier"

    # Outcome values: ok | short_content | botwall | http_block |
    # rate_limited | timeout | connection | http_error | error |
    # not_applicable | deadline_exceeded | all_failed | *_skipped
    OUTCOME = "scrape.outcome"

    # Normalized failure class (see classify_error).
    ERROR_CLASS = "scrape.error_class"

    # Bot-wall vendor when one was served (cloudflare / datadome / akamai / …).
    CHALLENGE = "scrape.challenge"

    # Upstream HTTP status when known (403/429/…).
    STATUS_CODE = "scrape.status_code"

    MD_LEN = "scrape.md_len"
    SOURCE = "scrape.source_method"
    ERROR = "scrape.error"
    COST_USD = "scrape.cost_usd"

    # Per-tier attempt latency, and the total on the root span.
    LATENCY_MS = "scrape.latency_ms"

    # The per-request budget that was in force.
    DEADLINE_S = "scrape.deadline_s"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Tracer state: None = setup not attempted yet, False = tried and unavailable
# (no-op mode), anything else = the tracer object.
_tracer = None
_init_lock = threading.Lock()


def _init():
    """Return the module's tracer, setting it up on first use.

    Returns:
        The tracer, or ``False`` when setup raised (for example because the
        OTel packages are not importable). Either result is cached in
        ``_tracer``, so setup is attempted at most once.
    """
    global _tracer
    if _tracer is None:
        with _init_lock:
            # Re-check under the lock: another thread may have finished
            # setup while this one was waiting.
            if _tracer is None:
                try:
                    from opentelemetry import trace
                    from opentelemetry.sdk.resources import Resource
                    from opentelemetry.sdk.trace import TracerProvider
                    from opentelemetry.sdk.trace.export import BatchSpanProcessor
                    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

                    service_name = os.getenv("OTEL_SERVICE_NAME", "switchback")
                    resource = Resource.create({"service.name": service_name})
                    tracer_provider = TracerProvider(resource=resource)
                    # Endpoint comes from OTEL_EXPORTER_OTLP_ENDPOINT;
                    # defaults to localhost:4317.
                    exporter = OTLPSpanExporter()
                    tracer_provider.add_span_processor(BatchSpanProcessor(exporter))
                    trace.set_tracer_provider(tracer_provider)
                    _tracer = trace.get_tracer("switchback")
                except Exception:
                    # Tried and unavailable -> no-op mode.
                    _tracer = False
    return _tracer
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# Log-provider state: None = setup not attempted yet, False = tried and
# unavailable, anything else = the installed LoggerProvider.
_log_provider = None


def setup_logs(level: int = logging.INFO) -> bool:
    """Attach an OTLP-exporting handler to the root stdlib logger.

    Opt-in: intended to be called from an app or CLI entry point, not from
    library code. Repeat calls do nothing and return the first result.

    Args:
        level: Threshold for the handler that is added. The loggers' own
            levels are left untouched.

    Returns:
        ``True`` when the handler was installed, ``False`` when setup raised
        (for example because the OTel packages are not importable).
    """
    global _log_provider
    if _log_provider is None:
        try:
            from opentelemetry._logs import set_logger_provider
            from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
            from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
            from opentelemetry.sdk.resources import Resource
            from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter

            service_name = os.getenv("OTEL_SERVICE_NAME", "switchback")
            resource = Resource.create({"service.name": service_name})
            log_provider = LoggerProvider(resource=resource)
            exporter = OTLPLogExporter()
            log_provider.add_log_record_processor(BatchLogRecordProcessor(exporter))
            set_logger_provider(log_provider)
            otlp_handler = LoggingHandler(level=level, logger_provider=log_provider)
            logging.getLogger().addHandler(otlp_handler)
            _log_provider = log_provider
        except Exception:
            # Tried and unavailable -> no-op.
            _log_provider = False
    return bool(_log_provider)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def flush(timeout_ms: int = 5000) -> None:
    """Force-export buffered spans and logs.

    Call at the end of a batch/CLI run so telemetry lands before the process
    exits (the batch processors otherwise flush on their own ~5s timer).
    Does nothing for whichever of tracing and logging was never set up, and
    never raises: errors from either flush are swallowed.

    Args:
        timeout_ms: Passed to each provider's ``force_flush``.
    """
    if _tracer:
        try:
            from opentelemetry import trace as otel_trace

            tracer_provider = otel_trace.get_tracer_provider()
            # Only call force_flush when the active provider exposes it.
            if hasattr(tracer_provider, "force_flush"):
                tracer_provider.force_flush(timeout_ms)
        except Exception:
            pass

    if _log_provider:
        try:
            _log_provider.force_flush(timeout_ms)
        except Exception:
            pass
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@contextmanager
def span(name: str, **attrs):
    """Open a span named ``name`` for the duration of the ``with`` body.

    Keyword arguments become span attributes; ``None`` values are skipped.
    When no tracer is available the body still runs, with a do-nothing
    stand-in in place of a real span.

    Yields:
        A small object with ``.set(key, value)`` so callers can attach the
        outcome/length/error once they know it.
    """
    tracer = _init()
    if tracer:
        with tracer.start_as_current_span(name) as sp:
            for key, value in attrs.items():
                if value is None:
                    continue
                sp.set_attribute(key, value)
            yield _RealSpan(sp)
    else:
        yield _NoopSpan()
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class _NoopSpan:
    """Stand-in yielded by ``span()`` when no tracer is available."""

    def set(self, key, value):
        """Accept and discard an attribute."""
        return None
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class _RealSpan:
    """Thin wrapper that gives a live span the ``.set(key, value)`` interface."""

    def __init__(self, sp):
        self._sp = sp

    def set(self, key, value):
        """Set ``key`` on the wrapped span; ``None`` values are ignored."""
        if value is None:
            return
        self._sp.set_attribute(key, value)
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: switchback
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
|
|
5
|
+
Author-email: Akash Kodavuru <akash@theaklabs.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/akash-kr/switchback
|
|
8
|
+
Project-URL: Repository, https://github.com/akash-kr/switchback
|
|
9
|
+
Project-URL: Issues, https://github.com/akash-kr/switchback/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/akash-kr/switchback/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: scraping,crawler,cloudflare,markdown,cascade,anti-bot,stealth
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
22
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
License-File: NOTICE
|
|
28
|
+
Requires-Dist: markdownify
|
|
29
|
+
Requires-Dist: beautifulsoup4
|
|
30
|
+
Requires-Dist: pypdf
|
|
31
|
+
Requires-Dist: requests
|
|
32
|
+
Requires-Dist: curl_cffi
|
|
33
|
+
Provides-Extra: cloudflare
|
|
34
|
+
Requires-Dist: cloudscraper; extra == "cloudflare"
|
|
35
|
+
Provides-Extra: browser
|
|
36
|
+
Requires-Dist: patchright; extra == "browser"
|
|
37
|
+
Provides-Extra: camoufox
|
|
38
|
+
Requires-Dist: camoufox[geoip]; extra == "camoufox"
|
|
39
|
+
Provides-Extra: firecrawl
|
|
40
|
+
Requires-Dist: firecrawl-py; extra == "firecrawl"
|
|
41
|
+
Provides-Extra: server
|
|
42
|
+
Requires-Dist: fastapi; extra == "server"
|
|
43
|
+
Requires-Dist: uvicorn; extra == "server"
|
|
44
|
+
Provides-Extra: tracing
|
|
45
|
+
Requires-Dist: opentelemetry-sdk; extra == "tracing"
|
|
46
|
+
Requires-Dist: opentelemetry-exporter-otlp-proto-grpc; extra == "tracing"
|
|
47
|
+
Provides-Extra: all
|
|
48
|
+
Requires-Dist: cloudscraper; extra == "all"
|
|
49
|
+
Requires-Dist: patchright; extra == "all"
|
|
50
|
+
Requires-Dist: camoufox[geoip]; extra == "all"
|
|
51
|
+
Requires-Dist: firecrawl-py; extra == "all"
|
|
52
|
+
Requires-Dist: fastapi; extra == "all"
|
|
53
|
+
Requires-Dist: uvicorn; extra == "all"
|
|
54
|
+
Requires-Dist: opentelemetry-sdk; extra == "all"
|
|
55
|
+
Requires-Dist: opentelemetry-exporter-otlp-proto-grpc; extra == "all"
|
|
56
|
+
Provides-Extra: dev
|
|
57
|
+
Requires-Dist: pytest; extra == "dev"
|
|
58
|
+
Dynamic: license-file
|
|
59
|
+
|
|
60
|
+
<!-- switchback -->
|
|
61
|
+
|
|
62
|
+
```
|
|
63
|
+
███████╗██╗ ██╗██╗████████╗ ██████╗██╗ ██╗██████╗ █████╗ ██████╗██╗ ██╗
|
|
64
|
+
██╔════╝██║ ██║██║╚══██╔══╝██╔════╝██║ ██║██╔══██╗██╔══██╗██╔════╝██║ ██╔╝
|
|
65
|
+
███████╗██║ █╗ ██║██║ ██║ ██║ ███████║██████╔╝███████║██║ █████╔╝
|
|
66
|
+
╚════██║██║███╗██║██║ ██║ ██║ ██╔══██║██╔══██╗██╔══██║██║ ██╔═██╗
|
|
67
|
+
███████║╚███╔███╔╝██║ ██║ ╚██████╗██║ ██║██████╔╝██║ ██║╚██████╗██║ ██╗
|
|
68
|
+
╚══════╝ ╚══╝╚══╝ ╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
<div align="center">
|
|
72
|
+
|
|
73
|
+
**One cost-ordered scrape cascade — HTTP → stealth browser → paid — shared by every tool.**
|
|
74
|
+
|
|
75
|
+
Give it a URL; it tries the cheapest way to get clean Markdown first and only escalates
|
|
76
|
+
to a heavier (slower, costlier) tier when the cheap one is walled. Stops at the first success.
|
|
77
|
+
|
|
78
|
+
[PyPI version](https://pypi.org/project/switchback/)
|
|
79
|
+
[Python versions](https://pypi.org/project/switchback/)
|
|
80
|
+
[License: MIT](LICENSE)
|
|
81
|
+
[CI](https://github.com/akash-kr/switchback/actions/workflows/ci.yml)
|
|
82
|
+
|
|
83
|
+
</div>
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Why
|
|
88
|
+
|
|
89
|
+
Most scrapers either give up on hard pages or send *everything* through an expensive
|
|
90
|
+
headless browser / paid API. **switchback** orders the methods by cost and walks them
|
|
91
|
+
cheapest-first, per host, learning which tier wins where so the next run starts there.
|
|
92
|
+
The easy majority stays free; only genuinely-walled hosts pay for the heavy tiers.
|
|
93
|
+
|
|
94
|
+
- **Cost-ordered cascade** — free APIs → cheap HTTP → anti-bot solver → stealth browser → paid API.
|
|
95
|
+
- **Per-host memory (botwall)** — remembers the winning tier per host, skip-lists hard blockers, auto-skips hosts stuck on the paid tier.
|
|
96
|
+
- **Cost-scoped residential egress** — routes *only* walled hosts through a residential proxy, never the easy majority.
|
|
97
|
+
- **One shape, three entry points** — Python library, CLI (JSON on stdout), or an HTTP service.
|
|
98
|
+
- **Observable** — every attempt is an OpenTelemetry span; logs ship trace-correlated to any OTLP backend (Jaeger, Tempo, SigNoz).
|
|
99
|
+
- **Runs with any subset installed** — each tier imports its deps lazily; a missing one is just a tier miss.
|
|
100
|
+
|
|
101
|
+
## Quickstart
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
pip install switchback # core: cheap tiers (0/1) + search
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from switchback import scrape
|
|
109
|
+
|
|
110
|
+
for r in scrape(["https://arxiv.org/abs/1706.03762"]):
|
|
111
|
+
print(r.source_method, len(r.markdown))
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
python -m switchback https://example.com/article # JSON on stdout — bridge for any language
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
That's the whole loop. Add tiers as you need them (see [Install](#install)).
|
|
119
|
+
|
|
120
|
+
## The cascade (stop at first success)
|
|
121
|
+
|
|
122
|
+
| Tier | Strategy | Cost |
|
|
123
|
+
|---|---|---|
|
|
124
|
+
| 0 | Direct APIs / mirrors (arxiv, wikipedia, EuropePMC; extend: job boards) | free, cleanest |
|
|
125
|
+
| 1 | Plain HTTP + TLS impersonation (`curl_cffi`), incl. PDFs | cheap |
|
|
126
|
+
| 2 | Cloudflare / anti-bot solver (`cloudscraper`, install `.[cloudflare]`) | cheap-ish (~5s/host) |
|
|
127
|
+
| 3 | Stealth headless browser (`patchright`, Chromium) | heavy |
|
|
128
|
+
| 3b | Camoufox (Firefox stealth) — **on by default** (opt out: `SCRAPER_DISABLE_CAMOUFOX`) | heavy + slow (~40s on hard CF) |
|
|
129
|
+
| 3c | Residential-IP browser over CDP (`BU_CDP_URL`) — off unless configured | heavy (remote egress) |
|
|
130
|
+
| 4 | Firecrawl (paid, env-gated, audited) | paid, last resort |
|
|
131
|
+
|
|
132
|
+
Every URL has a wall-clock budget (`SCRAPER_DEADLINE_S`, default 45s) checked between
|
|
133
|
+
tiers so one URL can't run the whole cascade of timeouts. Each tier attempt records
|
|
134
|
+
latency + outcome (`ok` / `short_content` / `rate_limited` / `miss` / `not_applicable`)
|
|
135
|
+
to its span and the botwall event log; the root span carries total latency and the final
|
|
136
|
+
outcome (incl. `deadline_exceeded`).
|
|
137
|
+
|
|
138
|
+
Search (query → URLs) is separate from the scrape cascade: `switchback.search()` /
|
|
139
|
+
`python -m switchback.api --search <query>`, backed by a local SearXNG.
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
## Install
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
pip install switchback # core: normalization + cheap tiers (0/1) + search
|
|
146
|
+
pip install "switchback[cloudflare]" # + Tier 2 Cloudflare/anti-bot solver (cloudscraper)
|
|
147
|
+
pip install "switchback[server]" # + HTTP service (fastapi, uvicorn) incl. /metrics + /traces
|
|
148
|
+
pip install "switchback[browser]" && patchright install chromium # + Tier 3 stealth Chromium
|
|
149
|
+
pip install "switchback[camoufox]" && camoufox fetch # + Tier 3b Firefox stealth
|
|
150
|
+
pip install "switchback[firecrawl]" # + Tier 4 paid API (needs FIRECRAWL_API_KEY)
|
|
151
|
+
pip install "switchback[tracing]" # + OpenTelemetry -> any OTLP backend
|
|
152
|
+
pip install "switchback[all]" # everything
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
For Tier 2's **full** v3 JS-VM + Turnstile + stealth, install the Enhanced Edition
|
|
156
|
+
3.x fork (PyPI's `cloudscraper` is the older v1/v2 — PyPI forbids pinning a
|
|
157
|
+
git-URL dep inside a published package, so install it alongside):
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
pip install "cloudscraper @ git+https://github.com/VeNoMouS/cloudscraper@3.0.0"
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
Or run the whole thing as a container:
|
|
164
|
+
`docker build -t switchback . && docker run -p 8799:8799 switchback`.
|
|
165
|
+
|
|
166
|
+
## Use it from your app
|
|
167
|
+
|
|
168
|
+
Three interchangeable entry points — all return the same shape
|
|
169
|
+
(`[{url, source_method, markdown}]`, successes only):
|
|
170
|
+
|
|
171
|
+
**Python library**
|
|
172
|
+
```python
|
|
173
|
+
from switchback import scrape
|
|
174
|
+
for r in scrape(["https://arxiv.org/abs/1706.03762"]):
|
|
175
|
+
print(r.source_method, len(r.markdown))
|
|
176
|
+
|
|
177
|
+
# Need failures + reasons too? scrape_detailed returns a ScrapeOutcome per URL
|
|
178
|
+
# (ok, final_outcome, error_class, status_code, and the per-tier attempts):
|
|
179
|
+
from switchback import scrape_detailed
|
|
180
|
+
for o in scrape_detailed(["https://www.pcmag.com/news"]):
|
|
181
|
+
if not o.ok:
|
|
182
|
+
print(o.url, o.final_outcome, o.error_class, o.status_code)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
**CLI** (JSON on stdout — bridge for any language)
|
|
186
|
+
```bash
|
|
187
|
+
python -m switchback https://example.com/article # or: switchback <url>
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
**HTTP service** (language-agnostic; one warm process keeps the browser pool hot)
|
|
191
|
+
```bash
|
|
192
|
+
switchback-server # listens on :8799
|
|
193
|
+
curl -s localhost:8799/scrape -d '{"urls":["https://example.com"]}'
|
|
194
|
+
curl 'localhost:8799/search?q=web+scraping'
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Non-Python callers: see [clients/node_bridge.md](clients/node_bridge.md). Python
|
|
198
|
+
callers that want HTTP-with-CLI-fallback can drop in
|
|
199
|
+
[clients/python_client.py](clients/python_client.py).
|
|
200
|
+
|
|
201
|
+
## Cost-scoped residential egress
|
|
202
|
+
|
|
203
|
+
The dominant reason hard hosts wall you is the **datacenter IP**, not the
|
|
204
|
+
fingerprint. When a host repeatedly walls the local tiers (a 403/429 or a
|
|
205
|
+
bot-wall page, `SCRAPER_BOTWALL_EGRESS_AFTER` times) it's flagged `needs_egress`
|
|
206
|
+
and the cascade reruns through a **residential proxy** — but only for that host:
|
|
207
|
+
|
|
208
|
+
```bash
|
|
209
|
+
export SCRAPER_EGRESS_PROXY="http://user:pass@p.webshare.io:80"
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
The easy majority that already succeeds free at the datacenter IP stays direct,
|
|
213
|
+
so you never spend (often metered) residential bandwidth on it. Escalation tries
|
|
214
|
+
the cheap HTTP tiers through the proxy first (~0.2MB/page) before the heavier
|
|
215
|
+
browser tiers. [Webshare](https://www.webshare.io/)'s free plan includes ~1GB/mo
|
|
216
|
+
of residential bandwidth — enough for low-volume hard-host recovery at $0. Use
|
|
217
|
+
`SCRAPER_PROXY` instead to force *every* request through a proxy.
|
|
218
|
+
|
|
219
|
+
## Metrics & reporting
|
|
220
|
+
|
|
221
|
+
The engine derives all metrics from its own state files (no external store): the
|
|
222
|
+
botwall event log (one row per tier attempt, incl. the detected challenge vendor)
|
|
223
|
+
and the per-host DB (winning tier, per-vendor `challenge_counts`).
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
curl localhost:8799/metrics # cost savings vs Firecrawl, coverage,
|
|
227
|
+
# overall + per-tier latency, outcomes
|
|
228
|
+
curl localhost:8799/metrics/domains # per-domain: error codes, challenges, latency
|
|
229
|
+
python -m switchback.flags # periodic digest: domains stuck on Firecrawl,
|
|
230
|
+
# escalated to egress, top challenged (cron-friendly)
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
Both endpoints accept `?minutes=N` to window the event-derived sections. The
|
|
234
|
+
**savings** figure compares engine spend (Firecrawl invocations only) against a
|
|
235
|
+
Firecrawl-everything baseline, charging the hard-page credit multiplier
|
|
236
|
+
(`BENCH_FIRECRAWL_HARD_MULT`) for URLs that needed a browser/residential tier or
|
|
237
|
+
hit a challenge — i.e. exactly the ones Firecrawl bills more for.
|
|
238
|
+
|
|
239
|
+
## Configuration
|
|
240
|
+
|
|
241
|
+
All configuration is via environment variables. The engine runs with missing
|
|
242
|
+
pieces: each tier imports its deps lazily and a missing one just counts as a tier
|
|
243
|
+
miss. Tracing no-ops if OTel isn't installed/configured.
|
|
244
|
+
|
|
245
|
+
<details>
|
|
246
|
+
<summary><b>Tracing (optional)</b></summary>
|
|
247
|
+
|
|
248
|
+
```bash
|
|
249
|
+
export OTEL_SERVICE_NAME=switchback
|
|
250
|
+
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
251
|
+
```
|
|
252
|
+
</details>
|
|
253
|
+
|
|
254
|
+
<details>
|
|
255
|
+
<summary><b>Env gates</b> — enable/disable tiers and integrations</summary>
|
|
256
|
+
|
|
257
|
+
- `SCRAPER_DISABLE_FIRECRAWL` — skip Tier 4
|
|
258
|
+
- `FIRECRAWL_API_KEY` — enable Tier 4
|
|
259
|
+
- `SCRAPER_DISABLE_CAMOUFOX` — turn off Tier 3b (on by default; needs `pip install camoufox` + `camoufox fetch`)
|
|
260
|
+
- `BU_CDP_URL` — enable Tier 3c residential browser by pointing at a CDP endpoint
|
|
261
|
+
- `SCRAPER_PROXY` — route *all* tiers/URLs through a proxy
|
|
262
|
+
- `SCRAPER_EGRESS_PROXY` — route only walled hosts through a proxy (see [Cost-scoped residential egress](#cost-scoped-residential-egress))
|
|
263
|
+
- `SEARXNG_URL` — defaults to `http://localhost:8888`
|
|
264
|
+
- `SCRAPER_STATE_DIR` — where the botwall DB/event log + session cache live
|
|
265
|
+
- `SCRAPER_COOKIES_FILE` — Netscape `cookies.txt` to scrape login-gated hosts (injected into the HTTP and browser tiers)
|
|
266
|
+
- `SCRAPER_CAPTCHA_PROVIDER` + `SCRAPER_CAPTCHA_API_KEY` — opt-in, off by default: wire a third-party solver (2captcha/capsolver/capmonster/anticaptcha/deathbycaptcha/9kw) into Tier 2 for Turnstile/reCAPTCHA/hCaptcha on CF hosts. **Paid**, billed per solve by the provider.
|
|
267
|
+
</details>
|
|
268
|
+
|
|
269
|
+
<details>
|
|
270
|
+
<summary><b>Tunables</b> — budgets, timeouts, caches, backoff</summary>
|
|
271
|
+
|
|
272
|
+
- `SCRAPER_DEADLINE_S` — per-URL budget (45s)
|
|
273
|
+
- `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
|
|
274
|
+
- `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
|
|
275
|
+
- `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
|
|
276
|
+
- `SCRAPER_BOTWALL_EGRESS_AFTER` — local-tier failures before a host escalates to the residential tier (default 2)
|
|
277
|
+
- `SCRAPER_SESSION_TTL_S` — cf_clearance reuse window (1800s)
|
|
278
|
+
- `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
|
|
279
|
+
- `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
|
|
280
|
+
- `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
|
|
281
|
+
- `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
|
|
282
|
+
- `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
|
|
283
|
+
- `SCRAPER_TRACE_SESSION` — opt-in: capture a Playwright trace (screenshots + DOM + network) per browser-tier attempt, written to `state/traces/`
|
|
284
|
+
- `BENCH_FIRECRAWL_USD` / `BENCH_FIRECRAWL_HARD_MULT` — cost model for the savings report
|
|
285
|
+
</details>
|
|
286
|
+
|
|
287
|
+
### Logged-in sessions
|
|
288
|
+
Beyond a static `SCRAPER_COOKIES_FILE`, wire `SCRAPER_LOGIN_HOOK` to a callable
|
|
289
|
+
`func(host) -> {cookie: value}`. When an authenticated host trips a login/bot
|
|
290
|
+
wall, the engine calls the hook once, persists the returned cookies per host, and
|
|
291
|
+
overlays them on every tier (and future runs), then re-runs that URL on a fresh
|
|
292
|
+
budget. The hook owns the site-specific login mechanics; the engine stays generic.
|
|
293
|
+
|
|
294
|
+
### Session traces
|
|
295
|
+
With `SCRAPER_TRACE_SESSION=1`, each browser-tier attempt writes a Playwright
|
|
296
|
+
trace zip to `state/traces/`. Manage them over HTTP — `GET /traces` (list),
|
|
297
|
+
`GET /traces/{id}` (download), `DELETE /traces/{id}` — and open one with
|
|
298
|
+
`playwright show-trace <zip>`. Off by default (traces are MBs each).
|
|
299
|
+
|
|
300
|
+
### Per-domain extraction
|
|
301
|
+
Markdown of the whole page is the default. To scope a site to its content node or
|
|
302
|
+
strip site-specific noise, declare prefs per host in `config/extraction.json`
|
|
303
|
+
(see [config/extraction.example.json](config/extraction.example.json)); every
|
|
304
|
+
tier's normalize step picks them up automatically.
|
|
305
|
+
|
|
306
|
+
## Contributing
|
|
307
|
+
|
|
308
|
+
Issues and PRs welcome — see [CONTRIBUTING.md](CONTRIBUTING.md). Start with the
|
|
309
|
+
cascade runner in `switchback/orchestrator.py`.
|
|
310
|
+
|
|
311
|
+
## Responsible use
|
|
312
|
+
|
|
313
|
+
This engine is for lawful data collection. You are responsible for respecting
|
|
314
|
+
each target site's Terms of Service, `robots.txt`, and rate limits, and for
|
|
315
|
+
having the right to access the content you fetch. The stealth / anti-bot tiers
|
|
316
|
+
(`cloudscraper`, `patchright`, `camoufox`) exist to handle legitimate access
|
|
317
|
+
friction (e.g. generic bot interstitials on public pages) — not to evade access
|
|
318
|
+
controls, paywalls, or authentication you aren't authorized to bypass. The
|
|
319
|
+
software is provided "as is", without warranty (see [LICENSE](LICENSE)).
|
|
320
|
+
|
|
321
|
+
## License
|
|
322
|
+
|
|
323
|
+
MIT — see [LICENSE](LICENSE). Third-party dependencies and their licenses are
|
|
324
|
+
listed in [NOTICE](NOTICE); all are permissive (MIT / BSD-3-Clause / Apache-2.0)
|
|
325
|
+
and compatible with this project's MIT license.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
switchback/__init__.py,sha256=a3WwmcxW3YLBDVMPn5xho8sh-VBIUJC34r6zREpMeKk,506
|
|
2
|
+
switchback/__main__.py,sha256=FNhL9UYwA2BpXukEaEzFSbc5H-FHKDb390n6yUYM6mU,97
|
|
3
|
+
switchback/api.py,sha256=h9kx1o3q1vvxTpr5C-0b3lhORCdk3M_84NreyL3be2A,2973
|
|
4
|
+
switchback/concurrency.py,sha256=FYOjHVa9gMpydYK72Nm5t27lmNBRg_Eh507blHKjLzk,1348
|
|
5
|
+
switchback/content_cache.py,sha256=1Svp4PNFGXnywtrJzT4ptKorB9JlpDw9onmXtWPfKpc,3264
|
|
6
|
+
switchback/egress.py,sha256=_3fqqc8TSw9kuoH8o1YzDST9Tv-tbdEGjHGxTPVFWy0,4211
|
|
7
|
+
switchback/extract.py,sha256=dKWj4vHGewHb9PqBhjw-NDOo-aweLItGC1NWltD4-dY,1825
|
|
8
|
+
switchback/flags.py,sha256=R08P2RzRfijdkhvAN-JxqdsxZZZW9STqSLMRfIbLPZI,4012
|
|
9
|
+
switchback/normalize.py,sha256=hbpTmsVgCGiuJtSoV-SpFcZYiH4ndnF_wb_hwzBTKtM,3266
|
|
10
|
+
switchback/orchestrator.py,sha256=DAzBZBoB7CkSrklUWmzowVWaFxZQ-dot3uJN57cpnPw,15915
|
|
11
|
+
switchback/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
switchback/reporting.py,sha256=9do6pqreH7bHpGyQKKu9TqLrxPxR4bbrMlMb2GYB2V4,9218
|
|
13
|
+
switchback/search.py,sha256=Fg2bz2tPvPM_YOCCQE-u-WScvCyNtFdXsmAbvqvraoE,1352
|
|
14
|
+
switchback/server.py,sha256=8pdFg4pWBFQoExMUzEZUcKl5bOTqvEUwMzIt5-r0QgY,3766
|
|
15
|
+
switchback/session_cache.py,sha256=wBvMMLOUw8hriu4N7VbvXo10k6-cdKJ8CKDuMVoCKjQ,9986
|
|
16
|
+
switchback/session_trace.py,sha256=jJTmEyd7iZVAaDgIFyk4vu2mPAI8iWSH_pLjRUOfxg4,3332
|
|
17
|
+
switchback/tracing.py,sha256=5uTpUfYl8a2gIiaNg94Pk8COK143kBxeA1zKJoei73U,5772
|
|
18
|
+
switchback/policy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
+
switchback/policy/botwall.py,sha256=uZsoerKM0YsC_y_KtyckIufYeqIZhW2c2TMKhLgWc4g,16187
|
|
20
|
+
switchback/policy/gates.py,sha256=z7eeSWdb1Bqdjcz1qHrUtZ4mIgX8-Hms2Mvk9AEP30I,7103
|
|
21
|
+
switchback/tiers/__init__.py,sha256=seqE3NDnc71XVnf3DuklRx1KfqQDVzudPwxofCflHkY,851
|
|
22
|
+
switchback/tiers/_browser.py,sha256=4kaWxh8wkegyKsnE2pwe9LLv7xJG-eSxq5Gjwz53DFI,2195
|
|
23
|
+
switchback/tiers/tier0_apis.py,sha256=dAwTf_qkdYj1rCNVdH5Qn93JwAoGU_VhMT9YryYFTc8,3086
|
|
24
|
+
switchback/tiers/tier1_http.py,sha256=bpDrOY0lUen2sS9sgF3l_oObkcogasKpuYk-OJSNxto,2902
|
|
25
|
+
switchback/tiers/tier2_cloudscraper.py,sha256=weNXrZj61d5iATf7JVfjL5qG9Q-lsup8SSV9fsQoGe8,5522
|
|
26
|
+
switchback/tiers/tier3_browser.py,sha256=oYG_pDvvG79AwDdyhn1a1EV5NnDZfMp1hYgFO6eDZAY,2531
|
|
27
|
+
switchback/tiers/tier3b_camoufox.py,sha256=UFTIeUQ36r7ixQSjqtswbf9ddMfM9xv2fmKEB2tKs4o,3515
|
|
28
|
+
switchback/tiers/tier4_firecrawl.py,sha256=YZ02zx4Adv8xXALJw2C2XgzZAs4r6wJWMDmWeOlngPQ,1556
|
|
29
|
+
switchback/tiers/tier_residential.py,sha256=IlR0d-OYKYWqMOMp0ZLrliIVRsPXn2wCvWgSls-DIZ4,2097
|
|
30
|
+
switchback-0.1.0.dist-info/licenses/LICENSE,sha256=b8V82Q_eJ8JgOap-Zg7JexHZKRF-g1k6ZPfGn2FJXf8,1071
|
|
31
|
+
switchback-0.1.0.dist-info/licenses/NOTICE,sha256=wloHWGC3lw_iV6GM2AfaIOhBQmoTGWF1j3Gin8YNKrM,1461
|
|
32
|
+
switchback-0.1.0.dist-info/METADATA,sha256=JpEJURvlQYbmbrYFYWIUDaJFXN1ZlQP0JjJD-LfSQbM,16400
|
|
33
|
+
switchback-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
34
|
+
switchback-0.1.0.dist-info/entry_points.txt,sha256=Y51hpCqJxN5MaIhTkkBxE6LBs8N6CoIEF3bX58wqbh0,95
|
|
35
|
+
switchback-0.1.0.dist-info/top_level.txt,sha256=ttbyWWHmZeKuLw0aWB5AGSm4DBRjjJXNruNtJzeuSUk,11
|
|
36
|
+
switchback-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Akash Kodavuru
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
switchback
|
|
2
|
+
Copyright (c) 2026 Akash Kodavuru
|
|
3
|
+
Licensed under the MIT License (see LICENSE).
|
|
4
|
+
|
|
5
|
+
This product depends on third-party components, each under its own license.
|
|
6
|
+
They are installed via pip and are NOT bundled or redistributed in this
|
|
7
|
+
repository — no upstream source or full license texts are vendored here. All
|
|
8
|
+
dependencies use permissive licenses (MIT / BSD-3-Clause / Apache-2.0), which
|
|
9
|
+
are compatible with this project's MIT license.
|
|
10
|
+
|
|
11
|
+
Third-party components
|
|
12
|
+
----------------------
|
|
13
|
+
|
|
14
|
+
MIT
|
|
15
|
+
markdownify                             HTML -> Markdown normalization
|
|
16
|
+
beautifulsoup4                          HTML parsing / normalization
|
|
17
|
+
curl_cffi                               tier1: plain HTTP with TLS impersonation
|
|
18
|
+
cloudscraper                            tier2: Cloudflare / anti-bot challenge solver
|
|
19
|
+
camoufox                                tier3b: Firefox stealth (optional)
|
|
20
|
+
firecrawl-py                            tier4: paid last-resort scrape API (optional)
|
|
21
|
+
fastapi                                 HTTP service (optional)
|
|
22
|
+
|
|
23
|
+
BSD-3-Clause
|
|
24
|
+
pypdf                                   PDF -> text extraction (tier1 PDFs)
|
|
25
|
+
uvicorn                                 ASGI server for the HTTP service (optional)
|
|
26
|
+
|
|
27
|
+
Apache-2.0
|
|
28
|
+
requests                                HTTP client
|
|
29
|
+
patchright                              tier3: stealth headless Chromium (optional)
|
|
30
|
+
opentelemetry-sdk                       tracing -> any OTLP backend (optional)
|
|
31
|
+
opentelemetry-exporter-otlp-proto-grpc  OTLP trace/log export (optional)
|
|
32
|
+
|
|
33
|
+
Full license texts are available with each package distribution (e.g. in its
|
|
34
|
+
`*.dist-info` directory after install) and from each project's repository.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
switchback
|