switchback 0.2.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {switchback-0.2.0 → switchback-0.4.0}/.env.example +7 -0
- switchback-0.4.0/CHANGELOG.md +94 -0
- {switchback-0.2.0 → switchback-0.4.0}/PKG-INFO +32 -1
- {switchback-0.2.0 → switchback-0.4.0}/README.md +31 -0
- {switchback-0.2.0 → switchback-0.4.0}/pyproject.toml +1 -1
- {switchback-0.2.0 → switchback-0.4.0}/switchback/api.py +5 -0
- switchback-0.4.0/switchback/doctor.py +59 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/orchestrator.py +170 -61
- {switchback-0.2.0 → switchback-0.4.0}/switchback/policy/gates.py +59 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/tiers/tier2_cloudscraper.py +33 -1
- {switchback-0.2.0 → switchback-0.4.0}/switchback/tiers/tier3_browser.py +41 -3
- {switchback-0.2.0 → switchback-0.4.0}/switchback.egg-info/PKG-INFO +32 -1
- {switchback-0.2.0 → switchback-0.4.0}/switchback.egg-info/SOURCES.txt +1 -0
- switchback-0.2.0/CHANGELOG.md +0 -46
- {switchback-0.2.0 → switchback-0.4.0}/CONTRIBUTING.md +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/LICENSE +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/MANIFEST.in +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/NOTICE +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/SECURITY.md +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/clients/node_bridge.md +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/clients/python_client.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/config/botwall_skip_urls.txt +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/config/extraction.example.json +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/setup.cfg +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/__init__.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/__main__.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/concurrency.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/content_cache.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/egress.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/extract.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/flags.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/normalize.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/policy/__init__.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/policy/botwall.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/py.typed +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/reporting.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/search.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/server.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/session_cache.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/session_trace.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/tiers/__init__.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/tiers/_browser.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/tiers/tier0_apis.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/tiers/tier1_http.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/tiers/tier3b_camoufox.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/tiers/tier4_firecrawl.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/tiers/tier_residential.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback/tracing.py +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback.egg-info/dependency_links.txt +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback.egg-info/entry_points.txt +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback.egg-info/requires.txt +0 -0
- {switchback-0.2.0 → switchback-0.4.0}/switchback.egg-info/top_level.txt +0 -0
|
@@ -27,6 +27,13 @@ SEARXNG_URL=http://localhost:8888
|
|
|
27
27
|
# fall back to their text for those sources.
|
|
28
28
|
SCRAPER_OUTPUT_FORMAT=markdown
|
|
29
29
|
|
|
30
|
+
# ── Tier 2 · Cloudflare solver (cloudscraper) ───────────────────────────────
|
|
31
|
+
# Needs the 3.x Enhanced Edition fork (see README); with the frozen PyPI build
|
|
32
|
+
# the tier reports `unavailable` and fails fast. Wall-clock cap on a single solve
|
|
33
|
+
# so an unsolvable challenge can't eat the per-URL deadline before the browser
|
|
34
|
+
# tier runs. Lower (e.g. 12) if Tier 2 rarely wins on your hosts.
|
|
35
|
+
SCRAPER_CLOUDSCRAPER_TIMEOUT_S=25
|
|
36
|
+
|
|
30
37
|
# ── Tier 2.5 · Jina Reader (r.jina.ai) ──────────────────────────────────────
|
|
31
38
|
# Optional: keyless works at 20 RPM. A key gives 500 RPM + a 10M-token grant.
|
|
32
39
|
JINA_API_KEY=
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. Format loosely follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/); this project uses semantic-ish
|
|
5
|
+
versioning while pre-1.0.
|
|
6
|
+
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
## [0.4.0] - 2026-06-29
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- **Configurable per-tier retries** — a tier can now re-attempt before falling
|
|
13
|
+
through to the next, more capable one. `SCRAPER_TIER_RETRIES` (global, default
|
|
14
|
+
`0` = off; `N` → up to `1+N` tries per tier), per-tier overrides
|
|
15
|
+
`SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER3_BROWSER=2`), and
|
|
16
|
+
`SCRAPER_TIER_RETRY_ON` (retryable failure classes; default
|
|
17
|
+
`timeout,rate_limited,connection` — widen to include `botwall,http_block` behind
|
|
18
|
+
a rotating residential proxy, where each retry gets a fresh IP). Retries stay
|
|
19
|
+
bounded by `SCRAPER_DEADLINE_S`, and intermediate retries are traced/logged but
|
|
20
|
+
**not** persisted to the botwall policy DB, so they never inflate the
|
|
21
|
+
self-healing skip / `needs_egress` counters. Default `0` keeps behaviour
|
|
22
|
+
unchanged. Enabling retries on the paid Firecrawl tier bills per attempt.
|
|
23
|
+
|
|
24
|
+
### Fixed
|
|
25
|
+
- **Quality gate rejects content shells** — the gate no longer passes a page just
|
|
26
|
+
because it clears the length floor; thin "shell" pages (nav/boilerplate with no
|
|
27
|
+
real article body) are now treated as a tier miss so the cascade falls through.
|
|
28
|
+
- **Paid last-resort budget reserve** — `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S`
|
|
29
|
+
(default 25s) stops starting local tiers once enough of the per-URL deadline has
|
|
30
|
+
elapsed and an enabled paid tier is still ahead, so a hard host can't burn the
|
|
31
|
+
whole budget before Firecrawl gets a turn.
|
|
32
|
+
|
|
33
|
+
## [0.3.0] - 2026-06-27
|
|
34
|
+
|
|
35
|
+
### Added
|
|
36
|
+
- **`unavailable` tier outcome** — when a tier's optional dependency is missing,
|
|
37
|
+
the wrong version, or not installed yet (frozen PyPI `cloudscraper` instead of
|
|
38
|
+
the 3.x stealth fork; patchright's Chromium not downloaded during an async
|
|
39
|
+
cold-start install), the tier now fails fast (~0ms) with a distinct
|
|
40
|
+
`unavailable` outcome carrying the exact install command, logged once per tier.
|
|
41
|
+
It ranks above bot-wall in the verdict, so an environment problem is no longer
|
|
42
|
+
masked as `botwall` — and a missing Tier 2 dependency no longer burns the
|
|
43
|
+
per-URL solve budget before the browser tier runs.
|
|
44
|
+
- **`switchback --doctor`** — preflight tier-readiness check (doubles as a
|
|
45
|
+
healthcheck: exit 0 when the capable tiers are ready). Reports whether
|
|
46
|
+
cloudscraper is the stealth-capable 3.x fork, patchright + Chromium are
|
|
47
|
+
installed, Camoufox/Node are present, and Firecrawl is configured. Built for
|
|
48
|
+
cold-start deploys where the browser is installed by a background thread after
|
|
49
|
+
boot.
|
|
50
|
+
|
|
51
|
+
### Docs
|
|
52
|
+
- README **Production / cold-start deployment** section and a `.env.example`
|
|
53
|
+
Tier 2 block: install `patchright install chromium` in the post-boot step, the
|
|
54
|
+
cloudscraper 3.x fork requirement, Node.js for Tier 2 concurrency, and the
|
|
55
|
+
`SCRAPER_CLOUDSCRAPER_TIMEOUT_S` budget knob.
|
|
56
|
+
|
|
57
|
+
## [0.2.0] - 2026-06-25
|
|
58
|
+
|
|
59
|
+
### Added
|
|
60
|
+
- **Selectable output formats** — `SCRAPER_OUTPUT_FORMAT` (or per-call
|
|
61
|
+
`scrape(fmt=...)`, CLI `--format`, `/scrape` `{"format": ...}`) selects the
|
|
62
|
+
content shape: `markdown` (default, unchanged), `markdown_trimmed` (extra
|
|
63
|
+
ad/nav/boilerplate removed), `html` (raw), or `html_selectors` (cleaned HTML
|
|
64
|
+
with per-domain `drop`/`selector` applied). Default output is byte-identical;
|
|
65
|
+
html-family results use a `html` JSON key instead of `markdown`.
|
|
66
|
+
|
|
67
|
+
## [0.1.0] - 2026-06-23
|
|
68
|
+
|
|
69
|
+
### Added
|
|
70
|
+
- **Challenge-type learning** — bot-walls are classified by vendor (Cloudflare,
|
|
71
|
+
DataDome, Akamai, PerimeterX, Incapsula, Google) and counted per host in the
|
|
72
|
+
botwall DB; the vendor is attached to each event and OTel span (`scrape.challenge`).
|
|
73
|
+
- **Metrics & reporting** — `switchback.reporting` rolls the event log + botwall DB
|
|
74
|
+
into cost-savings-vs-Firecrawl, coverage, overall/per-tier/per-domain latency
|
|
75
|
+
(mean/median/min/max/p50/p95), outcomes, error codes by domain, and challenges
|
|
76
|
+
by domain. Exposed via `GET /metrics` and `GET /metrics/domains` (both accept
|
|
77
|
+
`?minutes=N`).
|
|
78
|
+
- **Periodic flagging** — `python -m switchback.flags` emits a cron-friendly digest
|
|
79
|
+
(domains stuck on Firecrawl, escalated to egress, most-challenged) to logs/OTel.
|
|
80
|
+
- **Content cache** — optional URL→result cache (`SCRAPER_CONTENT_TTL_S`, sqlite,
|
|
81
|
+
off by default) short-circuits re-scrapes before any tier runs.
|
|
82
|
+
- **Login-session refresh** — `SCRAPER_LOGIN_HOOK` (`pkg.module:func`) refreshes a
|
|
83
|
+
dead logged-in session on demand; cookies overlay every tier and persist.
|
|
84
|
+
- **Exponential backoff** — between-tier backoff with jitter after rate-limit /
|
|
85
|
+
timeout (`SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS`, off by default).
|
|
86
|
+
- **Per-domain extraction prefs** — `config/extraction.json` (CSS scope selector +
|
|
87
|
+
extra drops) applied automatically in the normalize step for every tier.
|
|
88
|
+
- **Session traces** — opt-in Playwright trace capture (`SCRAPER_TRACE_SESSION=1`)
|
|
89
|
+
for browser tiers, with `GET/DELETE /traces` management endpoints.
|
|
90
|
+
|
|
91
|
+
### Changed
|
|
92
|
+
- Tier 2's `cloudscraper` moved from a core dependency (which pinned a git-URL
|
|
93
|
+
fork PyPI can't publish) to the `cloudflare` extra; see the README for installing
|
|
94
|
+
the 3.x Enhanced Edition fork for full stealth.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: switchback
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
|
|
5
5
|
Author-email: Akash Kodavuru <akash@theaklabs.com>
|
|
6
6
|
License: MIT
|
|
@@ -163,6 +163,34 @@ pip install "cloudscraper @ git+https://github.com/VeNoMouS/cloudscraper@3.0.0"
|
|
|
163
163
|
Or run the whole thing as a container:
|
|
164
164
|
`docker build -t switchback . && docker run -p 8799:8799 switchback`.
|
|
165
165
|
|
|
166
|
+
### Production / cold-start deployment
|
|
167
|
+
|
|
168
|
+
The two heavy tiers pull dependencies that often can't be baked into a base image
|
|
169
|
+
and land *after* boot (e.g. an async install thread on Azure). Until they're
|
|
170
|
+
ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
|
|
171
|
+
fix) and the cascade falls through — they are never silently skipped. Checklist:
|
|
172
|
+
|
|
173
|
+
- **Tier 3 is the real workhorse for Cloudflare/JS sites** — make sure its browser
|
|
174
|
+
is installed: `patchright install chromium` (note: **patchright**, not vanilla
|
|
175
|
+
`playwright`). On a cold start, run this in your post-boot install step/thread;
|
|
176
|
+
Tier 3 flips to ready once it finishes.
|
|
177
|
+
- **Tier 2 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
|
|
178
|
+
frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
|
|
179
|
+
solve budget) instead of erroring mid-cascade. Tier 2 is a *weak* solver for
|
|
180
|
+
modern Cloudflare — treat it as a cheap try before the browser, not the primary.
|
|
181
|
+
- **Install Node.js** for Tier 2's v3 JS-VM challenges — faster and thread-safe
|
|
182
|
+
vs. the pure-Python js2py fallback (relevant under concurrent load).
|
|
183
|
+
- **Bound Tier 2's solve budget** with `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` (default
|
|
184
|
+
`25`) so an unsolvable challenge can't eat the per-URL deadline before the
|
|
185
|
+
browser tier runs. Lower it (e.g. `12`) if Tier 2 rarely wins on your hosts.
|
|
186
|
+
|
|
187
|
+
**Verify readiness on the box** with the preflight check (doubles as a healthcheck
|
|
188
|
+
— exit 0 when the capable tiers are ready):
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
switchback --doctor # or: python -m switchback --doctor
|
|
192
|
+
```
|
|
193
|
+
|
|
166
194
|
## Use it from your app
|
|
167
195
|
|
|
168
196
|
Three interchangeable entry points — all return the same shape
|
|
@@ -271,6 +299,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
271
299
|
|
|
272
300
|
- `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
|
|
273
301
|
- `SCRAPER_DEADLINE_S` — per-URL budget (45s)
|
|
302
|
+
- `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
|
|
274
303
|
- `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
|
|
275
304
|
- `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
|
|
276
305
|
- `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
|
|
@@ -279,6 +308,8 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
279
308
|
- `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
|
|
280
309
|
- `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
|
|
281
310
|
- `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
|
|
311
|
+
- `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER3_BROWSER=2`)
|
|
312
|
+
- `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
|
|
282
313
|
- `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
|
|
283
314
|
- `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
|
|
284
315
|
- `SCRAPER_TRACE_SESSION` — opt-in: capture a Playwright trace (screenshots + DOM + network) per browser-tier attempt, written to `state/traces/`
|
|
@@ -104,6 +104,34 @@ pip install "cloudscraper @ git+https://github.com/VeNoMouS/cloudscraper@3.0.0"
|
|
|
104
104
|
Or run the whole thing as a container:
|
|
105
105
|
`docker build -t switchback . && docker run -p 8799:8799 switchback`.
|
|
106
106
|
|
|
107
|
+
### Production / cold-start deployment
|
|
108
|
+
|
|
109
|
+
The two heavy tiers pull dependencies that often can't be baked into a base image
|
|
110
|
+
and land *after* boot (e.g. an async install thread on Azure). Until they're
|
|
111
|
+
ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
|
|
112
|
+
fix) and the cascade falls through — they are never silently skipped. Checklist:
|
|
113
|
+
|
|
114
|
+
- **Tier 3 is the real workhorse for Cloudflare/JS sites** — make sure its browser
|
|
115
|
+
is installed: `patchright install chromium` (note: **patchright**, not vanilla
|
|
116
|
+
`playwright`). On a cold start, run this in your post-boot install step/thread;
|
|
117
|
+
Tier 3 flips to ready once it finishes.
|
|
118
|
+
- **Tier 2 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
|
|
119
|
+
frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
|
|
120
|
+
solve budget) instead of erroring mid-cascade. Tier 2 is a *weak* solver for
|
|
121
|
+
modern Cloudflare — treat it as a cheap try before the browser, not the primary.
|
|
122
|
+
- **Install Node.js** for Tier 2's v3 JS-VM challenges — faster and thread-safe
|
|
123
|
+
vs. the pure-Python js2py fallback (relevant under concurrent load).
|
|
124
|
+
- **Bound Tier 2's solve budget** with `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` (default
|
|
125
|
+
`25`) so an unsolvable challenge can't eat the per-URL deadline before the
|
|
126
|
+
browser tier runs. Lower it (e.g. `12`) if Tier 2 rarely wins on your hosts.
|
|
127
|
+
|
|
128
|
+
**Verify readiness on the box** with the preflight check (doubles as a healthcheck
|
|
129
|
+
— exit 0 when the capable tiers are ready):
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
switchback --doctor # or: python -m switchback --doctor
|
|
133
|
+
```
|
|
134
|
+
|
|
107
135
|
## Use it from your app
|
|
108
136
|
|
|
109
137
|
Three interchangeable entry points — all return the same shape
|
|
@@ -212,6 +240,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
212
240
|
|
|
213
241
|
- `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
|
|
214
242
|
- `SCRAPER_DEADLINE_S` — per-URL budget (45s)
|
|
243
|
+
- `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
|
|
215
244
|
- `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
|
|
216
245
|
- `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
|
|
217
246
|
- `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
|
|
@@ -220,6 +249,8 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
220
249
|
- `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
|
|
221
250
|
- `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
|
|
222
251
|
- `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
|
|
252
|
+
- `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER3_BROWSER=2`)
|
|
253
|
+
- `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
|
|
223
254
|
- `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
|
|
224
255
|
- `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
|
|
225
256
|
- `SCRAPER_TRACE_SESSION` — opt-in: capture a Playwright trace (screenshots + DOM + network) per browser-tier attempt, written to `state/traces/`
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "switchback"
|
|
7
|
-
version = "0.2.0"
|
|
7
|
+
version = "0.4.0"
|
|
8
8
|
description = "One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -55,6 +55,7 @@ def _main() -> int:
|
|
|
55
55
|
_os.environ[_k] = _v.strip()
|
|
56
56
|
usage = ("usage: switchback [--format FMT] <url> [<url> ...]\n"
|
|
57
57
|
" switchback --search <query ...>\n"
|
|
58
|
+
" switchback --doctor\n"
|
|
58
59
|
" (or: python -m switchback <url> ...)\n"
|
|
59
60
|
" FMT: markdown (default) | markdown_trimmed | html | html_selectors")
|
|
60
61
|
# --help/-h is an explicit request: usage to stdout, exit 0 (don't treat it
|
|
@@ -62,6 +63,10 @@ def _main() -> int:
|
|
|
62
63
|
if any(a in ("--help", "-h") for a in sys.argv[1:]):
|
|
63
64
|
print(usage)
|
|
64
65
|
return 0
|
|
66
|
+
# --doctor: preflight tier-readiness report (no scrape). Side-effect-free.
|
|
67
|
+
if "--doctor" in sys.argv[1:]:
|
|
68
|
+
from .doctor import report
|
|
69
|
+
return report()
|
|
65
70
|
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
|
66
71
|
setup_logs() # also ship logs to the OTLP backend when configured
|
|
67
72
|
if len(sys.argv) < 2:
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Preflight readiness check — `switchback --doctor`.
|
|
2
|
+
|
|
3
|
+
Reports which tiers can actually run on this box and, when one can't, the exact
|
|
4
|
+
fix. Built for cold-start deploys (e.g. Azure) where the stealth browser is
|
|
5
|
+
installed by a background thread *after* boot: run this to confirm the tiers are
|
|
6
|
+
live before sending traffic, or to see why Tier 2/3 aren't catching anything.
|
|
7
|
+
|
|
8
|
+
Exit code: 0 if both capable local tiers (cloudscraper + browser) are ready,
|
|
9
|
+
else 1 — so it doubles as a healthcheck.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
import shutil
|
|
15
|
+
|
|
16
|
+
from .tiers import tier2_cloudscraper, tier3_browser
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _camoufox() -> tuple[bool, str]:
|
|
20
|
+
if os.getenv("SCRAPER_DISABLE_CAMOUFOX"):
|
|
21
|
+
return False, "off (SCRAPER_DISABLE_CAMOUFOX set)"
|
|
22
|
+
try:
|
|
23
|
+
import camoufox # noqa: F401
|
|
24
|
+
except ImportError:
|
|
25
|
+
return False, 'not installed — pip install "switchback[camoufox]" && camoufox fetch'
|
|
26
|
+
return True, "camoufox installed"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def probe() -> list[tuple[str, bool, str]]:
|
|
30
|
+
"""(label, ok, detail) for each tier/dependency that matters at runtime."""
|
|
31
|
+
cs_ok, cs_detail = tier2_cloudscraper.available()
|
|
32
|
+
br_ok, br_detail = tier3_browser.available()
|
|
33
|
+
node = shutil.which("node")
|
|
34
|
+
return [
|
|
35
|
+
("tier2_cloudscraper", cs_ok, cs_detail),
|
|
36
|
+
("tier3_browser", br_ok, br_detail),
|
|
37
|
+
("tier3b_camoufox", *_camoufox()),
|
|
38
|
+
("node (tier2 v3 concurrency)", bool(node),
|
|
39
|
+
node or "not on PATH — Tier 2 falls back to slower, thread-fragile js2py"),
|
|
40
|
+
("tier4_firecrawl", bool(os.getenv("FIRECRAWL_API_KEY")),
|
|
41
|
+
"FIRECRAWL_API_KEY set" if os.getenv("FIRECRAWL_API_KEY")
|
|
42
|
+
else "off (no FIRECRAWL_API_KEY)"),
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def report() -> int:
|
|
47
|
+
rows = probe()
|
|
48
|
+
print("switchback doctor — tier readiness\n")
|
|
49
|
+
for label, ok, detail in rows:
|
|
50
|
+
mark = "OK " if ok else "MISS"
|
|
51
|
+
print(f" [{mark}] {label:30} {detail}")
|
|
52
|
+
cs_ok = rows[0][1]
|
|
53
|
+
br_ok = rows[1][1]
|
|
54
|
+
if cs_ok and br_ok:
|
|
55
|
+
print("\nCapable tiers ready.")
|
|
56
|
+
return 0
|
|
57
|
+
print("\nOne or more capable tiers are unavailable (see above). On a cold "
|
|
58
|
+
"start this may resolve once the async install thread finishes.")
|
|
59
|
+
return 1
|
|
@@ -19,7 +19,8 @@ from dataclasses import dataclass, field
|
|
|
19
19
|
from . import content_cache, egress, session_cache
|
|
20
20
|
from .normalize import active_format, output_format_scope
|
|
21
21
|
from .policy import botwall
|
|
22
|
-
from .policy.gates import BotWall, RateLimited, ShortContent, classify_error, host_of
|
|
22
|
+
from .policy.gates import (BotWall, RateLimited, ShortContent, Unavailable,
|
|
23
|
+
classify_error, host_of)
|
|
23
24
|
from .tiers import TIERS, INDEX
|
|
24
25
|
from .tracing import Attr, flush, span
|
|
25
26
|
|
|
@@ -31,6 +32,15 @@ logger = logging.getLogger(__name__)
|
|
|
31
32
|
# tiers fail fast, while still bounding the worst case.
|
|
32
33
|
_DEADLINE_S = float(os.getenv("SCRAPER_DEADLINE_S", "45"))
|
|
33
34
|
|
|
35
|
+
# Fall back to Firecrawl after this many seconds on a URL. On a hard host the
|
|
36
|
+
# cheaper tiers can burn the whole deadline (e.g. cloudscraper's ~25s timeout +
|
|
37
|
+
# two browser solves), so the cascade would hit the deadline and quit *before*
|
|
38
|
+
# ever trying the one tier that reliably works. Once this much time has elapsed,
|
|
39
|
+
# we stop starting more local tiers and jump straight to Firecrawl — so the
|
|
40
|
+
# safety net actually gets a turn. Default 25s leaves ~20s of the 45s deadline
|
|
41
|
+
# for Firecrawl. Only applies when a paid, enabled tier is still ahead; 0 = off.
|
|
42
|
+
_FIRECRAWL_FALLBACK_AFTER_S = float(os.getenv("SCRAPER_FIRECRAWL_FALLBACK_AFTER_S", "25"))
|
|
43
|
+
|
|
34
44
|
# Exponential backoff between tiers after a *transient* failure (rate_limited /
|
|
35
45
|
# timeout) — gives a rate limiter or a slow origin a moment before the next tier
|
|
36
46
|
# hammers it. Disabled by default (base 0) so behaviour is unchanged until opted
|
|
@@ -50,14 +60,43 @@ def _maybe_backoff(transient_n: int, deadline: float) -> None:
|
|
|
50
60
|
return
|
|
51
61
|
time.sleep(delay)
|
|
52
62
|
|
|
63
|
+
# Configurable same-tier retries. A failing tier normally falls straight through
|
|
64
|
+
# to the next, more capable one; these let a tier re-attempt first. Off by default
|
|
65
|
+
# (0 retries) so behaviour is unchanged until opted in. Read at call time (not
|
|
66
|
+
# import) so a caller/test can set the env per run.
|
|
67
|
+
# SCRAPER_TIER_RETRIES global extra attempts per tier (N → 1+N tries)
|
|
68
|
+
# SCRAPER_TIER_RETRIES_<TIER> per-tier override, <TIER> = uppercased NAME
|
|
69
|
+
# SCRAPER_TIER_RETRY_ON failure classes eligible for a retry
|
|
70
|
+
_DEFAULT_RETRY_ON = "timeout,rate_limited,connection"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _retries_for(name: str) -> int:
|
|
74
|
+
"""Extra attempts for a tier: its per-tier override, else the global default."""
|
|
75
|
+
raw = os.getenv(f"SCRAPER_TIER_RETRIES_{name.upper()}",
|
|
76
|
+
os.getenv("SCRAPER_TIER_RETRIES", "0"))
|
|
77
|
+
try:
|
|
78
|
+
return max(0, int(raw))
|
|
79
|
+
except ValueError:
|
|
80
|
+
logger.warning(f"invalid retry count {raw!r} for {name}; using 0")
|
|
81
|
+
return 0
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _retryable_outcomes() -> set[str]:
|
|
85
|
+
raw = os.getenv("SCRAPER_TIER_RETRY_ON", _DEFAULT_RETRY_ON)
|
|
86
|
+
return {o.strip() for o in raw.split(",") if o.strip()}
|
|
87
|
+
|
|
53
88
|
# Per-attempt outcomes that aren't real failures (don't carry a failure reason).
|
|
54
|
-
_NON_FAILURE = ("ok", "not_applicable", "disabled")
|
|
89
|
+
_NON_FAILURE = ("ok", "not_applicable", "disabled", "skipped_for_budget")
|
|
55
90
|
|
|
56
91
|
# How explanatory each failure class is, for picking the reason that best
|
|
57
92
|
# describes why a URL failed. A real wall (403 / bot-wall) outranks a trailing
|
|
58
93
|
# config error (e.g. Firecrawl with no API key → "error"), so the verdict points
|
|
59
94
|
# at the actual blocker rather than the last thing that happened to throw.
|
|
60
95
|
_FAILURE_PRIORITY = {
|
|
96
|
+
# A missing/old/not-yet-installed tier dependency is an operator-fixable
|
|
97
|
+
# environment problem; rank it above site walls so it surfaces as the verdict
|
|
98
|
+
# instead of being masked as "botwall" when the capable tiers can't run.
|
|
99
|
+
"unavailable": 6,
|
|
61
100
|
"botwall": 5, "http_block": 5,
|
|
62
101
|
"rate_limited": 4, "short_content": 4,
|
|
63
102
|
"timeout": 3, "connection": 3,
|
|
@@ -65,6 +104,10 @@ _FAILURE_PRIORITY = {
|
|
|
65
104
|
"error": 1,
|
|
66
105
|
}
|
|
67
106
|
|
|
107
|
+
# Tiers whose dependency we've already warned about this process — the install
|
|
108
|
+
# hint is logged once at WARNING, not per-URL across a whole batch.
|
|
109
|
+
_unavail_warned: set[str] = set()
|
|
110
|
+
|
|
68
111
|
|
|
69
112
|
@dataclass
|
|
70
113
|
class ScrapeResult:
|
|
@@ -139,11 +182,17 @@ def _start_index(url: str, db: dict) -> int:
|
|
|
139
182
|
|
|
140
183
|
|
|
141
184
|
def _record_failure(sp, attempts, db, url, tier_name, outcome, exc, status, dt,
|
|
142
|
-
challenge=None):
|
|
185
|
+
challenge=None, persist=True):
|
|
143
186
|
"""Annotate the span, persist to botwall, and append the attempt — for one
|
|
144
187
|
failed tier attempt. Shared by every except branch so classification,
|
|
145
188
|
tracing, and the event log never drift apart. `challenge` names the bot-wall
|
|
146
|
-
vendor when one was served, so the policy can learn it per host."""
|
|
189
|
+
vendor when one was served, so the policy can learn it per host.
|
|
190
|
+
|
|
191
|
+
`persist=False` is for an intermediate same-tier retry: the attempt is still
|
|
192
|
+
traced, logged, and appended (so retries are observable), but it does NOT
|
|
193
|
+
touch the policy DB — otherwise a single URL's retries would inflate the
|
|
194
|
+
self-healing failure counters (URL skip / needs_egress) and over-count misses.
|
|
195
|
+
Only the final per-tier outcome persists."""
|
|
147
196
|
msg = f"{type(exc).__name__}: {exc}"
|
|
148
197
|
sp.set(Attr.OUTCOME, outcome)
|
|
149
198
|
sp.set(Attr.ERROR, msg)
|
|
@@ -151,10 +200,13 @@ def _record_failure(sp, attempts, db, url, tier_name, outcome, exc, status, dt,
|
|
|
151
200
|
sp.set(Attr.CHALLENGE, challenge)
|
|
152
201
|
sp.set(Attr.STATUS_CODE, status)
|
|
153
202
|
sp.set(Attr.LATENCY_MS, dt)
|
|
154
|
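-
botwall.record(db, url, tier_name, outcome, error=msg, latency_ms=dt,
|
|
155
|
-
status_code=status, challenge=challenge)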
|
|
203
|
+
if persist:
|
|
204
|
+
botwall.record(db, url, tier_name, outcome, error=msg, latency_ms=dt,
|
|
205
|
+
status_code=status, challenge=challenge)
|
|
156
206
|
# A wall on a host we had a cached cf_clearance for means the cookie is stale
|
|
157
|
-
# or IP-mismatched: drop it so the next attempt re-solves instead of replaying.
|
|
207
|
+
# or IP-mismatched: drop it so the next attempt (a same-tier retry or the next
|
|
208
|
+
# tier) re-solves instead of replaying. Safe on intermediate retries too — it's
|
|
209
|
+
# a cache drop, not a policy counter.
|
|
158
210
|
if outcome in ("botwall", "http_block"):
|
|
159
211
|
session_cache.forget(url)
|
|
160
212
|
attempts.append(TierAttempt(tier_name, outcome, msg, status, dt))
|
|
@@ -214,6 +266,17 @@ def _run_one(url: str, db: dict) -> ScrapeOutcome:
|
|
|
214
266
|
return res
|
|
215
267
|
|
|
216
268
|
|
|
269
|
+
def _enabled_paid_ahead(i: int) -> bool:
|
|
270
|
+
"""Is there a paid, currently-enabled tier after index i? (i.e. a last-resort
|
|
271
|
+
worth reserving budget for.)"""
|
|
272
|
+
for tier in TIERS[i + 1:]:
|
|
273
|
+
if getattr(tier, "PAID", False):
|
|
274
|
+
disabled_fn = getattr(tier, "disabled", None)
|
|
275
|
+
if not (disabled_fn and disabled_fn()):
|
|
276
|
+
return True
|
|
277
|
+
return False
|
|
278
|
+
|
|
279
|
+
|
|
217
280
|
def _run_cascade(url, host, db, root, t0, deadline) -> ScrapeOutcome:
|
|
218
281
|
attempts: list[TierAttempt] = []
|
|
219
282
|
transient = 0 # count of rate_limited/timeout misses so far (drives backoff)
|
|
@@ -241,8 +304,25 @@ def _run_cascade(url, host, db, root, t0, deadline) -> ScrapeOutcome:
|
|
|
241
304
|
attempts.append(TierAttempt(tier.NAME, "not_applicable"))
|
|
242
305
|
continue
|
|
243
306
|
|
|
244
|
-
#
|
|
245
|
-
|
|
307
|
+
# Fall back to Firecrawl: once enough time has elapsed on this URL and a
|
|
308
|
+
# paid enabled tier is still ahead, skip this (non-paid) tier and any
|
|
309
|
+
# others so the paid tier actually gets a turn instead of the cascade
|
|
310
|
+
# dying on the deadline mid-browser-solve.
|
|
311
|
+
if (_FIRECRAWL_FALLBACK_AFTER_S and not getattr(tier, "PAID", False)
|
|
312
|
+
and (time.monotonic() - t0) >= _FIRECRAWL_FALLBACK_AFTER_S
|
|
313
|
+
and _enabled_paid_ahead(i)):
|
|
314
|
+
logger.info(
|
|
315
|
+
f"{tier.NAME} skipped after {_FIRECRAWL_FALLBACK_AFTER_S}s to "
|
|
316
|
+
f"fall back to Firecrawl (last resort): {url}")
|
|
317
|
+
attempts.append(TierAttempt(tier.NAME, "skipped_for_budget"))
|
|
318
|
+
continue
|
|
319
|
+
|
|
320
|
+
# Limit: stop before starting another tier if we're out of budget. The
|
|
321
|
+
# paid last resort is exempt — if the cascade reached it, let it run even
|
|
322
|
+
# a touch over the deadline rather than quit with nothing (it has its own
|
|
323
|
+
# internal timeout). Non-paid tiers with a paid tier ahead were already
|
|
324
|
+
# skipped above, so this only ever quits when no paid tier remains.
|
|
325
|
+
if time.monotonic() >= deadline and not getattr(tier, "PAID", False):
|
|
246
326
|
total = int((time.monotonic() - t0) * 1000)
|
|
247
327
|
ec, sc = _dominant_failure(attempts)
|
|
248
328
|
root.set(Attr.OUTCOME, "deadline_exceeded")
|
|
@@ -259,60 +339,89 @@ def _run_cascade(url, host, db, root, t0, deadline) -> ScrapeOutcome:
|
|
|
259
339
|
status_code=sc, latency_ms=total, attempts=attempts)
|
|
260
340
|
|
|
261
341
|
paid = getattr(tier, "PAID", False)
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
342
|
+
retries = _retries_for(tier.NAME)
|
|
343
|
+
retryable = _retryable_outcomes()
|
|
344
|
+
# Same-tier retry loop: 1 base attempt + N configured retries. A retryable
|
|
345
|
+
# failure with budget left re-attempts this tier; anything else (or the
|
|
346
|
+
# last attempt) falls through to the next tier. Each attempt is its own span.
|
|
347
|
+
for attempt in range(retries + 1):
|
|
348
|
+
with span(tier.NAME, **{Attr.HOST: host, Attr.TIER: tier.NAME}) as sp:
|
|
349
|
+
if paid:
|
|
350
|
+
# Count every invocation so the host can be promoted to skip
|
|
351
|
+
# (and to reflect real per-attempt spend on retries).
|
|
352
|
+
botwall.record(db, url, tier.NAME, "firecrawl_used")
|
|
353
|
+
ts = time.monotonic()
|
|
354
|
+
outcome = exc = status = challenge = unavailable_exc = None
|
|
355
|
+
try:
|
|
356
|
+
md = tier.fetch(url)
|
|
357
|
+
except BotWall as e:
|
|
358
|
+
outcome, exc, challenge = "botwall", e, getattr(e, "vendor", None)
|
|
359
|
+
except ShortContent as e:
|
|
360
|
+
outcome, exc = "short_content", e
|
|
361
|
+
except RateLimited as e:
|
|
362
|
+
outcome, exc, status = "rate_limited", e, 429
|
|
363
|
+
except Unavailable as e:
|
|
364
|
+
outcome, unavailable_exc = "unavailable", e
|
|
365
|
+
except Exception as e:
|
|
366
|
+
exc = e
|
|
367
|
+
outcome, status = classify_error(e)
|
|
285
368
|
dt = int((time.monotonic() - ts) * 1000)
|
|
286
|
-
error_class, status = classify_error(e)
|
|
287
|
-
_record_failure(sp, attempts, db, url, tier.NAME, error_class, e, status, dt)
|
|
288
|
-
if error_class in _TRANSIENT:
|
|
289
|
-
transient += 1
|
|
290
|
-
_maybe_backoff(transient, deadline)
|
|
291
|
-
continue
|
|
292
|
-
|
|
293
|
-
dt = int((time.monotonic() - ts) * 1000)
|
|
294
|
-
if md is None: # tier not applicable (e.g. no API mirror)
|
|
295
|
-
sp.set(Attr.OUTCOME, "not_applicable")
|
|
296
|
-
sp.set(Attr.LATENCY_MS, dt)
|
|
297
|
-
attempts.append(TierAttempt(tier.NAME, "not_applicable", latency_ms=dt))
|
|
298
|
-
continue
|
|
299
369
|
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
370
|
+
if outcome == "unavailable":
|
|
371
|
+
# Tier dependency missing/old/not-installed-yet. An environment
|
|
372
|
+
# problem, not a host trait — never retried and never taught to
|
|
373
|
+
# botwall; warn once per tier with the exact fix instead of
|
|
374
|
+
# spamming every URL.
|
|
375
|
+
sp.set(Attr.OUTCOME, "unavailable")
|
|
376
|
+
sp.set(Attr.ERROR, str(unavailable_exc))
|
|
377
|
+
sp.set(Attr.ERROR_CLASS, "unavailable")
|
|
378
|
+
sp.set(Attr.LATENCY_MS, dt)
|
|
379
|
+
attempts.append(TierAttempt(tier.NAME, "unavailable",
|
|
380
|
+
str(unavailable_exc), None, dt))
|
|
381
|
+
if tier.NAME not in _unavail_warned:
|
|
382
|
+
_unavail_warned.add(tier.NAME)
|
|
383
|
+
logger.warning(f"{tier.NAME} unavailable: {unavailable_exc}")
|
|
384
|
+
break # fall through to the next tier
|
|
385
|
+
|
|
386
|
+
if outcome is not None: # this attempt failed
|
|
387
|
+
# Retry the same tier only if attempts remain, the failure is
|
|
388
|
+
# retryable, and there's budget left for another shot.
|
|
389
|
+
do_retry = (attempt < retries and outcome in retryable
|
|
390
|
+
and time.monotonic() < deadline)
|
|
391
|
+
# Intermediate retries are traced/logged but not persisted to
|
|
392
|
+
# the policy DB — only the final per-tier outcome counts.
|
|
393
|
+
_record_failure(sp, attempts, db, url, tier.NAME, outcome, exc,
|
|
394
|
+
status, dt, challenge=challenge, persist=not do_retry)
|
|
395
|
+
if do_retry:
|
|
396
|
+
_maybe_backoff(attempt + 1, deadline) # space the retry
|
|
397
|
+
continue
|
|
398
|
+
if outcome in _TRANSIENT:
|
|
399
|
+
transient += 1
|
|
400
|
+
_maybe_backoff(transient, deadline)
|
|
401
|
+
break # fall through to the next tier
|
|
402
|
+
|
|
403
|
+
if md is None: # tier not applicable (e.g. no API mirror)
|
|
404
|
+
sp.set(Attr.OUTCOME, "not_applicable")
|
|
405
|
+
sp.set(Attr.LATENCY_MS, dt)
|
|
406
|
+
attempts.append(TierAttempt(tier.NAME, "not_applicable", latency_ms=dt))
|
|
407
|
+
break
|
|
408
|
+
|
|
409
|
+
total = int((time.monotonic() - t0) * 1000)
|
|
410
|
+
sp.set(Attr.OUTCOME, "ok")
|
|
411
|
+
sp.set(Attr.MD_LEN, len(md))
|
|
412
|
+
sp.set(Attr.SOURCE, tier.NAME)
|
|
413
|
+
sp.set(Attr.LATENCY_MS, dt)
|
|
414
|
+
botwall.record(db, url, tier.NAME, "ok", md_len=len(md), latency_ms=dt)
|
|
415
|
+
content_cache.put(url, md, tier.NAME, active_format())
|
|
416
|
+
root.set(Attr.OUTCOME, "ok")
|
|
417
|
+
root.set(Attr.SOURCE, tier.NAME)
|
|
418
|
+
root.set(Attr.LATENCY_MS, total)
|
|
419
|
+
attempts.append(TierAttempt(tier.NAME, "ok", latency_ms=dt))
|
|
420
|
+
logger.info(
|
|
421
|
+
f"{tier.NAME} OK {url} md_len={len(md)} {dt}ms (total {total}ms)")
|
|
422
|
+
return ScrapeOutcome(url, True, markdown=md, source_method=tier.NAME,
|
|
423
|
+
final_outcome="ok", latency_ms=total,
|
|
424
|
+
format=active_format(), attempts=attempts)
|
|
316
425
|
|
|
317
426
|
total = int((time.monotonic() - t0) * 1000)
|
|
318
427
|
ec, sc = _dominant_failure(attempts)
|
|
@@ -140,6 +140,50 @@ def classify_error(exc: BaseException) -> tuple[str, int | None]:
|
|
|
140
140
|
return "error", status
|
|
141
141
|
|
|
142
142
|
|
|
143
|
+
# A page can clear the length gate yet carry no article: a media page whose body
|
|
144
|
+
# never rendered (headline + "Loading video…") or a nav/listing shell that is
|
|
145
|
+
# almost all links. Length alone can't tell "1600 chars of nav links" from "1600
|
|
146
|
+
# chars of prose", so these high-precision checks reject the shell. A false
|
|
147
|
+
# positive (rejecting a real article) is worse than missing an exotic shell, so
|
|
148
|
+
# the thresholds are deliberately conservative — validated to reject NONE of a
|
|
149
|
+
# 90-URL real-content sample while catching the unrendered-media / nav-shell
|
|
150
|
+
# cases that otherwise pass as false-positive "successes".
|
|
151
|
+
# Substrings looked up in the lowercased head of the page; any hit marks the page
# as an unrendered media placeholder. Entries must therefore be lowercase.
_PLACEHOLDER_HEAD_MARKERS = (
    "loading video",  # video page whose player never hydrated (headline only)
)
# A page is rejected as a nav/listing shell only when BOTH conditions hold:
# link density strictly above _NAV_SHELL_LINK_DENSITY and non-link text strictly
# shorter than _NAV_SHELL_MAX_TEXT.
_NAV_SHELL_LINK_DENSITY = 0.65  # words-inside-links / total words, above which …
_NAV_SHELL_MAX_TEXT = 600  # … and with this few chars of real text, it's a shell
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _link_density(md: str) -> float:
|
|
159
|
+
"""Fraction of words that live inside markdown links — a nav/listing shell is
|
|
160
|
+
nearly all links; an article is mostly prose."""
|
|
161
|
+
words = md.split()
|
|
162
|
+
if not words:
|
|
163
|
+
return 1.0
|
|
164
|
+
link_words = sum(len(m.split()) for m in re.findall(r"\[([^\]]+)\]\(", md))
|
|
165
|
+
return link_words / len(words)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _nonlink_text_len(md: str) -> int:
|
|
169
|
+
"""Chars of real text once markdown links, URLs and formatting are stripped."""
|
|
170
|
+
t = re.sub(r"\[[^\]]*\]\([^)]*\)", "", md) # [text](url)
|
|
171
|
+
t = re.sub(r"https?://\S+", "", t)
|
|
172
|
+
t = re.sub(r"[#*>`|!\-]", " ", t)
|
|
173
|
+
return len(re.sub(r"\s+", " ", t).strip())
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _content_shell_reason(md: str) -> str | None:
    """Explain why `md` is not an article despite passing the length gate.

    Returns "unrendered media placeholder" when a placeholder marker appears in
    the lowercased head of the page, "nav/listing shell (mostly links)" when the
    page is link-dense and carries little non-link text, and None otherwise."""
    lowered_head = md[:_BOTWALL_HEAD_CHARS].lower()
    for marker in _PLACEHOLDER_HEAD_MARKERS:
        if marker in lowered_head:
            return "unrendered media placeholder"
    # Both conditions must hold for a shell; bail out as soon as one fails.
    if _link_density(md) <= _NAV_SHELL_LINK_DENSITY:
        return None
    if _nonlink_text_len(md) >= _NAV_SHELL_MAX_TEXT:
        return None
    return "nav/listing shell (mostly links)"
|
|
185
|
+
|
|
186
|
+
|
|
143
187
|
def check(url: str, md: str | None) -> str:
|
|
144
188
|
"""Return md if it clears the gates, else raise BotWall / ShortContent."""
|
|
145
189
|
vendor = classify_botwall(md)
|
|
@@ -149,6 +193,11 @@ def check(url: str, md: str | None) -> str:
|
|
|
149
193
|
n = len(md) if md else 0
|
|
150
194
|
if n < gate:
|
|
151
195
|
raise ShortContent(f"body too short: {n} < {gate}")
|
|
196
|
+
# Length cleared, but is it actually content? Reject shells/placeholders so a
|
|
197
|
+
# tier falls through instead of returning a confident false-positive success.
|
|
198
|
+
shell = _content_shell_reason(md or "")
|
|
199
|
+
if shell:
|
|
200
|
+
raise ShortContent(f"no article content: {shell}")
|
|
152
201
|
return md
|
|
153
202
|
|
|
154
203
|
|
|
@@ -156,6 +205,16 @@ class ShortContent(RuntimeError):
|
|
|
156
205
|
"""Content fetched but below the quality gate — treated as a tier miss."""
|
|
157
206
|
|
|
158
207
|
|
|
208
|
+
class Unavailable(RuntimeError):
    """Raised when a tier cannot run at all because of the local environment.

    Typical causes: an optional dependency is missing, is the wrong version
    (e.g. cloudscraper 1.2.71 instead of the 3.x stealth fork), or has not
    finished installing yet (e.g. patchright's Chromium still downloading
    during an async cold-start install).

    This is not a tier *failure*: the tier never attempted the URL. It is
    reported as its own `unavailable` outcome so an environment problem is not
    mistaken for a generic error or a site bot-wall. The exception message
    carries the exact fix (e.g. `patchright install chromium`)."""
|
|
216
|
+
|
|
217
|
+
|
|
159
218
|
class BotWall(RuntimeError):
|
|
160
219
|
"""Content fetched but it's a bot-wall / block interstitial (e.g. Cloudflare
|
|
161
220
|
"Just a moment...") rather than the real page — treated as a tier miss so the
|
|
@@ -22,13 +22,40 @@ import threading
|
|
|
22
22
|
from .. import egress, session_cache
|
|
23
23
|
from ..egress import requests_proxies
|
|
24
24
|
from ..normalize import html_to_markdown
|
|
25
|
-
from ..policy.gates import check
|
|
25
|
+
from ..policy.gates import Unavailable, check
|
|
26
26
|
|
|
27
27
|
logger = logging.getLogger(__name__)
|
|
28
28
|
|
|
29
29
|
NAME = "tier2_cloudscraper"
|
|
30
30
|
PAID = False
|
|
31
31
|
|
|
32
|
+
# Install hint surfaced when cloudscraper is missing or the frozen PyPI 1.2.71
|
|
33
|
+
# (v1/v2, no stealth) instead of the 3.x Enhanced Edition this tier needs.
|
|
34
|
+
_INSTALL_HINT = ('pip install "cloudscraper @ '
|
|
35
|
+
'git+https://github.com/VeNoMouS/cloudscraper@3.0.0"')
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def available() -> tuple[bool, str]:
    """Check that cloudscraper can be imported and is the stealth-capable 3.x fork.

    Returns an `(ok, detail)` pair. Consumed by `fetch`, so the tier fails fast
    with a clear reason instead of wasting the solve budget, and by
    `switchback doctor`.

    The decision is made on the major version alone: the Enhanced Edition fork
    this tier needs is 3.x, while PyPI is frozen at 1.2.71 (v1/v2, no stealth,
    rejects `enable_stealth`). A missing or unparseable version counts as 0."""
    try:
        import cloudscraper
    except ImportError:
        return False, f"cloudscraper not installed — {_INSTALL_HINT}"

    version = getattr(cloudscraper, "__version__", "0")
    try:
        major = int(version.split(".")[0])
    except (ValueError, AttributeError):
        # Non-numeric or non-string version: treat as too old.
        major = 0

    if major >= 3:
        return True, f"cloudscraper {version}"
    return False, (f"cloudscraper {version} has no stealth support (frozen PyPI "
                   f"v1/v2) — {_INSTALL_HINT}")
|
|
58
|
+
|
|
32
59
|
# Wall-clock cap on the whole solve. cloudscraper 3.x *attempts* interactive
|
|
33
60
|
# Turnstile and can loop for minutes on a challenge it can't clear — far past the
|
|
34
61
|
# per-request socket timeout. Capping it here lets the cascade fall through to the
|
|
@@ -78,6 +105,11 @@ def _interpreter_opts() -> dict:
|
|
|
78
105
|
|
|
79
106
|
|
|
80
107
|
def _make_scraper():
|
|
108
|
+
ok, detail = available()
|
|
109
|
+
if not ok:
|
|
110
|
+
# Fail fast (~0ms) with the exact fix, rather than wasting the solve
|
|
111
|
+
# budget or surfacing a cryptic TypeError mid-cascade.
|
|
112
|
+
raise Unavailable(detail)
|
|
81
113
|
import cloudscraper
|
|
82
114
|
# enable_stealth / auto_refresh_on_403 are on by default in 3.x; we pass the
|
|
83
115
|
# stealth tuning explicitly. No UA override: cloudscraper derives a UA (and
|
|
@@ -10,21 +10,59 @@ auth-walled pages (set BU_CDP_URL).
|
|
|
10
10
|
"""
|
|
11
11
|
from __future__ import annotations
|
|
12
12
|
|
|
13
|
+
import os
|
|
14
|
+
|
|
13
15
|
from . import _browser
|
|
14
16
|
from .. import session_cache, session_trace
|
|
15
17
|
from ..concurrency import browser_slot
|
|
16
18
|
from ..egress import playwright_proxy, add_wire_bytes
|
|
17
19
|
from ..normalize import html_to_markdown
|
|
18
|
-
from ..policy.gates import check
|
|
20
|
+
from ..policy.gates import Unavailable, check
|
|
19
21
|
|
|
20
22
|
NAME = "tier3_browser"
|
|
21
23
|
PAID = False
|
|
22
24
|
|
|
25
|
+
# Install hint surfaced when patchright or its Chromium isn't ready — notably
|
|
26
|
+
# during an async cold-start install (the browser binary lands after boot).
|
|
27
|
+
_INSTALL_HINT = 'pip install "switchback[browser]" && patchright install chromium'
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def available() -> tuple[bool, str]:
    """Check that patchright can be imported and its Chromium binary is on disk.

    Returns an `(ok, detail)` pair. On a cold start where a background thread
    installs the browser, the result becomes True once that install finishes.
    Consumed by `fetch`, to give a clear `unavailable` reason instead of a
    buried launch error, and by `switchback doctor`."""
    try:
        from patchright.sync_api import sync_playwright
    except ImportError:
        return False, f"patchright not installed — {_INSTALL_HINT}"

    try:
        with sync_playwright() as pw:
            chromium_path = pw.chromium.executable_path
    except Exception as e:  # pragma: no cover — driver start is environment-specific
        return False, f"patchright driver error: {e}"

    # The driver reports a path even when the binary was never downloaded,
    # so confirm the file is really there.
    if chromium_path and os.path.exists(chromium_path):
        return True, "patchright + Chromium ready"
    return False, f"patchright Chromium not installed — {_INSTALL_HINT}"
|
|
48
|
+
|
|
23
49
|
|
|
24
50
|
def fetch(url: str, timeout_ms: int = 15000) -> str:
|
|
25
|
-
|
|
51
|
+
try:
|
|
52
|
+
from patchright.sync_api import sync_playwright
|
|
53
|
+
except ImportError:
|
|
54
|
+
raise Unavailable(f"patchright not installed — {_INSTALL_HINT}")
|
|
26
55
|
with browser_slot(NAME), sync_playwright() as p:
|
|
27
|
-
|
|
56
|
+
try:
|
|
57
|
+
browser = p.chromium.launch(headless=True, proxy=playwright_proxy())
|
|
58
|
+
except Exception as e:
|
|
59
|
+
# Chromium not downloaded yet (cold-start window) reads as a launch
|
|
60
|
+
# error; surface it as unavailable + the fix, not a generic failure.
|
|
61
|
+
msg = str(e)
|
|
62
|
+
if "Executable doesn't exist" in msg or "patchright install" in msg:
|
|
63
|
+
raise Unavailable(
|
|
64
|
+
f"patchright Chromium not installed — {_INSTALL_HINT}")
|
|
65
|
+
raise
|
|
28
66
|
ctx = None
|
|
29
67
|
try:
|
|
30
68
|
# No user_agent override: patchright ships a real, internally
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: switchback
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
|
|
5
5
|
Author-email: Akash Kodavuru <akash@theaklabs.com>
|
|
6
6
|
License: MIT
|
|
@@ -163,6 +163,34 @@ pip install "cloudscraper @ git+https://github.com/VeNoMouS/cloudscraper@3.0.0"
|
|
|
163
163
|
Or run the whole thing as a container:
|
|
164
164
|
`docker build -t switchback . && docker run -p 8799:8799 switchback`.
|
|
165
165
|
|
|
166
|
+
### Production / cold-start deployment
|
|
167
|
+
|
|
168
|
+
The two heavy tiers pull dependencies that often can't be baked into a base image
|
|
169
|
+
and land *after* boot (e.g. an async install thread on Azure). Until they're
|
|
170
|
+
ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
|
|
171
|
+
fix) and the cascade falls through — they are never silently skipped. Checklist:
|
|
172
|
+
|
|
173
|
+
- **Tier 3 is the real workhorse for Cloudflare/JS sites** — make sure its browser
|
|
174
|
+
is installed: `patchright install chromium` (note: **patchright**, not vanilla
|
|
175
|
+
`playwright`). On a cold start, run this in your post-boot install step/thread;
|
|
176
|
+
Tier 3 flips to ready once it finishes.
|
|
177
|
+
- **Tier 2 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
|
|
178
|
+
frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
|
|
179
|
+
solve budget) instead of erroring mid-cascade. Tier 2 is a *weak* solver for
|
|
180
|
+
modern Cloudflare — treat it as a cheap try before the browser, not the primary.
|
|
181
|
+
- **Install Node.js** for Tier 2's v3 JS-VM challenges — faster and thread-safe
|
|
182
|
+
vs. the pure-Python js2py fallback (relevant under concurrent load).
|
|
183
|
+
- **Bound Tier 2's solve budget** with `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` (default
|
|
184
|
+
`25`) so an unsolvable challenge can't eat the per-URL deadline before the
|
|
185
|
+
browser tier runs. Lower it (e.g. `12`) if Tier 2 rarely wins on your hosts.
|
|
186
|
+
|
|
187
|
+
**Verify readiness on the box** with the preflight check (doubles as a healthcheck
|
|
188
|
+
— exit 0 when the capable tiers are ready):
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
switchback --doctor # or: python -m switchback --doctor
|
|
192
|
+
```
|
|
193
|
+
|
|
166
194
|
## Use it from your app
|
|
167
195
|
|
|
168
196
|
Three interchangeable entry points — all return the same shape
|
|
@@ -271,6 +299,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
271
299
|
|
|
272
300
|
- `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
|
|
273
301
|
- `SCRAPER_DEADLINE_S` — per-URL budget (45s)
|
|
302
|
+
- `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
|
|
274
303
|
- `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
|
|
275
304
|
- `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
|
|
276
305
|
- `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
|
|
@@ -279,6 +308,8 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
279
308
|
- `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
|
|
280
309
|
- `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
|
|
281
310
|
- `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
|
|
311
|
+
- `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER3_BROWSER=2`)
|
|
312
|
+
- `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
|
|
282
313
|
- `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
|
|
283
314
|
- `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
|
|
284
315
|
- `SCRAPER_TRACE_SESSION` — opt-in: capture a Playwright trace (screenshots + DOM + network) per browser-tier attempt, written to `state/traces/`
|
switchback-0.2.0/CHANGELOG.md
DELETED
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
# Changelog
|
|
2
|
-
|
|
3
|
-
All notable changes to this project are documented here. Format loosely follows
|
|
4
|
-
[Keep a Changelog](https://keepachangelog.com/); this project uses semantic-ish
|
|
5
|
-
versioning while pre-1.0.
|
|
6
|
-
|
|
7
|
-
## [Unreleased]
|
|
8
|
-
|
|
9
|
-
## [0.2.0] - 2026-06-25
|
|
10
|
-
|
|
11
|
-
### Added
|
|
12
|
-
- **Selectable output formats** — `SCRAPER_OUTPUT_FORMAT` (or per-call
|
|
13
|
-
`scrape(fmt=...)`, CLI `--format`, `/scrape` `{"format": ...}`) selects the
|
|
14
|
-
content shape: `markdown` (default, unchanged), `markdown_trimmed` (extra
|
|
15
|
-
ad/nav/boilerplate removed), `html` (raw), or `html_selectors` (cleaned HTML
|
|
16
|
-
with per-domain `drop`/`selector` applied). Default output is byte-identical;
|
|
17
|
-
html-family results use a `html` JSON key instead of `markdown`.
|
|
18
|
-
|
|
19
|
-
## [0.1.0] - 2026-06-23
|
|
20
|
-
|
|
21
|
-
### Added
|
|
22
|
-
- **Challenge-type learning** — bot-walls are classified by vendor (Cloudflare,
|
|
23
|
-
DataDome, Akamai, PerimeterX, Incapsula, Google) and counted per host in the
|
|
24
|
-
botwall DB; the vendor is attached to each event and OTel span (`scrape.challenge`).
|
|
25
|
-
- **Metrics & reporting** — `switchback.reporting` rolls the event log + botwall DB
|
|
26
|
-
into cost-savings-vs-Firecrawl, coverage, overall/per-tier/per-domain latency
|
|
27
|
-
(mean/median/min/max/p50/p95), outcomes, error codes by domain, and challenges
|
|
28
|
-
by domain. Exposed via `GET /metrics` and `GET /metrics/domains` (both accept
|
|
29
|
-
`?minutes=N`).
|
|
30
|
-
- **Periodic flagging** — `python -m switchback.flags` emits a cron-friendly digest
|
|
31
|
-
(domains stuck on Firecrawl, escalated to egress, most-challenged) to logs/OTel.
|
|
32
|
-
- **Content cache** — optional URL→result cache (`SCRAPER_CONTENT_TTL_S`, sqlite,
|
|
33
|
-
off by default) short-circuits re-scrapes before any tier runs.
|
|
34
|
-
- **Login-session refresh** — `SCRAPER_LOGIN_HOOK` (`pkg.module:func`) refreshes a
|
|
35
|
-
dead logged-in session on demand; cookies overlay every tier and persist.
|
|
36
|
-
- **Exponential backoff** — between-tier backoff with jitter after rate-limit /
|
|
37
|
-
timeout (`SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS`, off by default).
|
|
38
|
-
- **Per-domain extraction prefs** — `config/extraction.json` (CSS scope selector +
|
|
39
|
-
extra drops) applied automatically in the normalize step for every tier.
|
|
40
|
-
- **Session traces** — opt-in Playwright trace capture (`SCRAPER_TRACE_SESSION=1`)
|
|
41
|
-
for browser tiers, with `GET/DELETE /traces` management endpoints.
|
|
42
|
-
|
|
43
|
-
### Changed
|
|
44
|
-
- Tier 2's `cloudscraper` moved from a core dependency (which pinned a git-URL
|
|
45
|
-
fork PyPI can't publish) to the `cloudflare` extra; see the README for installing
|
|
46
|
-
the 3.x Enhanced Edition fork for full stealth.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|