switchback 0.1.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {switchback-0.1.0 → switchback-0.4.0}/.env.example +20 -0
  2. switchback-0.4.0/CHANGELOG.md +94 -0
  3. {switchback-0.1.0 → switchback-0.4.0}/PKG-INFO +63 -3
  4. {switchback-0.1.0 → switchback-0.4.0}/README.md +62 -2
  5. {switchback-0.1.0 → switchback-0.4.0}/clients/python_client.py +12 -6
  6. {switchback-0.1.0 → switchback-0.4.0}/pyproject.toml +1 -1
  7. {switchback-0.1.0 → switchback-0.4.0}/switchback/api.py +34 -9
  8. {switchback-0.1.0 → switchback-0.4.0}/switchback/content_cache.py +12 -9
  9. switchback-0.4.0/switchback/doctor.py +59 -0
  10. switchback-0.4.0/switchback/normalize.py +183 -0
  11. {switchback-0.1.0 → switchback-0.4.0}/switchback/orchestrator.py +186 -70
  12. {switchback-0.1.0 → switchback-0.4.0}/switchback/policy/gates.py +59 -0
  13. {switchback-0.1.0 → switchback-0.4.0}/switchback/server.py +9 -4
  14. {switchback-0.1.0 → switchback-0.4.0}/switchback/tiers/tier2_cloudscraper.py +33 -1
  15. {switchback-0.1.0 → switchback-0.4.0}/switchback/tiers/tier3_browser.py +41 -3
  16. {switchback-0.1.0 → switchback-0.4.0}/switchback/tiers/tier4_firecrawl.py +13 -4
  17. {switchback-0.1.0 → switchback-0.4.0}/switchback.egg-info/PKG-INFO +63 -3
  18. {switchback-0.1.0 → switchback-0.4.0}/switchback.egg-info/SOURCES.txt +1 -0
  19. switchback-0.1.0/CHANGELOG.md +0 -34
  20. switchback-0.1.0/switchback/normalize.py +0 -81
  21. {switchback-0.1.0 → switchback-0.4.0}/CONTRIBUTING.md +0 -0
  22. {switchback-0.1.0 → switchback-0.4.0}/LICENSE +0 -0
  23. {switchback-0.1.0 → switchback-0.4.0}/MANIFEST.in +0 -0
  24. {switchback-0.1.0 → switchback-0.4.0}/NOTICE +0 -0
  25. {switchback-0.1.0 → switchback-0.4.0}/SECURITY.md +0 -0
  26. {switchback-0.1.0 → switchback-0.4.0}/clients/node_bridge.md +0 -0
  27. {switchback-0.1.0 → switchback-0.4.0}/config/botwall_skip_urls.txt +0 -0
  28. {switchback-0.1.0 → switchback-0.4.0}/config/extraction.example.json +0 -0
  29. {switchback-0.1.0 → switchback-0.4.0}/setup.cfg +0 -0
  30. {switchback-0.1.0 → switchback-0.4.0}/switchback/__init__.py +0 -0
  31. {switchback-0.1.0 → switchback-0.4.0}/switchback/__main__.py +0 -0
  32. {switchback-0.1.0 → switchback-0.4.0}/switchback/concurrency.py +0 -0
  33. {switchback-0.1.0 → switchback-0.4.0}/switchback/egress.py +0 -0
  34. {switchback-0.1.0 → switchback-0.4.0}/switchback/extract.py +0 -0
  35. {switchback-0.1.0 → switchback-0.4.0}/switchback/flags.py +0 -0
  36. {switchback-0.1.0 → switchback-0.4.0}/switchback/policy/__init__.py +0 -0
  37. {switchback-0.1.0 → switchback-0.4.0}/switchback/policy/botwall.py +0 -0
  38. {switchback-0.1.0 → switchback-0.4.0}/switchback/py.typed +0 -0
  39. {switchback-0.1.0 → switchback-0.4.0}/switchback/reporting.py +0 -0
  40. {switchback-0.1.0 → switchback-0.4.0}/switchback/search.py +0 -0
  41. {switchback-0.1.0 → switchback-0.4.0}/switchback/session_cache.py +0 -0
  42. {switchback-0.1.0 → switchback-0.4.0}/switchback/session_trace.py +0 -0
  43. {switchback-0.1.0 → switchback-0.4.0}/switchback/tiers/__init__.py +0 -0
  44. {switchback-0.1.0 → switchback-0.4.0}/switchback/tiers/_browser.py +0 -0
  45. {switchback-0.1.0 → switchback-0.4.0}/switchback/tiers/tier0_apis.py +0 -0
  46. {switchback-0.1.0 → switchback-0.4.0}/switchback/tiers/tier1_http.py +0 -0
  47. {switchback-0.1.0 → switchback-0.4.0}/switchback/tiers/tier3b_camoufox.py +0 -0
  48. {switchback-0.1.0 → switchback-0.4.0}/switchback/tiers/tier_residential.py +0 -0
  49. {switchback-0.1.0 → switchback-0.4.0}/switchback/tracing.py +0 -0
  50. {switchback-0.1.0 → switchback-0.4.0}/switchback.egg-info/dependency_links.txt +0 -0
  51. {switchback-0.1.0 → switchback-0.4.0}/switchback.egg-info/entry_points.txt +0 -0
  52. {switchback-0.1.0 → switchback-0.4.0}/switchback.egg-info/requires.txt +0 -0
  53. {switchback-0.1.0 → switchback-0.4.0}/switchback.egg-info/top_level.txt +0 -0
@@ -14,6 +14,26 @@ OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
14
14
  # ── Search (Tier-0 SearXNG, query → URLs) ───────────────────────────────────
15
15
  SEARXNG_URL=http://localhost:8888
16
16
 
17
+ # ── Output format ───────────────────────────────────────────────────────────
18
+ # Shape of the scraped content. Default markdown is byte-identical to before;
19
+ # override per-call with scrape(fmt=...), the CLI --format flag, or the /scrape
20
+ # {"format": ...} field. html-family results land under a "html" key (instead of
21
+ # "markdown") in the CLI/server JSON.
22
+ # markdown whole-page markdown (default)
23
+ # markdown_trimmed markdown with extra ad/nav/boilerplate lines removed
24
+ # html raw HTML exactly as fetched (no cleaning)
25
+ # html_selectors cleaned HTML (boilerplate strip + per-domain drop/selector)
26
+ # Note: the API/PDF tiers (arXiv synth, PDF→text) have no HTML, so html formats
27
+ # fall back to their text for those sources.
28
+ SCRAPER_OUTPUT_FORMAT=markdown
29
+
30
+ # ── Tier 2 · Cloudflare solver (cloudscraper) ───────────────────────────────
31
+ # Needs the 3.x Enhanced Edition fork (see README); with the frozen PyPI build
32
+ # the tier reports `unavailable` and fails fast. Wall-clock cap on a single solve
33
+ # so an unsolvable challenge can't eat the per-URL deadline before the browser
34
+ # tier runs. Lower (e.g. 12) if Tier 2 rarely wins on your hosts.
35
+ SCRAPER_CLOUDSCRAPER_TIMEOUT_S=25
36
+
17
37
  # ── Tier 2.5 · Jina Reader (r.jina.ai) ──────────────────────────────────────
18
38
  # Optional: keyless works at 20 RPM. A key gives 500 RPM + a 10M-token grant.
19
39
  JINA_API_KEY=
@@ -0,0 +1,94 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. Format loosely follows
4
+ [Keep a Changelog](https://keepachangelog.com/); this project uses semantic-ish
5
+ versioning while pre-1.0.
6
+
7
+ ## [Unreleased]
8
+
9
+ ## [0.4.0] - 2026-06-29
10
+
11
+ ### Added
12
+ - **Configurable per-tier retries** — a tier can now re-attempt before falling
13
+ through to the next, more capable one. `SCRAPER_TIER_RETRIES` (global, default
14
+ `0` = off; `N` → up to `1+N` tries per tier), per-tier overrides
15
+ `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER3_BROWSER=2`), and
16
+ `SCRAPER_TIER_RETRY_ON` (retryable failure classes; default
17
+ `timeout,rate_limited,connection` — widen to include `botwall,http_block` behind
18
+ a rotating residential proxy, where each retry gets a fresh IP). Retries stay
19
+ bounded by `SCRAPER_DEADLINE_S`, and intermediate retries are traced/logged but
20
+ **not** persisted to the botwall policy DB, so they never inflate the
21
+ self-healing skip / `needs_egress` counters. Default `0` keeps behaviour
22
+ unchanged. Enabling retries on the paid Firecrawl tier bills per attempt.
23
+
24
+ ### Fixed
25
+ - **Quality gate rejects content shells** — the gate no longer passes a page just
26
+ because it clears the length floor; thin "shell" pages (nav/boilerplate with no
27
+ real article body) are now treated as a tier miss so the cascade falls through.
28
+ - **Paid last-resort budget reserve** — `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S`
29
+ (default 25s) stops starting local tiers once enough of the per-URL deadline has
30
+ elapsed and an enabled paid tier is still ahead, so a hard host can't burn the
31
+ whole budget before Firecrawl gets a turn.
32
+
33
+ ## [0.3.0] - 2026-06-27
34
+
35
+ ### Added
36
+ - **`unavailable` tier outcome** — when a tier's optional dependency is missing,
37
+ the wrong version, or not installed yet (frozen PyPI `cloudscraper` instead of
38
+ the 3.x stealth fork; patchright's Chromium not downloaded during an async
39
+ cold-start install), the tier now fails fast (~0ms) with a distinct
40
+ `unavailable` outcome carrying the exact install command, logged once per tier.
41
+ It ranks above bot-wall in the verdict, so an environment problem is no longer
42
+ masked as `botwall` — and a missing Tier 2 dependency no longer burns the
43
+ per-URL solve budget before the browser tier runs.
44
+ - **`switchback --doctor`** — preflight tier-readiness check (doubles as a
45
+ healthcheck: exit 0 when the capable tiers are ready). Reports whether
46
+ cloudscraper is the stealth-capable 3.x fork, patchright + Chromium are
47
+ installed, Camoufox/Node are present, and Firecrawl is configured. Built for
48
+ cold-start deploys where the browser is installed by a background thread after
49
+ boot.
50
+
51
+ ### Docs
52
+ - README **Production / cold-start deployment** section and a `.env.example`
53
+ Tier 2 block: install `patchright install chromium` in the post-boot step, the
54
+ cloudscraper 3.x fork requirement, Node.js for Tier 2 concurrency, and the
55
+ `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` budget knob.
56
+
57
+ ## [0.2.0] - 2026-06-25
58
+
59
+ ### Added
60
+ - **Selectable output formats** — `SCRAPER_OUTPUT_FORMAT` (or per-call
61
+ `scrape(fmt=...)`, CLI `--format`, `/scrape` `{"format": ...}`) selects the
62
+ content shape: `markdown` (default, unchanged), `markdown_trimmed` (extra
63
+ ad/nav/boilerplate removed), `html` (raw), or `html_selectors` (cleaned HTML
64
+ with per-domain `drop`/`selector` applied). Default output is byte-identical;
65
+ html-family results use a `html` JSON key instead of `markdown`.
66
+
67
+ ## [0.1.0] - 2026-06-23
68
+
69
+ ### Added
70
+ - **Challenge-type learning** — bot-walls are classified by vendor (Cloudflare,
71
+ DataDome, Akamai, PerimeterX, Incapsula, Google) and counted per host in the
72
+ botwall DB; the vendor is attached to each event and OTel span (`scrape.challenge`).
73
+ - **Metrics & reporting** — `switchback.reporting` rolls the event log + botwall DB
74
+ into cost-savings-vs-Firecrawl, coverage, overall/per-tier/per-domain latency
75
+ (mean/median/min/max/p50/p95), outcomes, error codes by domain, and challenges
76
+ by domain. Exposed via `GET /metrics` and `GET /metrics/domains` (both accept
77
+ `?minutes=N`).
78
+ - **Periodic flagging** — `python -m switchback.flags` emits a cron-friendly digest
79
+ (domains stuck on Firecrawl, escalated to egress, most-challenged) to logs/OTel.
80
+ - **Content cache** — optional URL→result cache (`SCRAPER_CONTENT_TTL_S`, sqlite,
81
+ off by default) short-circuits re-scrapes before any tier runs.
82
+ - **Login-session refresh** — `SCRAPER_LOGIN_HOOK` (`pkg.module:func`) refreshes a
83
+ dead logged-in session on demand; cookies overlay every tier and persist.
84
+ - **Exponential backoff** — between-tier backoff with jitter after rate-limit /
85
+ timeout (`SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS`, off by default).
86
+ - **Per-domain extraction prefs** — `config/extraction.json` (CSS scope selector +
87
+ extra drops) applied automatically in the normalize step for every tier.
88
+ - **Session traces** — opt-in Playwright trace capture (`SCRAPER_TRACE_SESSION=1`)
89
+ for browser tiers, with `GET/DELETE /traces` management endpoints.
90
+
91
+ ### Changed
92
+ - Tier 2's `cloudscraper` moved from a core dependency (which pinned a git-URL
93
+ fork PyPI can't publish) to the `cloudflare` extra; see the README for installing
94
+ the 3.x Enhanced Edition fork for full stealth.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: switchback
3
- Version: 0.1.0
3
+ Version: 0.4.0
4
4
  Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
5
5
  Author-email: Akash Kodavuru <akash@theaklabs.com>
6
6
  License: MIT
@@ -75,8 +75,8 @@ Dynamic: license-file
75
75
  Give it a URL; it tries the cheapest way to get clean Markdown first and only escalates
76
76
  to a heavier (slower, costlier) tier when the cheap one is walled. Stops at the first success.
77
77
 
78
- [![PyPI](https://img.shields.io/pypi/v/switchback.svg)](https://pypi.org/project/switchback/)
79
- [![Python](https://img.shields.io/pypi/pyversions/switchback.svg)](https://pypi.org/project/switchback/)
78
+ [![PyPI](https://img.shields.io/pypi/v/switchback)](https://pypi.org/project/switchback/)
79
+ [![Python](https://img.shields.io/pypi/pyversions/switchback)](https://pypi.org/project/switchback/)
80
80
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
81
81
  [![CI](https://github.com/akash-kr/switchback/actions/workflows/ci.yml/badge.svg)](https://github.com/akash-kr/switchback/actions/workflows/ci.yml)
82
82
 
@@ -163,6 +163,34 @@ pip install "cloudscraper @ git+https://github.com/VeNoMouS/cloudscraper@3.0.0"
163
163
  Or run the whole thing as a container:
164
164
  `docker build -t switchback . && docker run -p 8799:8799 switchback`.
165
165
 
166
+ ### Production / cold-start deployment
167
+
168
+ The two heavy tiers pull dependencies that often can't be baked into a base image
169
+ and land *after* boot (e.g. an async install thread on Azure). Until they're
170
+ ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
171
+ fix) and the cascade falls through — they are never silently skipped. Checklist:
172
+
173
+ - **Tier 3 is the real workhorse for Cloudflare/JS sites** — make sure its browser
174
+ is installed: `patchright install chromium` (note: **patchright**, not vanilla
175
+ `playwright`). On a cold start, run this in your post-boot install step/thread;
176
+ Tier 3 flips to ready once it finishes.
177
+ - **Tier 2 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
178
+ frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
179
+ solve budget) instead of erroring mid-cascade. Tier 2 is a *weak* solver for
180
+ modern Cloudflare — treat it as a cheap try before the browser, not the primary.
181
+ - **Install Node.js** for Tier 2's v3 JS-VM challenges — faster and thread-safe
182
+ vs. the pure-Python js2py fallback (relevant under concurrent load).
183
+ - **Bound Tier 2's solve budget** with `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` (default
184
+ `25`) so an unsolvable challenge can't eat the per-URL deadline before the
185
+ browser tier runs. Lower it (e.g. `12`) if Tier 2 rarely wins on your hosts.
186
+
187
+ **Verify readiness on the box** with the preflight check (doubles as a healthcheck
188
+ — exit 0 when the capable tiers are ready):
189
+
190
+ ```bash
191
+ switchback --doctor # or: python -m switchback --doctor
192
+ ```
193
+
166
194
  ## Use it from your app
167
195
 
168
196
  Three interchangeable entry points — all return the same shape
@@ -269,7 +297,9 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
269
297
  <details>
270
298
  <summary><b>Tunables</b> — budgets, timeouts, caches, backoff</summary>
271
299
 
300
+ - `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
272
301
  - `SCRAPER_DEADLINE_S` — per-URL budget (45s)
302
+ - `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
273
303
  - `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
274
304
  - `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
275
305
  - `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
@@ -278,6 +308,8 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
278
308
  - `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
279
309
  - `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
280
310
  - `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
311
+ - `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER3_BROWSER=2`)
312
+ - `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
281
313
  - `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
282
314
  - `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
283
315
  - `SCRAPER_TRACE_SESSION` — opt-in: capture a Playwright trace (screenshots + DOM + network) per browser-tier attempt, written to `state/traces/`
@@ -297,6 +329,34 @@ trace zip to `state/traces/`. Manage them over HTTP — `GET /traces` (list),
297
329
  `GET /traces/{id}` (download), `DELETE /traces/{id}` — and open one with
298
330
  `playwright show-trace <zip>`. Off by default (traces are MBs each).
299
331
 
332
+ ### Output formats
333
+ Markdown is the default and is unchanged. Pick a different shape globally with
334
+ `SCRAPER_OUTPUT_FORMAT`, or per call:
335
+
336
+ ```python
337
+ from switchback import scrape
338
+ scrape(["https://example.com/article"]) # markdown (default)
339
+ scrape(["https://example.com/article"], fmt="html") # raw HTML
340
+ scrape(["https://example.com/article"], fmt="markdown_trimmed")
341
+ ```
342
+
343
+ ```bash
344
+ switchback --format html_selectors https://example.com/article
345
+ curl -s localhost:8799/scrape -d '{"urls":["https://example.com"],"format":"html"}'
346
+ ```
347
+
348
+ | format | what you get |
349
+ | --- | --- |
350
+ | `markdown` | whole-page markdown (boilerplate stripped + per-domain prefs) — **default** |
351
+ | `markdown_trimmed` | markdown with extra ad/nav/boilerplate lines removed |
352
+ | `html` | the raw HTML exactly as fetched, untouched |
353
+ | `html_selectors` | cleaned HTML (boilerplate strip + per-domain `drop`/`selector`), not converted |
354
+
355
+ The chosen content rides in the result's `markdown` field; in the CLI/server JSON
356
+ the key is `markdown` for markdown formats and `html` for html formats. The
357
+ API/PDF tiers (arXiv synth, PDF→text) have no HTML, so html formats fall back to
358
+ their text for those sources.
359
+
300
360
  ### Per-domain extraction
301
361
  Markdown of the whole page is the default. To scope a site to its content node or
302
362
  strip site-specific noise, declare prefs per host in `config/extraction.json`
@@ -16,8 +16,8 @@
16
16
  Give it a URL; it tries the cheapest way to get clean Markdown first and only escalates
17
17
  to a heavier (slower, costlier) tier when the cheap one is walled. Stops at the first success.
18
18
 
19
- [![PyPI](https://img.shields.io/pypi/v/switchback.svg)](https://pypi.org/project/switchback/)
20
- [![Python](https://img.shields.io/pypi/pyversions/switchback.svg)](https://pypi.org/project/switchback/)
19
+ [![PyPI](https://img.shields.io/pypi/v/switchback)](https://pypi.org/project/switchback/)
20
+ [![Python](https://img.shields.io/pypi/pyversions/switchback)](https://pypi.org/project/switchback/)
21
21
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
22
22
  [![CI](https://github.com/akash-kr/switchback/actions/workflows/ci.yml/badge.svg)](https://github.com/akash-kr/switchback/actions/workflows/ci.yml)
23
23
 
@@ -104,6 +104,34 @@ pip install "cloudscraper @ git+https://github.com/VeNoMouS/cloudscraper@3.0.0"
104
104
  Or run the whole thing as a container:
105
105
  `docker build -t switchback . && docker run -p 8799:8799 switchback`.
106
106
 
107
+ ### Production / cold-start deployment
108
+
109
+ The two heavy tiers pull dependencies that often can't be baked into a base image
110
+ and land *after* boot (e.g. an async install thread on Azure). Until they're
111
+ ready, those tiers report **`unavailable`** (a distinct outcome carrying the exact
112
+ fix) and the cascade falls through — they are never silently skipped. Checklist:
113
+
114
+ - **Tier 3 is the real workhorse for Cloudflare/JS sites** — make sure its browser
115
+ is installed: `patchright install chromium` (note: **patchright**, not vanilla
116
+ `playwright`). On a cold start, run this in your post-boot install step/thread;
117
+ Tier 3 flips to ready once it finishes.
118
+ - **Tier 2 needs the cloudscraper 3.x fork** (above) to attempt stealth. With the
119
+ frozen PyPI `cloudscraper` it reports `unavailable` and fails fast (no wasted
120
+ solve budget) instead of erroring mid-cascade. Tier 2 is a *weak* solver for
121
+ modern Cloudflare — treat it as a cheap try before the browser, not the primary.
122
+ - **Install Node.js** for Tier 2's v3 JS-VM challenges — faster and thread-safe
123
+ vs. the pure-Python js2py fallback (relevant under concurrent load).
124
+ - **Bound Tier 2's solve budget** with `SCRAPER_CLOUDSCRAPER_TIMEOUT_S` (default
125
+ `25`) so an unsolvable challenge can't eat the per-URL deadline before the
126
+ browser tier runs. Lower it (e.g. `12`) if Tier 2 rarely wins on your hosts.
127
+
128
+ **Verify readiness on the box** with the preflight check (doubles as a healthcheck
129
+ — exit 0 when the capable tiers are ready):
130
+
131
+ ```bash
132
+ switchback --doctor # or: python -m switchback --doctor
133
+ ```
134
+
107
135
  ## Use it from your app
108
136
 
109
137
  Three interchangeable entry points — all return the same shape
@@ -210,7 +238,9 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
210
238
  <details>
211
239
  <summary><b>Tunables</b> — budgets, timeouts, caches, backoff</summary>
212
240
 
241
+ - `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
213
242
  - `SCRAPER_DEADLINE_S` — per-URL budget (45s)
243
+ - `SCRAPER_FIRECRAWL_FALLBACK_AFTER_S` — after this many seconds on a URL, stop trying the local tiers and fall back to Firecrawl, so a hard host doesn't burn the whole deadline before the paid last resort gets a turn (25s; 0 = off)
214
244
  - `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
215
245
  - `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
216
246
  - `SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H` — auto-skip re-test window (24h; 0 = never)
@@ -219,6 +249,8 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
219
249
  - `SCRAPER_DISABLE_SESSION_CACHE` — turn off cf_clearance reuse
220
250
  - `SCRAPER_CONTENT_TTL_S` — URL→result cache TTL (**0 = off**; set e.g. 86400 to skip re-scraping a page within a day)
221
251
  - `SCRAPER_BACKOFF_BASE_MS` / `SCRAPER_BACKOFF_MAX_MS` — exponential backoff between tiers after a rate-limit/timeout (base 0 = off)
252
+ - `SCRAPER_TIER_RETRIES` — same-tier retries before falling through (default 0 = off; `N` → up to `1+N` tries per tier), with per-tier overrides `SCRAPER_TIER_RETRIES_<TIER>` (e.g. `SCRAPER_TIER_RETRIES_TIER3_BROWSER=2`)
253
+ - `SCRAPER_TIER_RETRY_ON` — failure classes eligible for a same-tier retry (default `timeout,rate_limited,connection`; widen to include `botwall,http_block` behind a rotating residential proxy so each retry gets a fresh IP). Retries are bounded by `SCRAPER_DEADLINE_S`; enabling them on the paid Firecrawl tier bills per attempt
222
254
  - `SCRAPER_LOGIN_HOOK` — `pkg.module:func` returning `{cookie: value}` for a host (see [Logged-in sessions](#logged-in-sessions))
223
255
  - `SCRAPER_EXTRACTION_FILE` — per-domain extraction prefs JSON (default `config/extraction.json`)
224
256
  - `SCRAPER_TRACE_SESSION` — opt-in: capture a Playwright trace (screenshots + DOM + network) per browser-tier attempt, written to `state/traces/`
@@ -238,6 +270,34 @@ trace zip to `state/traces/`. Manage them over HTTP — `GET /traces` (list),
238
270
  `GET /traces/{id}` (download), `DELETE /traces/{id}` — and open one with
239
271
  `playwright show-trace <zip>`. Off by default (traces are MBs each).
240
272
 
273
+ ### Output formats
274
+ Markdown is the default and is unchanged. Pick a different shape globally with
275
+ `SCRAPER_OUTPUT_FORMAT`, or per call:
276
+
277
+ ```python
278
+ from switchback import scrape
279
+ scrape(["https://example.com/article"]) # markdown (default)
280
+ scrape(["https://example.com/article"], fmt="html") # raw HTML
281
+ scrape(["https://example.com/article"], fmt="markdown_trimmed")
282
+ ```
283
+
284
+ ```bash
285
+ switchback --format html_selectors https://example.com/article
286
+ curl -s localhost:8799/scrape -d '{"urls":["https://example.com"],"format":"html"}'
287
+ ```
288
+
289
+ | format | what you get |
290
+ | --- | --- |
291
+ | `markdown` | whole-page markdown (boilerplate stripped + per-domain prefs) — **default** |
292
+ | `markdown_trimmed` | markdown with extra ad/nav/boilerplate lines removed |
293
+ | `html` | the raw HTML exactly as fetched, untouched |
294
+ | `html_selectors` | cleaned HTML (boilerplate strip + per-domain `drop`/`selector`), not converted |
295
+
296
+ The chosen content rides in the result's `markdown` field; in the CLI/server JSON
297
+ the key is `markdown` for markdown formats and `html` for html formats. The
298
+ API/PDF tiers (arXiv synth, PDF→text) have no HTML, so html formats fall back to
299
+ their text for those sources.
300
+
241
301
  ### Per-domain extraction
242
302
  Markdown of the whole page is the default. To scope a site to its content node or
243
303
  strip site-specific noise, declare prefs per host in `config/extraction.json`
@@ -59,9 +59,10 @@ def _service_up() -> bool:
59
59
  return False
60
60
 
61
61
 
62
- def _cli_scrape(urls: list[str]) -> list[dict]:
62
+ def _cli_scrape(urls: list[str], fmt: str | None = None) -> list[dict]:
63
+ flag = ["--format", fmt] if fmt else []
63
64
  proc = subprocess.run(
64
- [sys.executable, "-m", "switchback", *urls],
65
+ [sys.executable, "-m", "switchback", *flag, *urls],
65
66
  cwd=ENGINE_DIR, capture_output=True, text=True,
66
67
  )
67
68
  if proc.returncode not in (0, 1): # 1 == "no successes", still valid JSON ([])
@@ -69,15 +70,20 @@ def _cli_scrape(urls: list[str]) -> list[dict]:
69
70
  return json.loads(proc.stdout or "[]")
70
71
 
71
72
 
72
- def scrape(urls: str | list[str]) -> list[dict]:
73
- """Scrape one or many URLs through the engine cascade. Successes only."""
73
+ def scrape(urls: str | list[str], fmt: str | None = None) -> list[dict]:
74
+ """Scrape one or many URLs through the engine cascade. Successes only.
75
+
76
+ fmt selects the output format (markdown | markdown_trimmed | html |
77
+ html_selectors); None uses the engine default (markdown). For html formats the
78
+ content lands under a "html" key instead of "markdown"."""
74
79
  if isinstance(urls, str):
75
80
  urls = [urls]
76
81
  if not urls:
77
82
  return []
78
83
  if _service_up():
79
- return _http_post("/scrape", {"urls": urls})
80
- return _cli_scrape(urls)
84
+ payload = {"urls": urls, "format": fmt} if fmt else {"urls": urls}
85
+ return _http_post("/scrape", payload)
86
+ return _cli_scrape(urls, fmt)
81
87
 
82
88
 
83
89
  def search(query: str) -> list[dict]:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "switchback"
7
- version = "0.1.0"
7
+ version = "0.4.0"
8
8
  description = "One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -10,27 +10,30 @@ from __future__ import annotations
10
10
 
11
11
  import sys
12
12
 
13
+ from .normalize import output_key
13
14
  from .orchestrator import ScrapeOutcome, ScrapeResult, TierAttempt, run, run_detailed
14
15
  from .search import search # re-export: query → URLs (SearXNG)
15
16
 
16
17
 
17
- def scrape(urls: str | list[str]) -> list[ScrapeResult]:
18
+ def scrape(urls: str | list[str], fmt: str | None = None) -> list[ScrapeResult]:
18
19
  """Scrape one or many URLs through the cascade. Returns successes only.
19
20
 
21
+ fmt selects the output format (markdown | markdown_trimmed | html |
22
+ html_selectors); None uses the SCRAPER_OUTPUT_FORMAT default (markdown).
20
23
  For failures with classified reasons + the per-tier cascade, use
21
24
  scrape_detailed()."""
22
25
  if isinstance(urls, str):
23
26
  urls = [urls]
24
- return run(urls)
27
+ return run(urls, fmt)
25
28
 
26
29
 
27
- def scrape_detailed(urls: str | list[str]) -> list[ScrapeOutcome]:
30
+ def scrape_detailed(urls: str | list[str], fmt: str | None = None) -> list[ScrapeOutcome]:
28
31
  """Like scrape() but returns a ScrapeOutcome per URL — successes *and*
29
32
  failures, each with final_outcome, error_class, status_code, and the
30
- per-tier attempts that were made."""
33
+ per-tier attempts that were made. fmt as in scrape()."""
31
34
  if isinstance(urls, str):
32
35
  urls = [urls]
33
- return run_detailed(urls)
36
+ return run_detailed(urls, fmt)
34
37
 
35
38
 
36
39
  def _main() -> int:
@@ -50,14 +53,20 @@ def _main() -> int:
50
53
  _k = _k.strip()
51
54
  if _k and _k not in _os.environ:
52
55
  _os.environ[_k] = _v.strip()
53
- usage = ("usage: switchback <url> [<url> ...]\n"
56
+ usage = ("usage: switchback [--format FMT] <url> [<url> ...]\n"
54
57
  " switchback --search <query ...>\n"
55
- " (or: python -m switchback <url> ...)")
58
+ " switchback --doctor\n"
59
+ " (or: python -m switchback <url> ...)\n"
60
+ " FMT: markdown (default) | markdown_trimmed | html | html_selectors")
56
61
  # --help/-h is an explicit request: usage to stdout, exit 0 (don't treat it
57
62
  # as a URL to scrape). Check before any work so it stays fast and side-effect-free.
58
63
  if any(a in ("--help", "-h") for a in sys.argv[1:]):
59
64
  print(usage)
60
65
  return 0
66
+ # --doctor: preflight tier-readiness report (no scrape). Side-effect-free.
67
+ if "--doctor" in sys.argv[1:]:
68
+ from .doctor import report
69
+ return report()
61
70
  logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
62
71
  setup_logs() # also ship logs to the OTLP backend when configured
63
72
  if len(sys.argv) < 2:
@@ -69,9 +78,25 @@ def _main() -> int:
69
78
  [{"title": h.title, "url": h.url, "snippet": h.snippet} for h in hits],
70
79
  indent=2))
71
80
  return 0 if hits else 1
72
- results = scrape(sys.argv[1:])
81
+ # Optional --format / --format=FMT flag; everything else is a URL.
82
+ fmt: str | None = None
83
+ rest: list[str] = []
84
+ argv = sys.argv[1:]
85
+ i = 0
86
+ while i < len(argv):
87
+ a = argv[i]
88
+ if a == "--format" and i + 1 < len(argv):
89
+ fmt = argv[i + 1]; i += 2; continue
90
+ if a.startswith("--format="):
91
+ fmt = a.split("=", 1)[1]; i += 1; continue
92
+ rest.append(a); i += 1
93
+ if not rest:
94
+ print(usage, file=sys.stderr)
95
+ return 2
96
+ results = scrape(rest, fmt=fmt)
73
97
  print(json.dumps(
74
- [{"url": r.url, "source_method": r.source_method, "markdown": r.markdown}
98
+ [{"url": r.url, "source_method": r.source_method,
99
+ output_key(r.format): r.markdown}
75
100
  for r in results],
76
101
  indent=2))
77
102
  return 0 if results else 1
@@ -36,11 +36,14 @@ def enabled() -> bool:
36
36
  return _TTL_S > 0
37
37
 
38
38
 
39
- def _norm(url: str) -> str:
40
- """Drop the fragment; everything else is significant (query strings select
41
- content)."""
39
+ def _norm(url: str, fmt: str = "markdown") -> str:
40
+ """Cache key: URL with the fragment dropped (query strings select content).
41
+ Non-default output formats are namespaced so an html result is never served
42
+ for a markdown request; the default `markdown` key is unprefixed, so existing
43
+ caches and the default path are unchanged."""
42
44
  p = urlsplit(url)
43
- return urlunsplit((p.scheme, p.netloc, p.path, p.query, ""))
45
+ key = urlunsplit((p.scheme, p.netloc, p.path, p.query, ""))
46
+ return key if fmt == "markdown" else f"{fmt}\x00{key}"
44
47
 
45
48
 
46
49
  def _conn() -> sqlite3.Connection:
@@ -58,8 +61,8 @@ def _conn() -> sqlite3.Connection:
58
61
  return _CONN
59
62
 
60
63
 
61
- def get(url: str) -> tuple[str, str] | None:
62
- """Return ``(markdown, source_method)`` for a fresh cache hit, else None."""
64
+ def get(url: str, fmt: str = "markdown") -> tuple[str, str] | None:
65
+ """Return ``(content, source_method)`` for a fresh cache hit, else None."""
63
66
  if not enabled():
64
67
  return None
65
68
  conn = _conn() # NB: acquires _LOCK itself — must be outside the lock below
@@ -67,7 +70,7 @@ def get(url: str) -> tuple[str, str] | None:
67
70
  with _LOCK:
68
71
  row = conn.execute(
69
72
  "SELECT markdown, source_method, ts FROM cache WHERE url=?",
70
- (_norm(url),)).fetchone()
73
+ (_norm(url, fmt),)).fetchone()
71
74
  except Exception as e:
72
75
  logger.warning(f"content_cache: read failed: {e}")
73
76
  return None
@@ -79,7 +82,7 @@ def get(url: str) -> tuple[str, str] | None:
79
82
  return markdown, source_method
80
83
 
81
84
 
82
- def put(url: str, markdown: str, source_method: str) -> None:
85
+ def put(url: str, markdown: str, source_method: str, fmt: str = "markdown") -> None:
83
86
  """Store a successful scrape. No-op when disabled."""
84
87
  if not enabled():
85
88
  return
@@ -88,7 +91,7 @@ def put(url: str, markdown: str, source_method: str) -> None:
88
91
  with _LOCK:
89
92
  conn.execute("INSERT OR REPLACE INTO cache (url, markdown, source_method, ts) "
90
93
  "VALUES (?, ?, ?, ?)",
91
- (_norm(url), markdown, source_method, time.time()))
94
+ (_norm(url, fmt), markdown, source_method, time.time()))
92
95
  conn.commit()
93
96
  except Exception as e:
94
97
  logger.warning(f"content_cache: write failed: {e}")
@@ -0,0 +1,59 @@
1
+ """Preflight readiness check — `switchback doctor`.
2
+
3
+ Reports which tiers can actually run on this box and, when one can't, the exact
4
+ fix. Built for cold-start deploys (e.g. Azure) where the stealth browser is
5
+ installed by a background thread *after* boot: run this to confirm the tiers are
6
+ live before sending traffic, or to see why Tier 2/3 aren't catching anything.
7
+
8
+ Exit code: 0 if both capable local tiers (cloudscraper + browser) are ready,
9
+ else 1 — so it doubles as a healthcheck.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ import shutil
15
+
16
+ from .tiers import tier2_cloudscraper, tier3_browser
17
+
18
+
19
+ def _camoufox() -> tuple[bool, str]:
20
+ if os.getenv("SCRAPER_DISABLE_CAMOUFOX"):
21
+ return False, "off (SCRAPER_DISABLE_CAMOUFOX set)"
22
+ try:
23
+ import camoufox # noqa: F401
24
+ except ImportError:
25
+ return False, 'not installed — pip install "switchback[camoufox]" && camoufox fetch'
26
+ return True, "camoufox installed"
27
+
28
+
29
+ def probe() -> list[tuple[str, bool, str]]:
30
+ """(label, ok, detail) for each tier/dependency that matters at runtime."""
31
+ cs_ok, cs_detail = tier2_cloudscraper.available()
32
+ br_ok, br_detail = tier3_browser.available()
33
+ node = shutil.which("node")
34
+ return [
35
+ ("tier2_cloudscraper", cs_ok, cs_detail),
36
+ ("tier3_browser", br_ok, br_detail),
37
+ ("tier3b_camoufox", *_camoufox()),
38
+ ("node (tier2 v3 concurrency)", bool(node),
39
+ node or "not on PATH — Tier 2 falls back to slower, thread-fragile js2py"),
40
+ ("tier4_firecrawl", bool(os.getenv("FIRECRAWL_API_KEY")),
41
+ "FIRECRAWL_API_KEY set" if os.getenv("FIRECRAWL_API_KEY")
42
+ else "off (no FIRECRAWL_API_KEY)"),
43
+ ]
44
+
45
+
46
+ def report() -> int:
47
+ rows = probe()
48
+ print("switchback doctor — tier readiness\n")
49
+ for label, ok, detail in rows:
50
+ mark = "OK " if ok else "MISS"
51
+ print(f" [{mark}] {label:30} {detail}")
52
+ cs_ok = rows[0][1]
53
+ br_ok = rows[1][1]
54
+ if cs_ok and br_ok:
55
+ print("\nCapable tiers ready.")
56
+ return 0
57
+ print("\nOne or more capable tiers are unavailable (see above). On a cold "
58
+ "start this may resolve once the async install thread finishes.")
59
+ return 1