switchback 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {switchback-0.1.0 → switchback-0.2.0}/.env.example +13 -0
- {switchback-0.1.0 → switchback-0.2.0}/CHANGELOG.md +12 -0
- {switchback-0.1.0 → switchback-0.2.0}/PKG-INFO +32 -3
- {switchback-0.1.0 → switchback-0.2.0}/README.md +31 -2
- {switchback-0.1.0 → switchback-0.2.0}/clients/python_client.py +12 -6
- {switchback-0.1.0 → switchback-0.2.0}/pyproject.toml +1 -1
- {switchback-0.1.0 → switchback-0.2.0}/switchback/api.py +29 -9
- {switchback-0.1.0 → switchback-0.2.0}/switchback/content_cache.py +12 -9
- switchback-0.2.0/switchback/normalize.py +183 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/orchestrator.py +19 -12
- {switchback-0.1.0 → switchback-0.2.0}/switchback/server.py +9 -4
- {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/tier4_firecrawl.py +13 -4
- {switchback-0.1.0 → switchback-0.2.0}/switchback.egg-info/PKG-INFO +32 -3
- switchback-0.1.0/switchback/normalize.py +0 -81
- {switchback-0.1.0 → switchback-0.2.0}/CONTRIBUTING.md +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/LICENSE +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/MANIFEST.in +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/NOTICE +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/SECURITY.md +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/clients/node_bridge.md +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/config/botwall_skip_urls.txt +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/config/extraction.example.json +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/setup.cfg +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/__init__.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/__main__.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/concurrency.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/egress.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/extract.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/flags.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/policy/__init__.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/policy/botwall.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/policy/gates.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/py.typed +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/reporting.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/search.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/session_cache.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/session_trace.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/__init__.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/_browser.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/tier0_apis.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/tier1_http.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/tier2_cloudscraper.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/tier3_browser.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/tier3b_camoufox.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/tier_residential.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback/tracing.py +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback.egg-info/SOURCES.txt +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback.egg-info/dependency_links.txt +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback.egg-info/entry_points.txt +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback.egg-info/requires.txt +0 -0
- {switchback-0.1.0 → switchback-0.2.0}/switchback.egg-info/top_level.txt +0 -0
|
@@ -14,6 +14,19 @@ OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
14
14
|
# ── Search (Tier-0 SearXNG, query → URLs) ───────────────────────────────────
|
|
15
15
|
SEARXNG_URL=http://localhost:8888
|
|
16
16
|
|
|
17
|
+
# ── Output format ───────────────────────────────────────────────────────────
|
|
18
|
+
# Shape of the scraped content. Default markdown is byte-identical to before;
|
|
19
|
+
# override per-call with scrape(fmt=...), the CLI --format flag, or the /scrape
|
|
20
|
+
# {"format": ...} field. html-family results land under a "html" key (instead of
|
|
21
|
+
# "markdown") in the CLI/server JSON.
|
|
22
|
+
# markdown whole-page markdown (default)
|
|
23
|
+
# markdown_trimmed markdown with extra ad/nav/boilerplate lines removed
|
|
24
|
+
# html raw HTML exactly as fetched (no cleaning)
|
|
25
|
+
# html_selectors cleaned HTML (boilerplate strip + per-domain drop/selector)
|
|
26
|
+
# Note: the API/PDF tiers (arXiv synth, PDF→text) have no HTML, so html formats
|
|
27
|
+
# fall back to their text for those sources.
|
|
28
|
+
SCRAPER_OUTPUT_FORMAT=markdown
|
|
29
|
+
|
|
17
30
|
# ── Tier 2.5 · Jina Reader (r.jina.ai) ──────────────────────────────────────
|
|
18
31
|
# Optional: keyless works at 20 RPM. A key gives 500 RPM + a 10M-token grant.
|
|
19
32
|
JINA_API_KEY=
|
|
@@ -6,6 +6,18 @@ versioning while pre-1.0.
|
|
|
6
6
|
|
|
7
7
|
## [Unreleased]
|
|
8
8
|
|
|
9
|
+
## [0.2.0] - 2026-06-25
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- **Selectable output formats** — `SCRAPER_OUTPUT_FORMAT` (or per-call
|
|
13
|
+
`scrape(fmt=...)`, CLI `--format`, `/scrape` `{"format": ...}`) selects the
|
|
14
|
+
content shape: `markdown` (default, unchanged), `markdown_trimmed` (extra
|
|
15
|
+
ad/nav/boilerplate removed), `html` (raw), or `html_selectors` (cleaned HTML
|
|
16
|
+
with per-domain `drop`/`selector` applied). Default output is byte-identical;
|
|
17
|
+
html-family results use a `html` JSON key instead of `markdown`.
|
|
18
|
+
|
|
19
|
+
## [0.1.0] - 2026-06-23
|
|
20
|
+
|
|
9
21
|
### Added
|
|
10
22
|
- **Challenge-type learning** — bot-walls are classified by vendor (Cloudflare,
|
|
11
23
|
DataDome, Akamai, PerimeterX, Incapsula, Google) and counted per host in the
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: switchback
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
|
|
5
5
|
Author-email: Akash Kodavuru <akash@theaklabs.com>
|
|
6
6
|
License: MIT
|
|
@@ -75,8 +75,8 @@ Dynamic: license-file
|
|
|
75
75
|
Give it a URL; it tries the cheapest way to get clean Markdown first and only escalates
|
|
76
76
|
to a heavier (slower, costlier) tier when the cheap one is walled. Stops at the first success.
|
|
77
77
|
|
|
78
|
-
[](https://pypi.org/project/switchback/)
|
|
79
|
+
[](https://pypi.org/project/switchback/)
|
|
80
80
|
[](LICENSE)
|
|
81
81
|
[](https://github.com/akash-kr/switchback/actions/workflows/ci.yml)
|
|
82
82
|
|
|
@@ -269,6 +269,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
269
269
|
<details>
|
|
270
270
|
<summary><b>Tunables</b> — budgets, timeouts, caches, backoff</summary>
|
|
271
271
|
|
|
272
|
+
- `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
|
|
272
273
|
- `SCRAPER_DEADLINE_S` — per-URL budget (45s)
|
|
273
274
|
- `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
|
|
274
275
|
- `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
|
|
@@ -297,6 +298,34 @@ trace zip to `state/traces/`. Manage them over HTTP — `GET /traces` (list),
|
|
|
297
298
|
`GET /traces/{id}` (download), `DELETE /traces/{id}` — and open one with
|
|
298
299
|
`playwright show-trace <zip>`. Off by default (traces are MBs each).
|
|
299
300
|
|
|
301
|
+
### Output formats
|
|
302
|
+
Markdown is the default and is unchanged. Pick a different shape globally with
|
|
303
|
+
`SCRAPER_OUTPUT_FORMAT`, or per call:
|
|
304
|
+
|
|
305
|
+
```python
|
|
306
|
+
from switchback import scrape
|
|
307
|
+
scrape(["https://example.com/article"]) # markdown (default)
|
|
308
|
+
scrape(["https://example.com/article"], fmt="html") # raw HTML
|
|
309
|
+
scrape(["https://example.com/article"], fmt="markdown_trimmed")
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
```bash
|
|
313
|
+
switchback --format html_selectors https://example.com/article
|
|
314
|
+
curl -s localhost:8799/scrape -d '{"urls":["https://example.com"],"format":"html"}'
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
| format | what you get |
|
|
318
|
+
| --- | --- |
|
|
319
|
+
| `markdown` | whole-page markdown (boilerplate stripped + per-domain prefs) — **default** |
|
|
320
|
+
| `markdown_trimmed` | markdown with extra ad/nav/boilerplate lines removed |
|
|
321
|
+
| `html` | the raw HTML exactly as fetched, untouched |
|
|
322
|
+
| `html_selectors` | cleaned HTML (boilerplate strip + per-domain `drop`/`selector`), not converted |
|
|
323
|
+
|
|
324
|
+
The chosen content rides in the result's `markdown` field; in the CLI/server JSON
|
|
325
|
+
the key is `markdown` for markdown formats and `html` for html formats. The
|
|
326
|
+
API/PDF tiers (arXiv synth, PDF→text) have no HTML, so html formats fall back to
|
|
327
|
+
their text for those sources.
|
|
328
|
+
|
|
300
329
|
### Per-domain extraction
|
|
301
330
|
Markdown of the whole page is the default. To scope a site to its content node or
|
|
302
331
|
strip site-specific noise, declare prefs per host in `config/extraction.json`
|
|
@@ -16,8 +16,8 @@
|
|
|
16
16
|
Give it a URL; it tries the cheapest way to get clean Markdown first and only escalates
|
|
17
17
|
to a heavier (slower, costlier) tier when the cheap one is walled. Stops at the first success.
|
|
18
18
|
|
|
19
|
-
[](https://pypi.org/project/switchback/)
|
|
20
|
+
[](https://pypi.org/project/switchback/)
|
|
21
21
|
[](LICENSE)
|
|
22
22
|
[](https://github.com/akash-kr/switchback/actions/workflows/ci.yml)
|
|
23
23
|
|
|
@@ -210,6 +210,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
210
210
|
<details>
|
|
211
211
|
<summary><b>Tunables</b> — budgets, timeouts, caches, backoff</summary>
|
|
212
212
|
|
|
213
|
+
- `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
|
|
213
214
|
- `SCRAPER_DEADLINE_S` — per-URL budget (45s)
|
|
214
215
|
- `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
|
|
215
216
|
- `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
|
|
@@ -238,6 +239,34 @@ trace zip to `state/traces/`. Manage them over HTTP — `GET /traces` (list),
|
|
|
238
239
|
`GET /traces/{id}` (download), `DELETE /traces/{id}` — and open one with
|
|
239
240
|
`playwright show-trace <zip>`. Off by default (traces are MBs each).
|
|
240
241
|
|
|
242
|
+
### Output formats
|
|
243
|
+
Markdown is the default and is unchanged. Pick a different shape globally with
|
|
244
|
+
`SCRAPER_OUTPUT_FORMAT`, or per call:
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
from switchback import scrape
|
|
248
|
+
scrape(["https://example.com/article"]) # markdown (default)
|
|
249
|
+
scrape(["https://example.com/article"], fmt="html") # raw HTML
|
|
250
|
+
scrape(["https://example.com/article"], fmt="markdown_trimmed")
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
switchback --format html_selectors https://example.com/article
|
|
255
|
+
curl -s localhost:8799/scrape -d '{"urls":["https://example.com"],"format":"html"}'
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
| format | what you get |
|
|
259
|
+
| --- | --- |
|
|
260
|
+
| `markdown` | whole-page markdown (boilerplate stripped + per-domain prefs) — **default** |
|
|
261
|
+
| `markdown_trimmed` | markdown with extra ad/nav/boilerplate lines removed |
|
|
262
|
+
| `html` | the raw HTML exactly as fetched, untouched |
|
|
263
|
+
| `html_selectors` | cleaned HTML (boilerplate strip + per-domain `drop`/`selector`), not converted |
|
|
264
|
+
|
|
265
|
+
The chosen content rides in the result's `markdown` field; in the CLI/server JSON
|
|
266
|
+
the key is `markdown` for markdown formats and `html` for html formats. The
|
|
267
|
+
API/PDF tiers (arXiv synth, PDF→text) have no HTML, so html formats fall back to
|
|
268
|
+
their text for those sources.
|
|
269
|
+
|
|
241
270
|
### Per-domain extraction
|
|
242
271
|
Markdown of the whole page is the default. To scope a site to its content node or
|
|
243
272
|
strip site-specific noise, declare prefs per host in `config/extraction.json`
|
|
@@ -59,9 +59,10 @@ def _service_up() -> bool:
|
|
|
59
59
|
return False
|
|
60
60
|
|
|
61
61
|
|
|
62
|
-
def _cli_scrape(urls: list[str]) -> list[dict]:
|
|
62
|
+
def _cli_scrape(urls: list[str], fmt: str | None = None) -> list[dict]:
|
|
63
|
+
flag = ["--format", fmt] if fmt else []
|
|
63
64
|
proc = subprocess.run(
|
|
64
|
-
[sys.executable, "-m", "switchback", *urls],
|
|
65
|
+
[sys.executable, "-m", "switchback", *flag, *urls],
|
|
65
66
|
cwd=ENGINE_DIR, capture_output=True, text=True,
|
|
66
67
|
)
|
|
67
68
|
if proc.returncode not in (0, 1): # 1 == "no successes", still valid JSON ([])
|
|
@@ -69,15 +70,20 @@ def _cli_scrape(urls: list[str]) -> list[dict]:
|
|
|
69
70
|
return json.loads(proc.stdout or "[]")
|
|
70
71
|
|
|
71
72
|
|
|
72
|
-
def scrape(urls: str | list[str]) -> list[dict]:
|
|
73
|
-
"""Scrape one or many URLs through the engine cascade. Successes only.
|
|
73
|
+
def scrape(urls: str | list[str], fmt: str | None = None) -> list[dict]:
|
|
74
|
+
"""Scrape one or many URLs through the engine cascade. Successes only.
|
|
75
|
+
|
|
76
|
+
fmt selects the output format (markdown | markdown_trimmed | html |
|
|
77
|
+
html_selectors); None uses the engine default (markdown). For html formats the
|
|
78
|
+
content lands under a "html" key instead of "markdown"."""
|
|
74
79
|
if isinstance(urls, str):
|
|
75
80
|
urls = [urls]
|
|
76
81
|
if not urls:
|
|
77
82
|
return []
|
|
78
83
|
if _service_up():
|
|
79
|
-
|
|
80
|
-
|
|
84
|
+
payload = {"urls": urls, "format": fmt} if fmt else {"urls": urls}
|
|
85
|
+
return _http_post("/scrape", payload)
|
|
86
|
+
return _cli_scrape(urls, fmt)
|
|
81
87
|
|
|
82
88
|
|
|
83
89
|
def search(query: str) -> list[dict]:
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "switchback"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = "One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -10,27 +10,30 @@ from __future__ import annotations
|
|
|
10
10
|
|
|
11
11
|
import sys
|
|
12
12
|
|
|
13
|
+
from .normalize import output_key
|
|
13
14
|
from .orchestrator import ScrapeOutcome, ScrapeResult, TierAttempt, run, run_detailed
|
|
14
15
|
from .search import search # re-export: query → URLs (SearXNG)
|
|
15
16
|
|
|
16
17
|
|
|
17
|
-
def scrape(urls: str | list[str]) -> list[ScrapeResult]:
|
|
18
|
+
def scrape(urls: str | list[str], fmt: str | None = None) -> list[ScrapeResult]:
|
|
18
19
|
"""Scrape one or many URLs through the cascade. Returns successes only.
|
|
19
20
|
|
|
21
|
+
fmt selects the output format (markdown | markdown_trimmed | html |
|
|
22
|
+
html_selectors); None uses the SCRAPER_OUTPUT_FORMAT default (markdown).
|
|
20
23
|
For failures with classified reasons + the per-tier cascade, use
|
|
21
24
|
scrape_detailed()."""
|
|
22
25
|
if isinstance(urls, str):
|
|
23
26
|
urls = [urls]
|
|
24
|
-
return run(urls)
|
|
27
|
+
return run(urls, fmt)
|
|
25
28
|
|
|
26
29
|
|
|
27
|
-
def scrape_detailed(urls: str | list[str]) -> list[ScrapeOutcome]:
|
|
30
|
+
def scrape_detailed(urls: str | list[str], fmt: str | None = None) -> list[ScrapeOutcome]:
|
|
28
31
|
"""Like scrape() but returns a ScrapeOutcome per URL — successes *and*
|
|
29
32
|
failures, each with final_outcome, error_class, status_code, and the
|
|
30
|
-
per-tier attempts that were made."""
|
|
33
|
+
per-tier attempts that were made. fmt as in scrape()."""
|
|
31
34
|
if isinstance(urls, str):
|
|
32
35
|
urls = [urls]
|
|
33
|
-
return run_detailed(urls)
|
|
36
|
+
return run_detailed(urls, fmt)
|
|
34
37
|
|
|
35
38
|
|
|
36
39
|
def _main() -> int:
|
|
@@ -50,9 +53,10 @@ def _main() -> int:
|
|
|
50
53
|
_k = _k.strip()
|
|
51
54
|
if _k and _k not in _os.environ:
|
|
52
55
|
_os.environ[_k] = _v.strip()
|
|
53
|
-
usage = ("usage: switchback <url> [<url> ...]\n"
|
|
56
|
+
usage = ("usage: switchback [--format FMT] <url> [<url> ...]\n"
|
|
54
57
|
" switchback --search <query ...>\n"
|
|
55
|
-
" (or: python -m switchback <url> ...)"
|
|
58
|
+
" (or: python -m switchback <url> ...)\n"
|
|
59
|
+
" FMT: markdown (default) | markdown_trimmed | html | html_selectors")
|
|
56
60
|
# --help/-h is an explicit request: usage to stdout, exit 0 (don't treat it
|
|
57
61
|
# as a URL to scrape). Check before any work so it stays fast and side-effect-free.
|
|
58
62
|
if any(a in ("--help", "-h") for a in sys.argv[1:]):
|
|
@@ -69,9 +73,25 @@ def _main() -> int:
|
|
|
69
73
|
[{"title": h.title, "url": h.url, "snippet": h.snippet} for h in hits],
|
|
70
74
|
indent=2))
|
|
71
75
|
return 0 if hits else 1
|
|
72
|
-
|
|
76
|
+
# Optional --format / --format=FMT flag; everything else is a URL.
|
|
77
|
+
fmt: str | None = None
|
|
78
|
+
rest: list[str] = []
|
|
79
|
+
argv = sys.argv[1:]
|
|
80
|
+
i = 0
|
|
81
|
+
while i < len(argv):
|
|
82
|
+
a = argv[i]
|
|
83
|
+
if a == "--format" and i + 1 < len(argv):
|
|
84
|
+
fmt = argv[i + 1]; i += 2; continue
|
|
85
|
+
if a.startswith("--format="):
|
|
86
|
+
fmt = a.split("=", 1)[1]; i += 1; continue
|
|
87
|
+
rest.append(a); i += 1
|
|
88
|
+
if not rest:
|
|
89
|
+
print(usage, file=sys.stderr)
|
|
90
|
+
return 2
|
|
91
|
+
results = scrape(rest, fmt=fmt)
|
|
73
92
|
print(json.dumps(
|
|
74
|
-
[{"url": r.url, "source_method": r.source_method,
|
|
93
|
+
[{"url": r.url, "source_method": r.source_method,
|
|
94
|
+
output_key(r.format): r.markdown}
|
|
75
95
|
for r in results],
|
|
76
96
|
indent=2))
|
|
77
97
|
return 0 if results else 1
|
|
@@ -36,11 +36,14 @@ def enabled() -> bool:
|
|
|
36
36
|
return _TTL_S > 0
|
|
37
37
|
|
|
38
38
|
|
|
39
|
-
def _norm(url: str) -> str:
|
|
40
|
-
"""
|
|
41
|
-
|
|
39
|
+
def _norm(url: str, fmt: str = "markdown") -> str:
|
|
40
|
+
"""Cache key: URL with the fragment dropped (query strings select content).
|
|
41
|
+
Non-default output formats are namespaced so an html result is never served
|
|
42
|
+
for a markdown request; the default `markdown` key is unprefixed, so existing
|
|
43
|
+
caches and the default path are unchanged."""
|
|
42
44
|
p = urlsplit(url)
|
|
43
|
-
|
|
45
|
+
key = urlunsplit((p.scheme, p.netloc, p.path, p.query, ""))
|
|
46
|
+
return key if fmt == "markdown" else f"{fmt}\x00{key}"
|
|
44
47
|
|
|
45
48
|
|
|
46
49
|
def _conn() -> sqlite3.Connection:
|
|
@@ -58,8 +61,8 @@ def _conn() -> sqlite3.Connection:
|
|
|
58
61
|
return _CONN
|
|
59
62
|
|
|
60
63
|
|
|
61
|
-
def get(url: str) -> tuple[str, str] | None:
|
|
62
|
-
"""Return ``(markdown, source_method)`` for a fresh cache hit, else None."""
|
|
64
|
+
def get(url: str, fmt: str = "markdown") -> tuple[str, str] | None:
|
|
65
|
+
"""Return ``(content, source_method)`` for a fresh cache hit, else None."""
|
|
63
66
|
if not enabled():
|
|
64
67
|
return None
|
|
65
68
|
conn = _conn() # NB: acquires _LOCK itself — must be outside the lock below
|
|
@@ -67,7 +70,7 @@ def get(url: str) -> tuple[str, str] | None:
|
|
|
67
70
|
with _LOCK:
|
|
68
71
|
row = conn.execute(
|
|
69
72
|
"SELECT markdown, source_method, ts FROM cache WHERE url=?",
|
|
70
|
-
(_norm(url),)).fetchone()
|
|
73
|
+
(_norm(url, fmt),)).fetchone()
|
|
71
74
|
except Exception as e:
|
|
72
75
|
logger.warning(f"content_cache: read failed: {e}")
|
|
73
76
|
return None
|
|
@@ -79,7 +82,7 @@ def get(url: str) -> tuple[str, str] | None:
|
|
|
79
82
|
return markdown, source_method
|
|
80
83
|
|
|
81
84
|
|
|
82
|
-
def put(url: str, markdown: str, source_method: str) -> None:
|
|
85
|
+
def put(url: str, markdown: str, source_method: str, fmt: str = "markdown") -> None:
|
|
83
86
|
"""Store a successful scrape. No-op when disabled."""
|
|
84
87
|
if not enabled():
|
|
85
88
|
return
|
|
@@ -88,7 +91,7 @@ def put(url: str, markdown: str, source_method: str) -> None:
|
|
|
88
91
|
with _LOCK:
|
|
89
92
|
conn.execute("INSERT OR REPLACE INTO cache (url, markdown, source_method, ts) "
|
|
90
93
|
"VALUES (?, ?, ?, ?)",
|
|
91
|
-
(_norm(url), markdown, source_method, time.time()))
|
|
94
|
+
(_norm(url, fmt), markdown, source_method, time.time()))
|
|
92
95
|
conn.commit()
|
|
93
96
|
except Exception as e:
|
|
94
97
|
logger.warning(f"content_cache: write failed: {e}")
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Shared content normalization — HTML→Markdown and PDF→text.
|
|
2
|
+
|
|
3
|
+
Ported from musings-by-hermes/scripts/muse_helpers.py (the most mature version):
|
|
4
|
+
strips boilerplate, promotes lazy-loaded images, resolves relative URLs.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import io
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
import re
|
|
12
|
+
import threading
|
|
13
|
+
from contextlib import contextmanager
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
UA = ("Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 "
|
|
18
|
+
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
|
19
|
+
|
|
20
|
+
# ── Output format ───────────────────────────────────────────────────────────
|
|
21
|
+
# Default is markdown (today's behavior, byte-identical). Opt into other shapes
|
|
22
|
+
# globally via SCRAPER_OUTPUT_FORMAT, or per-call via the scrape(fmt=...) /
|
|
23
|
+
# --format / {"format": ...} overrides which set output_format_scope().
|
|
24
|
+
# markdown whole-page markdown (default)
|
|
25
|
+
# markdown_trimmed markdown with extra ad/nav/boilerplate lines removed
|
|
26
|
+
# html raw HTML exactly as fetched (no cleaning)
|
|
27
|
+
# html_selectors cleaned HTML (boilerplate strip + per-domain drop/selector),
|
|
28
|
+
# not converted to markdown
|
|
29
|
+
VALID_FORMATS = ("markdown", "markdown_trimmed", "html", "html_selectors")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _validate(fmt: str | None) -> str:
|
|
33
|
+
f = (fmt or "").strip().lower()
|
|
34
|
+
if f not in VALID_FORMATS:
|
|
35
|
+
logger.warning(f"unknown output format {fmt!r}; using 'markdown' "
|
|
36
|
+
f"(valid: {', '.join(VALID_FORMATS)})")
|
|
37
|
+
return "markdown"
|
|
38
|
+
return f
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
OUTPUT_FORMAT = _validate(os.getenv("SCRAPER_OUTPUT_FORMAT", "markdown"))
|
|
42
|
+
|
|
43
|
+
# Per-thread active-format override. Thread-local by construction (like egress):
|
|
44
|
+
# the orchestrator sets it on the worker thread that also runs the tier fetch +
|
|
45
|
+
# this module's conversion, so concurrent server requests can't bleed formats.
|
|
46
|
+
_scope = threading.local()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@contextmanager
|
|
50
|
+
def output_format_scope(fmt: str | None):
|
|
51
|
+
"""Set the active output format for the enclosed work (per-thread). A falsy
|
|
52
|
+
fmt means 'use the SCRAPER_OUTPUT_FORMAT default'. Always restored on exit."""
|
|
53
|
+
prev = getattr(_scope, "fmt", None)
|
|
54
|
+
_scope.fmt = _validate(fmt) if fmt else None
|
|
55
|
+
try:
|
|
56
|
+
yield
|
|
57
|
+
finally:
|
|
58
|
+
_scope.fmt = prev
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def active_format() -> str:
|
|
62
|
+
"""The format in effect for the current thread: the per-call override if set,
|
|
63
|
+
else the SCRAPER_OUTPUT_FORMAT default."""
|
|
64
|
+
return getattr(_scope, "fmt", None) or OUTPUT_FORMAT
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def output_key(fmt: str) -> str:
|
|
68
|
+
"""The JSON/result key for a format's content family: html-family → "html",
|
|
69
|
+
markdown-family → "markdown". Lets the default path stay {"...","markdown"}."""
|
|
70
|
+
return "html" if fmt.startswith("html") else "markdown"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _clean_html(html: str, base_url: str | None = None) -> str:
|
|
74
|
+
"""Return cleaned HTML: boilerplate stripped, per-domain drop/selector applied
|
|
75
|
+
(see switchback.extract), lazy-load image attrs promoted, relative image/link
|
|
76
|
+
URLs resolved against base_url. On any failure returns `html` unchanged."""
|
|
77
|
+
try:
|
|
78
|
+
from bs4 import BeautifulSoup
|
|
79
|
+
from urllib.parse import urljoin
|
|
80
|
+
|
|
81
|
+
from .extract import prefs_for
|
|
82
|
+
prefs = prefs_for(base_url)
|
|
83
|
+
|
|
84
|
+
soup = BeautifulSoup(html or "", "html.parser")
|
|
85
|
+
for tag in soup(["script", "style", "noscript", "nav", "header",
|
|
86
|
+
"footer", "aside", "form", "iframe"]):
|
|
87
|
+
tag.decompose()
|
|
88
|
+
# Per-domain: remove configured noise, then scope to the content node.
|
|
89
|
+
for sel in prefs.get("drop", []):
|
|
90
|
+
for tag in soup.select(sel):
|
|
91
|
+
tag.decompose()
|
|
92
|
+
selector = prefs.get("selector")
|
|
93
|
+
if selector:
|
|
94
|
+
node = soup.select_one(selector)
|
|
95
|
+
if node is not None:
|
|
96
|
+
soup = BeautifulSoup(str(node), "html.parser")
|
|
97
|
+
else:
|
|
98
|
+
logger.debug(f"extract: selector {selector!r} matched nothing for {base_url}")
|
|
99
|
+
for img in soup.find_all("img"):
|
|
100
|
+
src = (img.get("src") or img.get("data-src")
|
|
101
|
+
or img.get("data-original") or img.get("data-lazy-src"))
|
|
102
|
+
if not src and img.get("srcset"):
|
|
103
|
+
src = img["srcset"].split(",")[0].strip().split(" ")[0]
|
|
104
|
+
if src:
|
|
105
|
+
if base_url:
|
|
106
|
+
src = urljoin(base_url, src)
|
|
107
|
+
img["src"] = src
|
|
108
|
+
if base_url:
|
|
109
|
+
for a in soup.find_all("a", href=True):
|
|
110
|
+
a["href"] = urljoin(base_url, a["href"])
|
|
111
|
+
return str(soup)
|
|
112
|
+
except Exception as e:
|
|
113
|
+
logger.debug(f"soup pre-clean skipped: {e}")
|
|
114
|
+
return html
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# Lines that markdown_trimmed drops: standalone images, link-only/nav rows, and
|
|
118
|
+
# short promotional boilerplate. Conservative on purpose — prose is never touched.
|
|
119
|
+
_TRIM_IMG_RE = re.compile(r"^!\[[^\]]*\]\([^)]*\)$")
|
|
120
|
+
_TRIM_LINKS_ONLY_RE = re.compile(r"^(?:[-*>]\s*)?(?:\[[^\]]*\]\([^)]*\)[\s|·•\-–—]*)+$")
|
|
121
|
+
_TRIM_BOILERPLATE_RE = re.compile(
|
|
122
|
+
r"^(subscribe|sign\s*up|sign\s*in|log\s*in|logout|newsletter|advertisement|"
|
|
123
|
+
r"accept\s+all|cookie|follow\s+us|share\s+this)\b", re.I)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _trim_markdown(md: str) -> str:
|
|
127
|
+
"""Markdown minus common ad/nav/boilerplate noise. Drops only standalone-image
|
|
128
|
+
lines, link-only/nav rows, and short promotional boilerplate lines; keeps all
|
|
129
|
+
prose. Collapses 3+ blank lines to one."""
|
|
130
|
+
kept: list[str] = []
|
|
131
|
+
for line in md.splitlines():
|
|
132
|
+
s = line.strip()
|
|
133
|
+
if not s:
|
|
134
|
+
kept.append("")
|
|
135
|
+
continue
|
|
136
|
+
if _TRIM_IMG_RE.match(s) or _TRIM_LINKS_ONLY_RE.match(s):
|
|
137
|
+
continue
|
|
138
|
+
if len(s) <= 60 and _TRIM_BOILERPLATE_RE.match(s):
|
|
139
|
+
continue
|
|
140
|
+
kept.append(line)
|
|
141
|
+
return re.sub(r"\n{3,}", "\n\n", "\n".join(kept)).strip()
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def render(html: str, base_url: str | None = None, fmt: str | None = None) -> str:
|
|
145
|
+
"""Render fetched HTML in `fmt` (default: the active output format).
|
|
146
|
+
|
|
147
|
+
- markdown whole-page markdown (boilerplate strip + per-domain prefs)
|
|
148
|
+
- markdown_trimmed markdown with extra ad/nav/boilerplate lines removed
|
|
149
|
+
- html the raw HTML, untouched
|
|
150
|
+
- html_selectors cleaned HTML (per-domain prefs applied), not converted
|
|
151
|
+
"""
|
|
152
|
+
fmt = _validate(fmt) if fmt is not None else active_format()
|
|
153
|
+
if fmt == "html":
|
|
154
|
+
return html or ""
|
|
155
|
+
if fmt == "html_selectors":
|
|
156
|
+
return (_clean_html(html, base_url) or "").strip()
|
|
157
|
+
try:
|
|
158
|
+
from markdownify import markdownify
|
|
159
|
+
cleaned = _clean_html(html, base_url)
|
|
160
|
+
md = (markdownify(cleaned, heading_style="ATX", code_language="",
|
|
161
|
+
bullets="-", strip=["script", "style"]) or "").strip()
|
|
162
|
+
return _trim_markdown(md) if fmt == "markdown_trimmed" else md
|
|
163
|
+
except Exception as e:
|
|
164
|
+
logger.warning(f"markdownify failed: {e}")
|
|
165
|
+
return (html or "").strip()
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def html_to_markdown(html: str, base_url: str | None = None) -> str:
|
|
169
|
+
"""Render `html` in the active output format (default markdown). Name kept for
|
|
170
|
+
back-compat: every tier calls this, so it automatically honors the selected
|
|
171
|
+
SCRAPER_OUTPUT_FORMAT / per-call format with no per-tier changes."""
|
|
172
|
+
return render(html, base_url)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def pdf_bytes_to_text(data: bytes) -> str:
|
|
176
|
+
"""Extract text from PDF bytes. In-memory only — nothing written to disk."""
|
|
177
|
+
from pypdf import PdfReader
|
|
178
|
+
buf = io.BytesIO(data)
|
|
179
|
+
try:
|
|
180
|
+
reader = PdfReader(buf)
|
|
181
|
+
return "\n\n".join((p.extract_text() or "") for p in reader.pages).strip()
|
|
182
|
+
finally:
|
|
183
|
+
buf.close()
|
|
@@ -17,6 +17,7 @@ import time
|
|
|
17
17
|
from dataclasses import dataclass, field
|
|
18
18
|
|
|
19
19
|
from . import content_cache, egress, session_cache
|
|
20
|
+
from .normalize import active_format, output_format_scope
|
|
20
21
|
from .policy import botwall
|
|
21
22
|
from .policy.gates import BotWall, RateLimited, ShortContent, classify_error, host_of
|
|
22
23
|
from .tiers import TIERS, INDEX
|
|
@@ -68,8 +69,9 @@ _FAILURE_PRIORITY = {
|
|
|
68
69
|
@dataclass
|
|
69
70
|
class ScrapeResult:
|
|
70
71
|
url: str
|
|
71
|
-
markdown: str
|
|
72
|
+
markdown: str # the rendered content (format named by `format`)
|
|
72
73
|
source_method: str # tier NAME that won
|
|
74
|
+
format: str = "markdown" # markdown | markdown_trimmed | html | html_selectors
|
|
73
75
|
|
|
74
76
|
|
|
75
77
|
@dataclass
|
|
@@ -95,6 +97,7 @@ class ScrapeOutcome:
|
|
|
95
97
|
latency_ms: int | None = None
|
|
96
98
|
egress: str = "direct" # "egress" if routed via SCRAPER_EGRESS_PROXY, else "direct"
|
|
97
99
|
wire_bytes: int = 0 # bytes transferred over the network (cost basis for proxy GB)
|
|
100
|
+
format: str = "markdown" # output format of `markdown` (the content field)
|
|
98
101
|
attempts: list[TierAttempt] = field(default_factory=list)
|
|
99
102
|
|
|
100
103
|
|
|
@@ -180,7 +183,7 @@ def _run_one(url: str, db: dict) -> ScrapeOutcome:
|
|
|
180
183
|
if botwall.is_url_skipped(url, db):
|
|
181
184
|
return _skipped(url, root, "url_excluded",
|
|
182
185
|
db.get("urls", {}).get(url, {}).get("reason", ""))
|
|
183
|
-
hit = content_cache.get(url)
|
|
186
|
+
hit = content_cache.get(url, active_format())
|
|
184
187
|
if hit:
|
|
185
188
|
md, method = hit
|
|
186
189
|
root.set(Attr.OUTCOME, "cache_hit")
|
|
@@ -188,7 +191,7 @@ def _run_one(url: str, db: dict) -> ScrapeOutcome:
|
|
|
188
191
|
root.set(Attr.MD_LEN, len(md))
|
|
189
192
|
logger.info(f"cache_hit {url} (was {method})")
|
|
190
193
|
return ScrapeOutcome(url, True, markdown=md, source_method=method,
|
|
191
|
-
final_outcome="ok")
|
|
194
|
+
final_outcome="ok", format=active_format())
|
|
192
195
|
# A needs_egress host runs the whole cascade in the egress scope, so the
|
|
193
196
|
# tiers route through SCRAPER_EGRESS_PROXY (when set); easy hosts stay
|
|
194
197
|
# direct and never spend residential bandwidth.
|
|
@@ -300,7 +303,7 @@ def _run_cascade(url, host, db, root, t0, deadline) -> ScrapeOutcome:
|
|
|
300
303
|
sp.set(Attr.SOURCE, tier.NAME)
|
|
301
304
|
sp.set(Attr.LATENCY_MS, dt)
|
|
302
305
|
botwall.record(db, url, tier.NAME, "ok", md_len=len(md), latency_ms=dt)
|
|
303
|
-
content_cache.put(url, md, tier.NAME)
|
|
306
|
+
content_cache.put(url, md, tier.NAME, active_format())
|
|
304
307
|
root.set(Attr.OUTCOME, "ok")
|
|
305
308
|
root.set(Attr.SOURCE, tier.NAME)
|
|
306
309
|
root.set(Attr.LATENCY_MS, total)
|
|
@@ -308,7 +311,8 @@ def _run_cascade(url, host, db, root, t0, deadline) -> ScrapeOutcome:
|
|
|
308
311
|
logger.info(
|
|
309
312
|
f"{tier.NAME} OK {url} md_len={len(md)} {dt}ms (total {total}ms)")
|
|
310
313
|
return ScrapeOutcome(url, True, markdown=md, source_method=tier.NAME,
|
|
311
|
-
final_outcome="ok", latency_ms=total,
|
|
314
|
+
final_outcome="ok", latency_ms=total,
|
|
315
|
+
format=active_format(), attempts=attempts)
|
|
312
316
|
|
|
313
317
|
total = int((time.monotonic() - t0) * 1000)
|
|
314
318
|
ec, sc = _dominant_failure(attempts)
|
|
@@ -323,21 +327,24 @@ def _run_cascade(url, host, db, root, t0, deadline) -> ScrapeOutcome:
|
|
|
323
327
|
latency_ms=total, attempts=attempts)
|
|
324
328
|
|
|
325
329
|
|
|
326
|
-
def run_detailed(urls: list[str]) -> list[ScrapeOutcome]:
|
|
330
|
+
def run_detailed(urls: list[str], fmt: str | None = None) -> list[ScrapeOutcome]:
|
|
327
331
|
"""Scrape each URL; return a full ScrapeOutcome (success or failure with the
|
|
328
|
-
per-tier cascade and a classified reason) for every URL.
|
|
332
|
+
per-tier cascade and a classified reason) for every URL.
|
|
333
|
+
|
|
334
|
+
fmt overrides SCRAPER_OUTPUT_FORMAT for this call (None = use the default)."""
|
|
329
335
|
db = botwall.load_db()
|
|
330
336
|
out = []
|
|
331
337
|
try:
|
|
332
|
-
|
|
333
|
-
|
|
338
|
+
with output_format_scope(fmt):
|
|
339
|
+
for url in urls:
|
|
340
|
+
out.append(_run_one(url, db))
|
|
334
341
|
finally:
|
|
335
342
|
botwall.save_db(db)
|
|
336
343
|
flush()
|
|
337
344
|
return out
|
|
338
345
|
|
|
339
346
|
|
|
340
|
-
def run(urls: list[str]) -> list[ScrapeResult]:
|
|
347
|
+
def run(urls: list[str], fmt: str | None = None) -> list[ScrapeResult]:
|
|
341
348
|
"""Successes only (backward-compatible). Use run_detailed() for failures."""
|
|
342
|
-
return [ScrapeResult(o.url, o.markdown, o.source_method)
|
|
343
|
-
for o in run_detailed(urls) if o.ok]
|
|
349
|
+
return [ScrapeResult(o.url, o.markdown, o.source_method, o.format)
|
|
350
|
+
for o in run_detailed(urls, fmt) if o.ok]
|
|
@@ -25,15 +25,17 @@ from pydantic import BaseModel
|
|
|
25
25
|
|
|
26
26
|
from . import session_trace
|
|
27
27
|
from .api import scrape
|
|
28
|
+
from .normalize import output_key
|
|
28
29
|
from .reporting import build_report, domain_report
|
|
29
30
|
from .search import search
|
|
30
31
|
from .tracing import setup_logs
|
|
31
32
|
|
|
32
|
-
app = FastAPI(title="switchback", version="0.1.0")
|
|
33
|
+
app = FastAPI(title="switchback", version="0.2.0")
|
|
33
34
|
|
|
34
35
|
|
|
35
36
|
class ScrapeRequest(BaseModel):
|
|
36
37
|
urls: list[str]
|
|
38
|
+
format: str | None = None # markdown (default) | markdown_trimmed | html | html_selectors
|
|
37
39
|
|
|
38
40
|
|
|
39
41
|
@app.get("/healthz")
|
|
@@ -43,9 +45,12 @@ def healthz() -> dict:
|
|
|
43
45
|
|
|
44
46
|
@app.post("/scrape")
|
|
45
47
|
def scrape_endpoint(req: ScrapeRequest) -> list[dict]:
|
|
46
|
-
"""Run URLs through the cascade. Returns successes only (failed URLs omitted).
|
|
47
|
-
|
|
48
|
-
|
|
48
|
+
"""Run URLs through the cascade. Returns successes only (failed URLs omitted).
|
|
49
|
+
Optional "format" selects the output shape; the content key is "markdown" for
|
|
50
|
+
markdown formats and "html" for html formats."""
|
|
51
|
+
return [{"url": r.url, "source_method": r.source_method,
|
|
52
|
+
output_key(r.format): r.markdown}
|
|
53
|
+
for r in scrape(req.urls, fmt=req.format)]
|
|
49
54
|
|
|
50
55
|
|
|
51
56
|
@app.get("/search")
|
|
@@ -9,6 +9,7 @@ from __future__ import annotations
|
|
|
9
9
|
import os
|
|
10
10
|
import threading
|
|
11
11
|
|
|
12
|
+
from ..normalize import active_format, render
|
|
12
13
|
from ..policy.gates import check
|
|
13
14
|
|
|
14
15
|
NAME = "tier4_firecrawl"
|
|
@@ -19,12 +20,18 @@ def disabled() -> bool:
|
|
|
19
20
|
return bool(os.getenv("SCRAPER_DISABLE_FIRECRAWL"))
|
|
20
21
|
|
|
21
22
|
|
|
22
|
-
def _scrape(url: str) -> str:
|
|
23
|
+
def _scrape(url: str, fmt: str) -> str:
|
|
23
24
|
from firecrawl import Firecrawl
|
|
24
25
|
app = Firecrawl(api_key=os.environ["FIRECRAWL_API_KEY"])
|
|
25
|
-
|
|
26
|
+
if fmt == "markdown":
|
|
27
|
+
doc = app.scrape(url, formats=["markdown"])
|
|
28
|
+
d = doc.model_dump() if hasattr(doc, "model_dump") else (doc if isinstance(doc, dict) else {})
|
|
29
|
+
return check(url, (d.get("markdown") or "").strip())
|
|
30
|
+
# Non-default formats: fetch HTML and derive every shape through normalize, so
|
|
31
|
+
# html / html_selectors / markdown_trimmed match the rest of the cascade.
|
|
32
|
+
doc = app.scrape(url, formats=["html"])
|
|
26
33
|
d = doc.model_dump() if hasattr(doc, "model_dump") else (doc if isinstance(doc, dict) else {})
|
|
27
|
-
return check(url, (d.get("markdown") or "").strip())
|
|
34
|
+
return check(url, render(d.get("html") or "", base_url=url, fmt=fmt))
|
|
28
35
|
|
|
29
36
|
|
|
30
37
|
def fetch(url: str) -> str:
|
|
@@ -32,11 +39,13 @@ def fetch(url: str) -> str:
|
|
|
32
39
|
# the calling thread, which then makes a later sync-Playwright browser tier in
|
|
33
40
|
# the same batch raise "Sync API inside the asyncio loop". A worker thread
|
|
34
41
|
# confines that loop so the browser tiers stay usable across a multi-URL run.
|
|
42
|
+
# active_format() is thread-local, so read it here (main thread) and pass it in.
|
|
35
43
|
box: dict = {}
|
|
44
|
+
fmt = active_format()
|
|
36
45
|
|
|
37
46
|
def work():
|
|
38
47
|
try:
|
|
39
|
-
box["md"] = _scrape(url)
|
|
48
|
+
box["md"] = _scrape(url, fmt)
|
|
40
49
|
except BaseException as e: # noqa: BLE001 — re-raised to the caller below
|
|
41
50
|
box["err"] = e
|
|
42
51
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: switchback
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
|
|
5
5
|
Author-email: Akash Kodavuru <akash@theaklabs.com>
|
|
6
6
|
License: MIT
|
|
@@ -75,8 +75,8 @@ Dynamic: license-file
|
|
|
75
75
|
Give it a URL; it tries the cheapest way to get clean Markdown first and only escalates
|
|
76
76
|
to a heavier (slower, costlier) tier when the cheap one is walled. Stops at the first success.
|
|
77
77
|
|
|
78
|
-
[](https://pypi.org/project/switchback/)
|
|
79
|
+
[](https://pypi.org/project/switchback/)
|
|
80
80
|
[](LICENSE)
|
|
81
81
|
[](https://github.com/akash-kr/switchback/actions/workflows/ci.yml)
|
|
82
82
|
|
|
@@ -269,6 +269,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
|
|
|
269
269
|
<details>
|
|
270
270
|
<summary><b>Tunables</b> — budgets, timeouts, caches, backoff</summary>
|
|
271
271
|
|
|
272
|
+
- `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
|
|
272
273
|
- `SCRAPER_DEADLINE_S` — per-URL budget (45s)
|
|
273
274
|
- `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
|
|
274
275
|
- `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
|
|
@@ -297,6 +298,34 @@ trace zip to `state/traces/`. Manage them over HTTP — `GET /traces` (list),
|
|
|
297
298
|
`GET /traces/{id}` (download), `DELETE /traces/{id}` — and open one with
|
|
298
299
|
`playwright show-trace <zip>`. Off by default (traces are MBs each).
|
|
299
300
|
|
|
301
|
+
### Output formats
|
|
302
|
+
Markdown is the default and is unchanged. Pick a different shape globally with
|
|
303
|
+
`SCRAPER_OUTPUT_FORMAT`, or per call:
|
|
304
|
+
|
|
305
|
+
```python
|
|
306
|
+
from switchback import scrape
|
|
307
|
+
scrape(["https://example.com/article"]) # markdown (default)
|
|
308
|
+
scrape(["https://example.com/article"], fmt="html") # raw HTML
|
|
309
|
+
scrape(["https://example.com/article"], fmt="markdown_trimmed")
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
```bash
|
|
313
|
+
switchback --format html_selectors https://example.com/article
|
|
314
|
+
curl -s localhost:8799/scrape -d '{"urls":["https://example.com"],"format":"html"}'
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
| format | what you get |
|
|
318
|
+
| --- | --- |
|
|
319
|
+
| `markdown` | whole-page markdown (boilerplate stripped + per-domain prefs) — **default** |
|
|
320
|
+
| `markdown_trimmed` | markdown with extra ad/nav/boilerplate lines removed |
|
|
321
|
+
| `html` | the raw HTML exactly as fetched, untouched |
|
|
322
|
+
| `html_selectors` | cleaned HTML (boilerplate strip + per-domain `drop`/`selector`), not converted |
|
|
323
|
+
|
|
324
|
+
The chosen content rides in the result's `markdown` field; in the CLI/server JSON
|
|
325
|
+
the key is `markdown` for markdown formats and `html` for html formats. The
|
|
326
|
+
API/PDF tiers (arXiv synth, PDF→text) have no HTML, so html formats fall back to
|
|
327
|
+
their text for those sources.
|
|
328
|
+
|
|
300
329
|
### Per-domain extraction
|
|
301
330
|
Markdown of the whole page is the default. To scope a site to its content node or
|
|
302
331
|
strip site-specific noise, declare prefs per host in `config/extraction.json`
|
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
"""Shared content normalization — HTML→Markdown and PDF→text.
|
|
2
|
-
|
|
3
|
-
Ported from musings-by-hermes/scripts/muse_helpers.py (the most mature version):
|
|
4
|
-
strips boilerplate, promotes lazy-loaded images, resolves relative URLs.
|
|
5
|
-
"""
|
|
6
|
-
from __future__ import annotations
|
|
7
|
-
|
|
8
|
-
import io
|
|
9
|
-
import logging
|
|
10
|
-
|
|
11
|
-
logger = logging.getLogger(__name__)
|
|
12
|
-
|
|
13
|
-
UA = ("Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 "
|
|
14
|
-
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def html_to_markdown(html: str, base_url: str | None = None) -> str:
|
|
18
|
-
"""HTML → Markdown, preserving images/blockquotes/code.
|
|
19
|
-
|
|
20
|
-
- Strips script/style/nav/header/footer/aside boilerplate.
|
|
21
|
-
- Applies any per-domain extraction prefs (scope selector / extra drops),
|
|
22
|
-
see switchback.extract.
|
|
23
|
-
- Promotes lazy-load attrs (data-src, data-original, srcset) to src.
|
|
24
|
-
- Resolves relative image/link URLs against base_url.
|
|
25
|
-
"""
|
|
26
|
-
try:
|
|
27
|
-
from markdownify import markdownify
|
|
28
|
-
try:
|
|
29
|
-
from bs4 import BeautifulSoup
|
|
30
|
-
from urllib.parse import urljoin
|
|
31
|
-
|
|
32
|
-
from .extract import prefs_for
|
|
33
|
-
prefs = prefs_for(base_url)
|
|
34
|
-
|
|
35
|
-
soup = BeautifulSoup(html or "", "html.parser")
|
|
36
|
-
for tag in soup(["script", "style", "noscript", "nav", "header",
|
|
37
|
-
"footer", "aside", "form", "iframe"]):
|
|
38
|
-
tag.decompose()
|
|
39
|
-
# Per-domain: remove configured noise, then scope to the content node.
|
|
40
|
-
for sel in prefs.get("drop", []):
|
|
41
|
-
for tag in soup.select(sel):
|
|
42
|
-
tag.decompose()
|
|
43
|
-
selector = prefs.get("selector")
|
|
44
|
-
if selector:
|
|
45
|
-
node = soup.select_one(selector)
|
|
46
|
-
if node is not None:
|
|
47
|
-
soup = BeautifulSoup(str(node), "html.parser")
|
|
48
|
-
else:
|
|
49
|
-
logger.debug(f"extract: selector {selector!r} matched nothing for {base_url}")
|
|
50
|
-
for img in soup.find_all("img"):
|
|
51
|
-
src = (img.get("src") or img.get("data-src")
|
|
52
|
-
or img.get("data-original") or img.get("data-lazy-src"))
|
|
53
|
-
if not src and img.get("srcset"):
|
|
54
|
-
src = img["srcset"].split(",")[0].strip().split(" ")[0]
|
|
55
|
-
if src:
|
|
56
|
-
if base_url:
|
|
57
|
-
src = urljoin(base_url, src)
|
|
58
|
-
img["src"] = src
|
|
59
|
-
if base_url:
|
|
60
|
-
for a in soup.find_all("a", href=True):
|
|
61
|
-
a["href"] = urljoin(base_url, a["href"])
|
|
62
|
-
html = str(soup)
|
|
63
|
-
except Exception as e:
|
|
64
|
-
logger.debug(f"soup pre-clean skipped: {e}")
|
|
65
|
-
md = markdownify(html, heading_style="ATX", code_language="",
|
|
66
|
-
bullets="-", strip=["script", "style"])
|
|
67
|
-
return (md or "").strip()
|
|
68
|
-
except Exception as e:
|
|
69
|
-
logger.warning(f"markdownify failed: {e}")
|
|
70
|
-
return (html or "").strip()
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def pdf_bytes_to_text(data: bytes) -> str:
|
|
74
|
-
"""Extract text from PDF bytes. In-memory only — nothing written to disk."""
|
|
75
|
-
from pypdf import PdfReader
|
|
76
|
-
buf = io.BytesIO(data)
|
|
77
|
-
try:
|
|
78
|
-
reader = PdfReader(buf)
|
|
79
|
-
return "\n\n".join((p.extract_text() or "") for p in reader.pages).strip()
|
|
80
|
-
finally:
|
|
81
|
-
buf.close()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|