switchback 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. {switchback-0.1.0 → switchback-0.2.0}/.env.example +13 -0
  2. {switchback-0.1.0 → switchback-0.2.0}/CHANGELOG.md +12 -0
  3. {switchback-0.1.0 → switchback-0.2.0}/PKG-INFO +32 -3
  4. {switchback-0.1.0 → switchback-0.2.0}/README.md +31 -2
  5. {switchback-0.1.0 → switchback-0.2.0}/clients/python_client.py +12 -6
  6. {switchback-0.1.0 → switchback-0.2.0}/pyproject.toml +1 -1
  7. {switchback-0.1.0 → switchback-0.2.0}/switchback/api.py +29 -9
  8. {switchback-0.1.0 → switchback-0.2.0}/switchback/content_cache.py +12 -9
  9. switchback-0.2.0/switchback/normalize.py +183 -0
  10. {switchback-0.1.0 → switchback-0.2.0}/switchback/orchestrator.py +19 -12
  11. {switchback-0.1.0 → switchback-0.2.0}/switchback/server.py +9 -4
  12. {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/tier4_firecrawl.py +13 -4
  13. {switchback-0.1.0 → switchback-0.2.0}/switchback.egg-info/PKG-INFO +32 -3
  14. switchback-0.1.0/switchback/normalize.py +0 -81
  15. {switchback-0.1.0 → switchback-0.2.0}/CONTRIBUTING.md +0 -0
  16. {switchback-0.1.0 → switchback-0.2.0}/LICENSE +0 -0
  17. {switchback-0.1.0 → switchback-0.2.0}/MANIFEST.in +0 -0
  18. {switchback-0.1.0 → switchback-0.2.0}/NOTICE +0 -0
  19. {switchback-0.1.0 → switchback-0.2.0}/SECURITY.md +0 -0
  20. {switchback-0.1.0 → switchback-0.2.0}/clients/node_bridge.md +0 -0
  21. {switchback-0.1.0 → switchback-0.2.0}/config/botwall_skip_urls.txt +0 -0
  22. {switchback-0.1.0 → switchback-0.2.0}/config/extraction.example.json +0 -0
  23. {switchback-0.1.0 → switchback-0.2.0}/setup.cfg +0 -0
  24. {switchback-0.1.0 → switchback-0.2.0}/switchback/__init__.py +0 -0
  25. {switchback-0.1.0 → switchback-0.2.0}/switchback/__main__.py +0 -0
  26. {switchback-0.1.0 → switchback-0.2.0}/switchback/concurrency.py +0 -0
  27. {switchback-0.1.0 → switchback-0.2.0}/switchback/egress.py +0 -0
  28. {switchback-0.1.0 → switchback-0.2.0}/switchback/extract.py +0 -0
  29. {switchback-0.1.0 → switchback-0.2.0}/switchback/flags.py +0 -0
  30. {switchback-0.1.0 → switchback-0.2.0}/switchback/policy/__init__.py +0 -0
  31. {switchback-0.1.0 → switchback-0.2.0}/switchback/policy/botwall.py +0 -0
  32. {switchback-0.1.0 → switchback-0.2.0}/switchback/policy/gates.py +0 -0
  33. {switchback-0.1.0 → switchback-0.2.0}/switchback/py.typed +0 -0
  34. {switchback-0.1.0 → switchback-0.2.0}/switchback/reporting.py +0 -0
  35. {switchback-0.1.0 → switchback-0.2.0}/switchback/search.py +0 -0
  36. {switchback-0.1.0 → switchback-0.2.0}/switchback/session_cache.py +0 -0
  37. {switchback-0.1.0 → switchback-0.2.0}/switchback/session_trace.py +0 -0
  38. {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/__init__.py +0 -0
  39. {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/_browser.py +0 -0
  40. {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/tier0_apis.py +0 -0
  41. {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/tier1_http.py +0 -0
  42. {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/tier2_cloudscraper.py +0 -0
  43. {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/tier3_browser.py +0 -0
  44. {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/tier3b_camoufox.py +0 -0
  45. {switchback-0.1.0 → switchback-0.2.0}/switchback/tiers/tier_residential.py +0 -0
  46. {switchback-0.1.0 → switchback-0.2.0}/switchback/tracing.py +0 -0
  47. {switchback-0.1.0 → switchback-0.2.0}/switchback.egg-info/SOURCES.txt +0 -0
  48. {switchback-0.1.0 → switchback-0.2.0}/switchback.egg-info/dependency_links.txt +0 -0
  49. {switchback-0.1.0 → switchback-0.2.0}/switchback.egg-info/entry_points.txt +0 -0
  50. {switchback-0.1.0 → switchback-0.2.0}/switchback.egg-info/requires.txt +0 -0
  51. {switchback-0.1.0 → switchback-0.2.0}/switchback.egg-info/top_level.txt +0 -0
@@ -14,6 +14,19 @@ OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
14
14
  # ── Search (Tier-0 SearXNG, query → URLs) ───────────────────────────────────
15
15
  SEARXNG_URL=http://localhost:8888
16
16
 
17
+ # ── Output format ───────────────────────────────────────────────────────────
18
+ # Shape of the scraped content. Default markdown is byte-identical to before;
19
+ # override per-call with scrape(fmt=...), the CLI --format flag, or the /scrape
20
+ # {"format": ...} field. html-family results land under an "html" key (instead of
21
+ # "markdown") in the CLI/server JSON.
22
+ # markdown whole-page markdown (default)
23
+ # markdown_trimmed markdown with extra ad/nav/boilerplate lines removed
24
+ # html raw HTML exactly as fetched (no cleaning)
25
+ # html_selectors cleaned HTML (boilerplate strip + per-domain drop/selector)
26
+ # Note: the API/PDF tiers (arXiv synth, PDF→text) have no HTML, so html formats
27
+ # fall back to their text for those sources.
28
+ SCRAPER_OUTPUT_FORMAT=markdown
29
+
17
30
  # ── Tier 2.5 · Jina Reader (r.jina.ai) ──────────────────────────────────────
18
31
  # Optional: keyless works at 20 RPM. A key gives 500 RPM + a 10M-token grant.
19
32
  JINA_API_KEY=
@@ -6,6 +6,18 @@ versioning while pre-1.0.
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.2.0] - 2026-06-25
10
+
11
+ ### Added
12
+ - **Selectable output formats** — `SCRAPER_OUTPUT_FORMAT` (or per-call
13
+ `scrape(fmt=...)`, CLI `--format`, `/scrape` `{"format": ...}`) selects the
14
+ content shape: `markdown` (default, unchanged), `markdown_trimmed` (extra
15
+ ad/nav/boilerplate removed), `html` (raw), or `html_selectors` (cleaned HTML
16
+ with per-domain `drop`/`selector` applied). Default output is byte-identical;
17
+ html-family results use an `html` JSON key instead of `markdown`.
18
+
19
+ ## [0.1.0] - 2026-06-23
20
+
9
21
  ### Added
10
22
  - **Challenge-type learning** — bot-walls are classified by vendor (Cloudflare,
11
23
  DataDome, Akamai, PerimeterX, Incapsula, Google) and counted per host in the
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: switchback
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
5
5
  Author-email: Akash Kodavuru <akash@theaklabs.com>
6
6
  License: MIT
@@ -75,8 +75,8 @@ Dynamic: license-file
75
75
  Give it a URL; it tries the cheapest way to get clean Markdown first and only escalates
76
76
  to a heavier (slower, costlier) tier when the cheap one is walled. Stops at the first success.
77
77
 
78
- [![PyPI](https://img.shields.io/pypi/v/switchback.svg)](https://pypi.org/project/switchback/)
79
- [![Python](https://img.shields.io/pypi/pyversions/switchback.svg)](https://pypi.org/project/switchback/)
78
+ [![PyPI](https://img.shields.io/pypi/v/switchback)](https://pypi.org/project/switchback/)
79
+ [![Python](https://img.shields.io/pypi/pyversions/switchback)](https://pypi.org/project/switchback/)
80
80
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
81
81
  [![CI](https://github.com/akash-kr/switchback/actions/workflows/ci.yml/badge.svg)](https://github.com/akash-kr/switchback/actions/workflows/ci.yml)
82
82
 
@@ -269,6 +269,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
269
269
  <details>
270
270
  <summary><b>Tunables</b> — budgets, timeouts, caches, backoff</summary>
271
271
 
272
+ - `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
272
273
  - `SCRAPER_DEADLINE_S` — per-URL budget (45s)
273
274
  - `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
274
275
  - `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
@@ -297,6 +298,34 @@ trace zip to `state/traces/`. Manage them over HTTP — `GET /traces` (list),
297
298
  `GET /traces/{id}` (download), `DELETE /traces/{id}` — and open one with
298
299
  `playwright show-trace <zip>`. Off by default (traces are MBs each).
299
300
 
301
+ ### Output formats
302
+ Markdown is the default and is unchanged. Pick a different shape globally with
303
+ `SCRAPER_OUTPUT_FORMAT`, or per call:
304
+
305
+ ```python
306
+ from switchback import scrape
307
+ scrape(["https://example.com/article"]) # markdown (default)
308
+ scrape(["https://example.com/article"], fmt="html") # raw HTML
309
+ scrape(["https://example.com/article"], fmt="markdown_trimmed")
310
+ ```
311
+
312
+ ```bash
313
+ switchback --format html_selectors https://example.com/article
314
+ curl -s localhost:8799/scrape -d '{"urls":["https://example.com"],"format":"html"}'
315
+ ```
316
+
317
+ | format | what you get |
318
+ | --- | --- |
319
+ | `markdown` | whole-page markdown (boilerplate stripped + per-domain prefs) — **default** |
320
+ | `markdown_trimmed` | markdown with extra ad/nav/boilerplate lines removed |
321
+ | `html` | the raw HTML exactly as fetched, untouched |
322
+ | `html_selectors` | cleaned HTML (boilerplate strip + per-domain `drop`/`selector`), not converted |
323
+
324
+ In the Python API the chosen content is always in the result's `markdown` field
325
+ (its `format` field names the shape); in the CLI/server JSON the key is `markdown`
326
+ for markdown formats and `html` for html formats. The API/PDF tiers (arXiv synth,
327
+ PDF→text) have no HTML, so html formats fall back to their text for those sources.
328
+
300
329
  ### Per-domain extraction
301
330
  Markdown of the whole page is the default. To scope a site to its content node or
302
331
  strip site-specific noise, declare prefs per host in `config/extraction.json`
@@ -16,8 +16,8 @@
16
16
  Give it a URL; it tries the cheapest way to get clean Markdown first and only escalates
17
17
  to a heavier (slower, costlier) tier when the cheap one is walled. Stops at the first success.
18
18
 
19
- [![PyPI](https://img.shields.io/pypi/v/switchback.svg)](https://pypi.org/project/switchback/)
20
- [![Python](https://img.shields.io/pypi/pyversions/switchback.svg)](https://pypi.org/project/switchback/)
19
+ [![PyPI](https://img.shields.io/pypi/v/switchback)](https://pypi.org/project/switchback/)
20
+ [![Python](https://img.shields.io/pypi/pyversions/switchback)](https://pypi.org/project/switchback/)
21
21
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
22
22
  [![CI](https://github.com/akash-kr/switchback/actions/workflows/ci.yml/badge.svg)](https://github.com/akash-kr/switchback/actions/workflows/ci.yml)
23
23
 
@@ -210,6 +210,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
210
210
  <details>
211
211
  <summary><b>Tunables</b> — budgets, timeouts, caches, backoff</summary>
212
212
 
213
+ - `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
213
214
  - `SCRAPER_DEADLINE_S` — per-URL budget (45s)
214
215
  - `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
215
216
  - `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
@@ -238,6 +239,34 @@ trace zip to `state/traces/`. Manage them over HTTP — `GET /traces` (list),
238
239
  `GET /traces/{id}` (download), `DELETE /traces/{id}` — and open one with
239
240
  `playwright show-trace <zip>`. Off by default (traces are MBs each).
240
241
 
242
+ ### Output formats
243
+ Markdown is the default and is unchanged. Pick a different shape globally with
244
+ `SCRAPER_OUTPUT_FORMAT`, or per call:
245
+
246
+ ```python
247
+ from switchback import scrape
248
+ scrape(["https://example.com/article"]) # markdown (default)
249
+ scrape(["https://example.com/article"], fmt="html") # raw HTML
250
+ scrape(["https://example.com/article"], fmt="markdown_trimmed")
251
+ ```
252
+
253
+ ```bash
254
+ switchback --format html_selectors https://example.com/article
255
+ curl -s localhost:8799/scrape -d '{"urls":["https://example.com"],"format":"html"}'
256
+ ```
257
+
258
+ | format | what you get |
259
+ | --- | --- |
260
+ | `markdown` | whole-page markdown (boilerplate stripped + per-domain prefs) — **default** |
261
+ | `markdown_trimmed` | markdown with extra ad/nav/boilerplate lines removed |
262
+ | `html` | the raw HTML exactly as fetched, untouched |
263
+ | `html_selectors` | cleaned HTML (boilerplate strip + per-domain `drop`/`selector`), not converted |
264
+
265
+ In the Python API the chosen content is always in the result's `markdown` field
266
+ (its `format` field names the shape); in the CLI/server JSON the key is `markdown`
267
+ for markdown formats and `html` for html formats. The API/PDF tiers (arXiv synth,
268
+ PDF→text) have no HTML, so html formats fall back to their text for those sources.
269
+
241
270
  ### Per-domain extraction
242
271
  Markdown of the whole page is the default. To scope a site to its content node or
243
272
  strip site-specific noise, declare prefs per host in `config/extraction.json`
@@ -59,9 +59,10 @@ def _service_up() -> bool:
59
59
  return False
60
60
 
61
61
 
62
- def _cli_scrape(urls: list[str]) -> list[dict]:
62
+ def _cli_scrape(urls: list[str], fmt: str | None = None) -> list[dict]:
63
+ flag = ["--format", fmt] if fmt else []
63
64
  proc = subprocess.run(
64
- [sys.executable, "-m", "switchback", *urls],
65
+ [sys.executable, "-m", "switchback", *flag, *urls],
65
66
  cwd=ENGINE_DIR, capture_output=True, text=True,
66
67
  )
67
68
  if proc.returncode not in (0, 1): # 1 == "no successes", still valid JSON ([])
@@ -69,15 +70,20 @@ def _cli_scrape(urls: list[str]) -> list[dict]:
69
70
  return json.loads(proc.stdout or "[]")
70
71
 
71
72
 
72
- def scrape(urls: str | list[str]) -> list[dict]:
73
- """Scrape one or many URLs through the engine cascade. Successes only."""
73
+ def scrape(urls: str | list[str], fmt: str | None = None) -> list[dict]:
74
+ """Scrape one or many URLs through the engine cascade. Successes only.
75
+
76
+ fmt selects the output format (markdown | markdown_trimmed | html |
77
+ html_selectors); None uses the engine default (markdown). For html formats the
78
+ content lands under a "html" key instead of "markdown"."""
74
79
  if isinstance(urls, str):
75
80
  urls = [urls]
76
81
  if not urls:
77
82
  return []
78
83
  if _service_up():
79
- return _http_post("/scrape", {"urls": urls})
80
- return _cli_scrape(urls)
84
+ payload = {"urls": urls, "format": fmt} if fmt else {"urls": urls}
85
+ return _http_post("/scrape", payload)
86
+ return _cli_scrape(urls, fmt)
81
87
 
82
88
 
83
89
  def search(query: str) -> list[dict]:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "switchback"
7
- version = "0.1.0"
7
+ version = "0.2.0"
8
8
  description = "One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -10,27 +10,30 @@ from __future__ import annotations
10
10
 
11
11
  import sys
12
12
 
13
+ from .normalize import output_key
13
14
  from .orchestrator import ScrapeOutcome, ScrapeResult, TierAttempt, run, run_detailed
14
15
  from .search import search # re-export: query → URLs (SearXNG)
15
16
 
16
17
 
17
- def scrape(urls: str | list[str]) -> list[ScrapeResult]:
18
+ def scrape(urls: str | list[str], fmt: str | None = None) -> list[ScrapeResult]:
18
19
  """Scrape one or many URLs through the cascade. Returns successes only.
19
20
 
21
+ fmt selects the output format (markdown | markdown_trimmed | html |
22
+ html_selectors); None uses the SCRAPER_OUTPUT_FORMAT default (markdown).
20
23
  For failures with classified reasons + the per-tier cascade, use
21
24
  scrape_detailed()."""
22
25
  if isinstance(urls, str):
23
26
  urls = [urls]
24
- return run(urls)
27
+ return run(urls, fmt)
25
28
 
26
29
 
27
- def scrape_detailed(urls: str | list[str]) -> list[ScrapeOutcome]:
30
+ def scrape_detailed(urls: str | list[str], fmt: str | None = None) -> list[ScrapeOutcome]:
28
31
  """Like scrape() but returns a ScrapeOutcome per URL — successes *and*
29
32
  failures, each with final_outcome, error_class, status_code, and the
30
- per-tier attempts that were made."""
33
+ per-tier attempts that were made. fmt as in scrape()."""
31
34
  if isinstance(urls, str):
32
35
  urls = [urls]
33
- return run_detailed(urls)
36
+ return run_detailed(urls, fmt)
34
37
 
35
38
 
36
39
  def _main() -> int:
@@ -50,9 +53,10 @@ def _main() -> int:
50
53
  _k = _k.strip()
51
54
  if _k and _k not in _os.environ:
52
55
  _os.environ[_k] = _v.strip()
53
- usage = ("usage: switchback <url> [<url> ...]\n"
56
+ usage = ("usage: switchback [--format FMT] <url> [<url> ...]\n"
54
57
  " switchback --search <query ...>\n"
55
- " (or: python -m switchback <url> ...)")
58
+ " (or: python -m switchback <url> ...)\n"
59
+ " FMT: markdown (default) | markdown_trimmed | html | html_selectors")
56
60
  # --help/-h is an explicit request: usage to stdout, exit 0 (don't treat it
57
61
  # as a URL to scrape). Check before any work so it stays fast and side-effect-free.
58
62
  if any(a in ("--help", "-h") for a in sys.argv[1:]):
@@ -69,9 +73,25 @@ def _main() -> int:
69
73
  [{"title": h.title, "url": h.url, "snippet": h.snippet} for h in hits],
70
74
  indent=2))
71
75
  return 0 if hits else 1
72
- results = scrape(sys.argv[1:])
76
+ # Optional --format / --format=FMT flag; everything else is a URL.
77
+ fmt: str | None = None
78
+ rest: list[str] = []
79
+ argv = sys.argv[1:]
80
+ i = 0
81
+ while i < len(argv):
82
+ a = argv[i]
83
+ if a == "--format" and i + 1 < len(argv):
84
+ fmt = argv[i + 1]; i += 2; continue
85
+ if a.startswith("--format="):
86
+ fmt = a.split("=", 1)[1]; i += 1; continue
87
+ rest.append(a); i += 1
88
+ if not rest:
89
+ print(usage, file=sys.stderr)
90
+ return 2
91
+ results = scrape(rest, fmt=fmt)
73
92
  print(json.dumps(
74
- [{"url": r.url, "source_method": r.source_method, "markdown": r.markdown}
93
+ [{"url": r.url, "source_method": r.source_method,
94
+ output_key(r.format): r.markdown}
75
95
  for r in results],
76
96
  indent=2))
77
97
  return 0 if results else 1
@@ -36,11 +36,14 @@ def enabled() -> bool:
36
36
  return _TTL_S > 0
37
37
 
38
38
 
39
- def _norm(url: str) -> str:
40
- """Drop the fragment; everything else is significant (query strings select
41
- content)."""
39
+ def _norm(url: str, fmt: str = "markdown") -> str:
40
+ """Cache key: URL with the fragment dropped (query strings select content).
41
+ Non-default output formats are namespaced so an html result is never served
42
+ for a markdown request; the default `markdown` key is unprefixed, so existing
43
+ caches and the default path are unchanged."""
42
44
  p = urlsplit(url)
43
- return urlunsplit((p.scheme, p.netloc, p.path, p.query, ""))
45
+ key = urlunsplit((p.scheme, p.netloc, p.path, p.query, ""))
46
+ return key if fmt == "markdown" else f"{fmt}\x00{key}"
44
47
 
45
48
 
46
49
  def _conn() -> sqlite3.Connection:
@@ -58,8 +61,8 @@ def _conn() -> sqlite3.Connection:
58
61
  return _CONN
59
62
 
60
63
 
61
- def get(url: str) -> tuple[str, str] | None:
62
- """Return ``(markdown, source_method)`` for a fresh cache hit, else None."""
64
+ def get(url: str, fmt: str = "markdown") -> tuple[str, str] | None:
65
+ """Return ``(content, source_method)`` for a fresh cache hit, else None."""
63
66
  if not enabled():
64
67
  return None
65
68
  conn = _conn() # NB: acquires _LOCK itself — must be outside the lock below
@@ -67,7 +70,7 @@ def get(url: str) -> tuple[str, str] | None:
67
70
  with _LOCK:
68
71
  row = conn.execute(
69
72
  "SELECT markdown, source_method, ts FROM cache WHERE url=?",
70
- (_norm(url),)).fetchone()
73
+ (_norm(url, fmt),)).fetchone()
71
74
  except Exception as e:
72
75
  logger.warning(f"content_cache: read failed: {e}")
73
76
  return None
@@ -79,7 +82,7 @@ def get(url: str) -> tuple[str, str] | None:
79
82
  return markdown, source_method
80
83
 
81
84
 
82
- def put(url: str, markdown: str, source_method: str) -> None:
85
+ def put(url: str, markdown: str, source_method: str, fmt: str = "markdown") -> None:
83
86
  """Store a successful scrape. No-op when disabled."""
84
87
  if not enabled():
85
88
  return
@@ -88,7 +91,7 @@ def put(url: str, markdown: str, source_method: str) -> None:
88
91
  with _LOCK:
89
92
  conn.execute("INSERT OR REPLACE INTO cache (url, markdown, source_method, ts) "
90
93
  "VALUES (?, ?, ?, ?)",
91
- (_norm(url), markdown, source_method, time.time()))
94
+ (_norm(url, fmt), markdown, source_method, time.time()))
92
95
  conn.commit()
93
96
  except Exception as e:
94
97
  logger.warning(f"content_cache: write failed: {e}")
@@ -0,0 +1,183 @@
1
+ """Shared content normalization — HTML→Markdown and PDF→text.
2
+
3
+ Ported from musings-by-hermes/scripts/muse_helpers.py (the most mature version):
4
+ strips boilerplate, promotes lazy-loaded images, resolves relative URLs.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import io
9
+ import logging
10
+ import os
11
+ import re
12
+ import threading
13
+ from contextlib import contextmanager
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ UA = ("Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 "
18
+ "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
19
+
20
+ # ── Output format ───────────────────────────────────────────────────────────
21
+ # Default is markdown (today's behavior, byte-identical). Opt into other shapes
22
+ # globally via SCRAPER_OUTPUT_FORMAT, or per-call via the scrape(fmt=...) /
23
+ # --format / {"format": ...} overrides, which are applied via output_format_scope().
24
+ # markdown whole-page markdown (default)
25
+ # markdown_trimmed markdown with extra ad/nav/boilerplate lines removed
26
+ # html raw HTML exactly as fetched (no cleaning)
27
+ # html_selectors cleaned HTML (boilerplate strip + per-domain drop/selector),
28
+ # not converted to markdown
29
+ VALID_FORMATS = ("markdown", "markdown_trimmed", "html", "html_selectors")
30
+
31
+
32
+ def _validate(fmt: str | None) -> str:
33
+ f = (fmt or "").strip().lower()
34
+ if f not in VALID_FORMATS:
35
+ logger.warning(f"unknown output format {fmt!r}; using 'markdown' "
36
+ f"(valid: {', '.join(VALID_FORMATS)})")
37
+ return "markdown"
38
+ return f
39
+
40
+
41
+ OUTPUT_FORMAT = _validate(os.getenv("SCRAPER_OUTPUT_FORMAT", "markdown"))
42
+
43
+ # Per-thread active-format override. Thread-local by construction (like egress):
44
+ # the orchestrator sets it on the worker thread that also runs the tier fetch +
45
+ # this module's conversion, so concurrent server requests can't bleed formats.
46
+ _scope = threading.local()
47
+
48
+
49
@contextmanager
def output_format_scope(fmt: str | None):
    """Temporarily override the calling thread's output format.

    A truthy `fmt` is validated and becomes the active format inside the
    `with` body; a falsy one clears the override, so the
    SCRAPER_OUTPUT_FORMAT default applies. Whatever override was in place
    before entry is put back on exit, even if the body raises.
    """
    saved = getattr(_scope, "fmt", None)
    if fmt:
        _scope.fmt = _validate(fmt)
    else:
        _scope.fmt = None
    try:
        yield
    finally:
        # Restore rather than clear, so nested scopes unwind correctly.
        _scope.fmt = saved
59
+
60
+
61
def active_format() -> str:
    """Return the output format that applies on the current thread.

    That is the override installed by output_format_scope() when one is set,
    and the module-level OUTPUT_FORMAT otherwise.
    """
    override = getattr(_scope, "fmt", None)
    if override:
        return override
    return OUTPUT_FORMAT
65
+
66
+
67
def output_key(fmt: str) -> str:
    """Map an output format to the JSON/result key its content is stored under.

    Formats whose name begins with "html" map to "html"; every other format
    maps to "markdown", which keeps the key of the default format unchanged.
    """
    if fmt.startswith("html"):
        return "html"
    return "markdown"
71
+
72
+
73
def _clean_html(html: str, base_url: str | None = None) -> str:
    """Return a cleaned copy of `html` as a string.

    Steps, in order:
      1. Remove script/style/noscript/nav/header/footer/aside/form/iframe tags.
      2. Remove every element matched by the per-domain "drop" selectors.
      3. If a per-domain "selector" is configured and matches, keep only that
         node; if it matches nothing, keep the whole document and log at debug.
      4. Give each <img> a `src`, falling back to data-src, data-original,
         data-lazy-src, then the first srcset candidate.
      5. When `base_url` is given, resolve image and link URLs against it.

    Per-domain prefs come from switchback.extract.prefs_for(base_url).
    Best-effort: on any exception the input `html` is returned unchanged.
    """
    try:
        # Imported lazily, inside the try, so a missing dependency degrades to
        # "return the input unchanged" instead of failing the caller.
        from bs4 import BeautifulSoup
        from urllib.parse import urljoin

        from .extract import prefs_for
        prefs = prefs_for(base_url)

        soup = BeautifulSoup(html or "", "html.parser")
        for tag in soup(["script", "style", "noscript", "nav", "header",
                         "footer", "aside", "form", "iframe"]):
            tag.decompose()
        # Per-domain: remove configured noise, then scope to the content node.
        for sel in prefs.get("drop", []):
            for tag in soup.select(sel):
                tag.decompose()
        selector = prefs.get("selector")
        if selector:
            node = soup.select_one(selector)
            if node is not None:
                # Re-parse just the matched node so later steps only see it.
                soup = BeautifulSoup(str(node), "html.parser")
            else:
                logger.debug(f"extract: selector {selector!r} matched nothing for {base_url}")
        for img in soup.find_all("img"):
            # An existing `src` wins; the data-* attributes are only consulted
            # when `src` is missing or empty.
            src = (img.get("src") or img.get("data-src")
                   or img.get("data-original") or img.get("data-lazy-src"))
            if not src and img.get("srcset"):
                # First srcset candidate, with its width/density descriptor cut off.
                src = img["srcset"].split(",")[0].strip().split(" ")[0]
            if src:
                if base_url:
                    src = urljoin(base_url, src)
                img["src"] = src
        if base_url:
            for a in soup.find_all("a", href=True):
                a["href"] = urljoin(base_url, a["href"])
        return str(soup)
    except Exception as e:
        # Deliberate best-effort: cleaning must never fail the scrape.
        logger.debug(f"soup pre-clean skipped: {e}")
        return html
115
+
116
+
117
+ # Lines that markdown_trimmed drops: standalone images, link-only/nav rows, and
118
+ # short promotional boilerplate. Conservative on purpose — prose is never touched.
119
+ _TRIM_IMG_RE = re.compile(r"^!\[[^\]]*\]\([^)]*\)$")
120
+ _TRIM_LINKS_ONLY_RE = re.compile(r"^(?:[-*>]\s*)?(?:\[[^\]]*\]\([^)]*\)[\s|·•\-–—]*)+$")
121
+ _TRIM_BOILERPLATE_RE = re.compile(
122
+ r"^(subscribe|sign\s*up|sign\s*in|log\s*in|logout|newsletter|advertisement|"
123
+ r"accept\s+all|cookie|follow\s+us|share\s+this)\b", re.I)
124
+
125
+
126
+ def _trim_markdown(md: str) -> str:
127
+ """Markdown minus common ad/nav/boilerplate noise. Drops only standalone-image
128
+ lines, link-only/nav rows, and short promotional boilerplate lines; keeps all
129
+ prose. Collapses 3+ blank lines to one."""
130
+ kept: list[str] = []
131
+ for line in md.splitlines():
132
+ s = line.strip()
133
+ if not s:
134
+ kept.append("")
135
+ continue
136
+ if _TRIM_IMG_RE.match(s) or _TRIM_LINKS_ONLY_RE.match(s):
137
+ continue
138
+ if len(s) <= 60 and _TRIM_BOILERPLATE_RE.match(s):
139
+ continue
140
+ kept.append(line)
141
+ return re.sub(r"\n{3,}", "\n\n", "\n".join(kept)).strip()
142
+
143
+
144
def render(html: str, base_url: str | None = None, fmt: str | None = None) -> str:
    """Render fetched HTML in `fmt`, or in the active output format when
    `fmt` is None.

      - markdown          whole-page markdown (boilerplate strip + per-domain prefs)
      - markdown_trimmed  markdown with extra ad/nav/boilerplate lines removed
      - html              the raw HTML, untouched
      - html_selectors    cleaned HTML (per-domain prefs applied), not converted

    If the markdown conversion raises, a warning is logged and the stripped
    raw HTML is returned instead.
    """
    chosen = active_format() if fmt is None else _validate(fmt)

    # The html-family formats never go through markdownify.
    if chosen == "html":
        return html or ""
    if chosen == "html_selectors":
        return (_clean_html(html, base_url) or "").strip()

    try:
        from markdownify import markdownify
        converted = markdownify(
            _clean_html(html, base_url),
            heading_style="ATX",
            code_language="",
            bullets="-",
            strip=["script", "style"],
        )
        text = (converted or "").strip()
        if chosen == "markdown_trimmed":
            return _trim_markdown(text)
        return text
    except Exception as e:
        logger.warning(f"markdownify failed: {e}")
        return (html or "").strip()
166
+
167
+
168
def html_to_markdown(html: str, base_url: str | None = None) -> str:
    """Render `html` through render() in the active output format.

    Despite the name, the result is Markdown only when the active format is a
    markdown one (the default); with an html format it is HTML. The name is
    kept for backward compatibility, so existing callers honor the selected
    SCRAPER_OUTPUT_FORMAT / per-call format without any change on their side.
    """
    return render(html, base_url)
173
+
174
+
175
def pdf_bytes_to_text(data: bytes) -> str:
    """Return the text of a PDF supplied as raw bytes.

    The bytes are wrapped in an in-memory buffer (nothing is written to disk)
    and each page's extracted text is joined with a blank line; pages that
    yield no text contribute an empty string. The result is stripped.
    """
    from pypdf import PdfReader
    with io.BytesIO(data) as stream:
        page_texts = [(page.extract_text() or "") for page in PdfReader(stream).pages]
        return "\n\n".join(page_texts).strip()
@@ -17,6 +17,7 @@ import time
17
17
  from dataclasses import dataclass, field
18
18
 
19
19
  from . import content_cache, egress, session_cache
20
+ from .normalize import active_format, output_format_scope
20
21
  from .policy import botwall
21
22
  from .policy.gates import BotWall, RateLimited, ShortContent, classify_error, host_of
22
23
  from .tiers import TIERS, INDEX
@@ -68,8 +69,9 @@ _FAILURE_PRIORITY = {
68
69
  @dataclass
69
70
  class ScrapeResult:
70
71
  url: str
71
- markdown: str
72
+ markdown: str # the rendered content (format named by `format`)
72
73
  source_method: str # tier NAME that won
74
+ format: str = "markdown" # markdown | markdown_trimmed | html | html_selectors
73
75
 
74
76
 
75
77
  @dataclass
@@ -95,6 +97,7 @@ class ScrapeOutcome:
95
97
  latency_ms: int | None = None
96
98
  egress: str = "direct" # "egress" if routed via SCRAPER_EGRESS_PROXY, else "direct"
97
99
  wire_bytes: int = 0 # bytes transferred over the network (cost basis for proxy GB)
100
+ format: str = "markdown" # output format of `markdown` (the content field)
98
101
  attempts: list[TierAttempt] = field(default_factory=list)
99
102
 
100
103
 
@@ -180,7 +183,7 @@ def _run_one(url: str, db: dict) -> ScrapeOutcome:
180
183
  if botwall.is_url_skipped(url, db):
181
184
  return _skipped(url, root, "url_excluded",
182
185
  db.get("urls", {}).get(url, {}).get("reason", ""))
183
- hit = content_cache.get(url)
186
+ hit = content_cache.get(url, active_format())
184
187
  if hit:
185
188
  md, method = hit
186
189
  root.set(Attr.OUTCOME, "cache_hit")
@@ -188,7 +191,7 @@ def _run_one(url: str, db: dict) -> ScrapeOutcome:
188
191
  root.set(Attr.MD_LEN, len(md))
189
192
  logger.info(f"cache_hit {url} (was {method})")
190
193
  return ScrapeOutcome(url, True, markdown=md, source_method=method,
191
- final_outcome="ok")
194
+ final_outcome="ok", format=active_format())
192
195
  # A needs_egress host runs the whole cascade in the egress scope, so the
193
196
  # tiers route through SCRAPER_EGRESS_PROXY (when set); easy hosts stay
194
197
  # direct and never spend residential bandwidth.
@@ -300,7 +303,7 @@ def _run_cascade(url, host, db, root, t0, deadline) -> ScrapeOutcome:
300
303
  sp.set(Attr.SOURCE, tier.NAME)
301
304
  sp.set(Attr.LATENCY_MS, dt)
302
305
  botwall.record(db, url, tier.NAME, "ok", md_len=len(md), latency_ms=dt)
303
- content_cache.put(url, md, tier.NAME)
306
+ content_cache.put(url, md, tier.NAME, active_format())
304
307
  root.set(Attr.OUTCOME, "ok")
305
308
  root.set(Attr.SOURCE, tier.NAME)
306
309
  root.set(Attr.LATENCY_MS, total)
@@ -308,7 +311,8 @@ def _run_cascade(url, host, db, root, t0, deadline) -> ScrapeOutcome:
308
311
  logger.info(
309
312
  f"{tier.NAME} OK {url} md_len={len(md)} {dt}ms (total {total}ms)")
310
313
  return ScrapeOutcome(url, True, markdown=md, source_method=tier.NAME,
311
- final_outcome="ok", latency_ms=total, attempts=attempts)
314
+ final_outcome="ok", latency_ms=total,
315
+ format=active_format(), attempts=attempts)
312
316
 
313
317
  total = int((time.monotonic() - t0) * 1000)
314
318
  ec, sc = _dominant_failure(attempts)
@@ -323,21 +327,24 @@ def _run_cascade(url, host, db, root, t0, deadline) -> ScrapeOutcome:
323
327
  latency_ms=total, attempts=attempts)
324
328
 
325
329
 
326
- def run_detailed(urls: list[str]) -> list[ScrapeOutcome]:
330
+ def run_detailed(urls: list[str], fmt: str | None = None) -> list[ScrapeOutcome]:
327
331
  """Scrape each URL; return a full ScrapeOutcome (success or failure with the
328
- per-tier cascade and a classified reason) for every URL."""
332
+ per-tier cascade and a classified reason) for every URL.
333
+
334
+ fmt overrides SCRAPER_OUTPUT_FORMAT for this call (None = use the default)."""
329
335
  db = botwall.load_db()
330
336
  out = []
331
337
  try:
332
- for url in urls:
333
- out.append(_run_one(url, db))
338
+ with output_format_scope(fmt):
339
+ for url in urls:
340
+ out.append(_run_one(url, db))
334
341
  finally:
335
342
  botwall.save_db(db)
336
343
  flush()
337
344
  return out
338
345
 
339
346
 
340
- def run(urls: list[str]) -> list[ScrapeResult]:
347
+ def run(urls: list[str], fmt: str | None = None) -> list[ScrapeResult]:
341
348
  """Successes only (backward-compatible). Use run_detailed() for failures."""
342
- return [ScrapeResult(o.url, o.markdown, o.source_method)
343
- for o in run_detailed(urls) if o.ok]
349
+ return [ScrapeResult(o.url, o.markdown, o.source_method, o.format)
350
+ for o in run_detailed(urls, fmt) if o.ok]
@@ -25,15 +25,17 @@ from pydantic import BaseModel
25
25
 
26
26
  from . import session_trace
27
27
  from .api import scrape
28
+ from .normalize import output_key
28
29
  from .reporting import build_report, domain_report
29
30
  from .search import search
30
31
  from .tracing import setup_logs
31
32
 
32
- app = FastAPI(title="switchback", version="0.1.0")
33
+ app = FastAPI(title="switchback", version="0.2.0")
33
34
 
34
35
 
35
36
  class ScrapeRequest(BaseModel):
36
37
  urls: list[str]
38
+ format: str | None = None # markdown (default) | markdown_trimmed | html | html_selectors
37
39
 
38
40
 
39
41
  @app.get("/healthz")
@@ -43,9 +45,12 @@ def healthz() -> dict:
43
45
 
44
46
  @app.post("/scrape")
45
47
  def scrape_endpoint(req: ScrapeRequest) -> list[dict]:
46
- """Run URLs through the cascade. Returns successes only (failed URLs omitted)."""
47
- return [{"url": r.url, "source_method": r.source_method, "markdown": r.markdown}
48
- for r in scrape(req.urls)]
48
+ """Run URLs through the cascade. Returns successes only (failed URLs omitted).
49
+ Optional "format" selects the output shape; the content key is "markdown" for
50
+ markdown formats and "html" for html formats."""
51
+ return [{"url": r.url, "source_method": r.source_method,
52
+ output_key(r.format): r.markdown}
53
+ for r in scrape(req.urls, fmt=req.format)]
49
54
 
50
55
 
51
56
  @app.get("/search")
@@ -9,6 +9,7 @@ from __future__ import annotations
9
9
  import os
10
10
  import threading
11
11
 
12
+ from ..normalize import active_format, render
12
13
  from ..policy.gates import check
13
14
 
14
15
  NAME = "tier4_firecrawl"
@@ -19,12 +20,18 @@ def disabled() -> bool:
19
20
  return bool(os.getenv("SCRAPER_DISABLE_FIRECRAWL"))
20
21
 
21
22
 
22
- def _scrape(url: str) -> str:
23
+ def _scrape(url: str, fmt: str) -> str:
23
24
  from firecrawl import Firecrawl
24
25
  app = Firecrawl(api_key=os.environ["FIRECRAWL_API_KEY"])
25
- doc = app.scrape(url, formats=["markdown"])
26
+ if fmt == "markdown":
27
+ doc = app.scrape(url, formats=["markdown"])
28
+ d = doc.model_dump() if hasattr(doc, "model_dump") else (doc if isinstance(doc, dict) else {})
29
+ return check(url, (d.get("markdown") or "").strip())
30
+ # Non-default formats: fetch HTML and derive every shape through normalize, so
31
+ # html / html_selectors / markdown_trimmed match the rest of the cascade.
32
+ doc = app.scrape(url, formats=["html"])
26
33
  d = doc.model_dump() if hasattr(doc, "model_dump") else (doc if isinstance(doc, dict) else {})
27
- return check(url, (d.get("markdown") or "").strip())
34
+ return check(url, render(d.get("html") or "", base_url=url, fmt=fmt))
28
35
 
29
36
 
30
37
  def fetch(url: str) -> str:
@@ -32,11 +39,13 @@ def fetch(url: str) -> str:
32
39
  # the calling thread, which then makes a later sync-Playwright browser tier in
33
40
  # the same batch raise "Sync API inside the asyncio loop". A worker thread
34
41
  # confines that loop so the browser tiers stay usable across a multi-URL run.
42
+ # active_format() is thread-local, so read it here (on the calling thread) and pass it into the worker.
35
43
  box: dict = {}
44
+ fmt = active_format()
36
45
 
37
46
  def work():
38
47
  try:
39
- box["md"] = _scrape(url)
48
+ box["md"] = _scrape(url, fmt)
40
49
  except BaseException as e: # noqa: BLE001 — re-raised to the caller below
41
50
  box["err"] = e
42
51
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: switchback
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: One cost-ordered scrape cascade (HTTP → stealth browser → paid), shared by every tool.
5
5
  Author-email: Akash Kodavuru <akash@theaklabs.com>
6
6
  License: MIT
@@ -75,8 +75,8 @@ Dynamic: license-file
75
75
  Give it a URL; it tries the cheapest way to get clean Markdown first and only escalates
76
76
  to a heavier (slower, costlier) tier when the cheap one is walled. Stops at the first success.
77
77
 
78
- [![PyPI](https://img.shields.io/pypi/v/switchback.svg)](https://pypi.org/project/switchback/)
79
- [![Python](https://img.shields.io/pypi/pyversions/switchback.svg)](https://pypi.org/project/switchback/)
78
+ [![PyPI](https://img.shields.io/pypi/v/switchback)](https://pypi.org/project/switchback/)
79
+ [![Python](https://img.shields.io/pypi/pyversions/switchback)](https://pypi.org/project/switchback/)
80
80
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
81
81
  [![CI](https://github.com/akash-kr/switchback/actions/workflows/ci.yml/badge.svg)](https://github.com/akash-kr/switchback/actions/workflows/ci.yml)
82
82
 
@@ -269,6 +269,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
269
269
  <details>
270
270
  <summary><b>Tunables</b> — budgets, timeouts, caches, backoff</summary>
271
271
 
272
+ - `SCRAPER_OUTPUT_FORMAT` — output shape: `markdown` (default) · `markdown_trimmed` · `html` · `html_selectors` (see [Output formats](#output-formats))
272
273
  - `SCRAPER_DEADLINE_S` — per-URL budget (45s)
273
274
  - `SCRAPER_CAMOUFOX_TIMEOUT_MS` — (45000)
274
275
  - `SCRAPER_BROWSER_CONCURRENCY` — max simultaneous headless browsers (default 1)
@@ -297,6 +298,34 @@ trace zip to `state/traces/`. Manage them over HTTP — `GET /traces` (list),
297
298
  `GET /traces/{id}` (download), `DELETE /traces/{id}` — and open one with
298
299
  `playwright show-trace <zip>`. Off by default (traces are MBs each).
299
300
 
301
+ ### Output formats
302
+ Markdown is the default and is unchanged. Pick a different shape globally with
303
+ `SCRAPER_OUTPUT_FORMAT`, or per call:
304
+
305
+ ```python
306
+ from switchback import scrape
307
+ scrape(["https://example.com/article"]) # markdown (default)
308
+ scrape(["https://example.com/article"], fmt="html") # raw HTML
309
+ scrape(["https://example.com/article"], fmt="markdown_trimmed")
310
+ ```
311
+
312
+ ```bash
313
+ switchback --format html_selectors https://example.com/article
314
+ curl -s localhost:8799/scrape -d '{"urls":["https://example.com"],"format":"html"}'
315
+ ```
316
+
317
+ | format | what you get |
318
+ | --- | --- |
319
+ | `markdown` | whole-page markdown (boilerplate stripped + per-domain prefs) — **default** |
320
+ | `markdown_trimmed` | markdown with extra ad/nav/boilerplate lines removed |
321
+ | `html` | the raw HTML exactly as fetched, untouched |
322
+ | `html_selectors` | cleaned HTML (boilerplate strip + per-domain `drop`/`selector`), not converted to Markdown |
323
+
324
+ The chosen content is returned in the result's `markdown` field regardless of format; in the CLI/server JSON
325
+ the key is `markdown` for markdown formats and `html` for html formats. The
326
+ API/PDF tiers (arXiv synth, PDF→text) have no HTML, so html formats fall back to
327
+ their text for those sources.
328
+
300
329
  ### Per-domain extraction
301
330
  Markdown of the whole page is the default. To scope a site to its content node or
302
331
  strip site-specific noise, declare prefs per host in `config/extraction.json`
@@ -1,81 +0,0 @@
1
- """Shared content normalization — HTML→Markdown and PDF→text.
2
-
3
- Ported from musings-by-hermes/scripts/muse_helpers.py (the most mature version):
4
- strips boilerplate, promotes lazy-loaded images, resolves relative URLs.
5
- """
6
- from __future__ import annotations
7
-
8
- import io
9
- import logging
10
-
11
- logger = logging.getLogger(__name__)
12
-
13
- UA = ("Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 "
14
- "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
15
-
16
-
17
- def html_to_markdown(html: str, base_url: str | None = None) -> str:
18
- """HTML → Markdown, preserving images/blockquotes/code.
19
-
20
- - Strips script/style/nav/header/footer/aside boilerplate.
21
- - Applies any per-domain extraction prefs (scope selector / extra drops),
22
- see switchback.extract.
23
- - Promotes lazy-load attrs (data-src, data-original, srcset) to src.
24
- - Resolves relative image/link URLs against base_url.
25
- """
26
- try:
27
- from markdownify import markdownify
28
- try:
29
- from bs4 import BeautifulSoup
30
- from urllib.parse import urljoin
31
-
32
- from .extract import prefs_for
33
- prefs = prefs_for(base_url)
34
-
35
- soup = BeautifulSoup(html or "", "html.parser")
36
- for tag in soup(["script", "style", "noscript", "nav", "header",
37
- "footer", "aside", "form", "iframe"]):
38
- tag.decompose()
39
- # Per-domain: remove configured noise, then scope to the content node.
40
- for sel in prefs.get("drop", []):
41
- for tag in soup.select(sel):
42
- tag.decompose()
43
- selector = prefs.get("selector")
44
- if selector:
45
- node = soup.select_one(selector)
46
- if node is not None:
47
- soup = BeautifulSoup(str(node), "html.parser")
48
- else:
49
- logger.debug(f"extract: selector {selector!r} matched nothing for {base_url}")
50
- for img in soup.find_all("img"):
51
- src = (img.get("src") or img.get("data-src")
52
- or img.get("data-original") or img.get("data-lazy-src"))
53
- if not src and img.get("srcset"):
54
- src = img["srcset"].split(",")[0].strip().split(" ")[0]
55
- if src:
56
- if base_url:
57
- src = urljoin(base_url, src)
58
- img["src"] = src
59
- if base_url:
60
- for a in soup.find_all("a", href=True):
61
- a["href"] = urljoin(base_url, a["href"])
62
- html = str(soup)
63
- except Exception as e:
64
- logger.debug(f"soup pre-clean skipped: {e}")
65
- md = markdownify(html, heading_style="ATX", code_language="",
66
- bullets="-", strip=["script", "style"])
67
- return (md or "").strip()
68
- except Exception as e:
69
- logger.warning(f"markdownify failed: {e}")
70
- return (html or "").strip()
71
-
72
-
73
- def pdf_bytes_to_text(data: bytes) -> str:
74
- """Extract text from PDF bytes. In-memory only — nothing written to disk."""
75
- from pypdf import PdfReader
76
- buf = io.BytesIO(data)
77
- try:
78
- reader = PdfReader(buf)
79
- return "\n\n".join((p.extract_text() or "") for p in reader.pages).strip()
80
- finally:
81
- buf.close()
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes