switchback 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
switchback/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ """switchback — one cost-ordered scrape cascade, used by every tool.
2
+
3
+ Public API:
4
+ from switchback import scrape, search
5
+ results = scrape(["https://example.com/article"])
6
+ hits = search("web scraping") # query → URLs (SearXNG)
7
+ """
8
+ from .api import scrape, scrape_detailed, ScrapeOutcome, ScrapeResult, TierAttempt
9
+ from .search import search, SearchResult
10
+
11
+ __all__ = ["scrape", "scrape_detailed", "ScrapeOutcome", "ScrapeResult",
12
+ "TierAttempt", "search", "SearchResult"]
switchback/__main__.py ADDED
@@ -0,0 +1,4 @@
1
+ """`python -m switchback <url> [<url> ...]`"""
2
+ from .api import _main
3
+
4
+ raise SystemExit(_main())
switchback/api.py ADDED
@@ -0,0 +1,81 @@
1
+ """Public entry point + CLI.
2
+
3
+ from switchback import scrape
4
+ results = scrape(["https://example.com/article"])
5
+
6
+ # or: python -m switchback.api <url> [<url> ...]
7
+ # python -m switchback.api --search <query ...>
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import sys
12
+
13
+ from .orchestrator import ScrapeOutcome, ScrapeResult, TierAttempt, run, run_detailed
14
+ from .search import search # re-export: query → URLs (SearXNG)
15
+
16
+
17
def scrape(urls: str | list[str]) -> list[ScrapeResult]:
    """Run the cascade over one URL or a list of URLs; return successes only.

    A bare string is treated as a one-element batch. URLs that failed are
    simply absent from the result — use scrape_detailed() when you need the
    classified failure reasons and the per-tier cascade.
    """
    batch = [urls] if isinstance(urls, str) else urls
    return run(batch)
25
+
26
+
27
def scrape_detailed(urls: str | list[str]) -> list[ScrapeOutcome]:
    """Scrape one URL or a list of URLs and report on every one of them.

    Unlike scrape(), the result holds a ScrapeOutcome per URL — successes
    *and* failures — each carrying final_outcome, error_class, status_code and
    the per-tier attempts that were made. A bare string is treated as a
    one-element batch.
    """
    batch = [urls] if isinstance(urls, str) else urls
    return run_detailed(batch)
34
+
35
+
36
def _main() -> int:
    """CLI entry point: scrape the URLs in ``sys.argv`` or run a ``--search`` query.

    Results are printed to stdout as indented JSON: for a scrape, one object
    per successful URL (``url``, ``source_method``, ``markdown``); for a
    search, one object per hit (``title``, ``url``, ``snippet``).

    Returns:
        0 on success or for ``--help``/``-h``; 1 when the scrape/search
        produced no results; 2 on a usage error (no arguments, or
        ``--search`` without a query).
    """
    usage = ("usage: switchback <url> [<url> ...]\n"
             "       switchback --search <query ...>\n"
             "       (or: python -m switchback <url> ...)")
    args = sys.argv[1:]

    # --help/-h is an explicit request: usage to stdout, exit 0 (don't treat it
    # as a URL to scrape). Checked before *any* work — including the .env load
    # and the tracing import — so it stays fast and side-effect-free.
    if any(a in ("--help", "-h") for a in args):
        print(usage)
        return 0

    # Usage errors depend on argv alone, so reject them before touching the
    # environment or the logging configuration.
    if not args:
        print(usage, file=sys.stderr)
        return 2
    searching = args[0] == "--search"
    query = " ".join(args[1:]) if searching else ""
    if searching and not query.strip():
        print(usage, file=sys.stderr)
        return 2

    import json
    import logging
    import os
    import pathlib
    from .tracing import setup_logs

    # Auto-load .env from the repo root so OTEL/SCRAPER vars are set even when
    # invoked as a subprocess (parent process needn't export them explicitly).
    # This is best-effort: a missing or unreadable file is skipped.
    env_file = pathlib.Path(__file__).parent.parent / ".env"
    try:
        env_lines = env_file.read_text(encoding="utf-8").splitlines()
    except (OSError, UnicodeError):
        env_lines = []
    for raw in env_lines:
        entry = raw.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        key, _, value = entry.partition("=")
        key = key.strip()
        value = value.strip()
        # Drop one pair of matching surrounding quotes (KEY="value"), the
        # usual dotenv convention; otherwise the quotes end up in the value.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        # Variables already present in the environment always win.
        if key and key not in os.environ:
            os.environ[key] = value

    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    setup_logs()  # also ship logs to the OTLP backend when configured

    if searching:
        hits = search(query)
        print(json.dumps(
            [{"title": h.title, "url": h.url, "snippet": h.snippet} for h in hits],
            indent=2))
        return 0 if hits else 1

    results = scrape(args)
    print(json.dumps(
        [{"url": r.url, "source_method": r.source_method, "markdown": r.markdown}
         for r in results],
        indent=2))
    return 0 if results else 1


if __name__ == "__main__":
    raise SystemExit(_main())
@@ -0,0 +1,37 @@
1
+ """Process-wide cap on simultaneous headless browsers.
2
+
3
+ The browser tiers (patchright ~150MB, Camoufox ~600MB) are the memory-heavy
4
+ rungs. The engine scrapes sequentially today, but when callers run scrapes in
5
+ parallel this semaphore bounds how many browsers spin up at once. Default 1
6
+ matches the sequential design (one browser, one footprint); raise it with
7
+ SCRAPER_BROWSER_CONCURRENCY once you know the box has headroom.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import os
13
+ import threading
14
+ import time
15
+ from contextlib import contextmanager
16
+
17
logger = logging.getLogger(__name__)


def _concurrency_from_env(raw: str | None) -> int:
    """Turn the SCRAPER_BROWSER_CONCURRENCY value into a slot count >= 1.

    Unset, blank, or non-integer values fall back to 1 (the sequential
    default) instead of raising while the module is being imported.
    """
    if raw is None or not raw.strip():
        return 1
    try:
        return max(1, int(raw))
    except ValueError:
        logger.warning(
            "SCRAPER_BROWSER_CONCURRENCY=%r is not an integer; using 1", raw)
        return 1


MAX_BROWSER_CONCURRENCY = _concurrency_from_env(
    os.getenv("SCRAPER_BROWSER_CONCURRENCY"))
_sem = threading.BoundedSemaphore(MAX_BROWSER_CONCURRENCY)


@contextmanager
def browser_slot(label: str = "browser"):
    """Hold one of the MAX_BROWSER_CONCURRENCY browser slots for the ``with`` body.

    Blocks while all slots are in use. When the caller actually had to queue
    (more than 0.1s), the wait is logged at INFO — a signal the cap is
    saturated. The slot is always released on exit, including when the body
    raises.

    Args:
        label: Prefix for the wait log line, identifying the caller.
    """
    t0 = time.monotonic()
    _sem.acquire()
    # Everything after the acquire lives inside the try so the slot is handed
    # back even if the bookkeeping below fails.
    try:
        waited = time.monotonic() - t0
        if waited > 0.1:
            logger.info(
                "%s: waited %.0fms for a browser slot "
                "(SCRAPER_BROWSER_CONCURRENCY=%s)",
                label, waited * 1000, MAX_BROWSER_CONCURRENCY)
        yield
    finally:
        _sem.release()
@@ -0,0 +1,94 @@
1
+ """URL → result cache, so an already-scraped page isn't scraped again.
2
+
3
+ Off by default (article content goes stale): set ``SCRAPER_CONTENT_TTL_S`` to a
4
+ positive number of seconds to enable it, e.g. 86400 to dedupe re-scrapes within a
5
+ day. A hit short-circuits the whole cascade before any tier (or proxy byte) runs.
6
+
7
+ Backed by stdlib sqlite (``state/content_cache.db``), not a JSON blob: at
8
+ curiouscats' ~300k URLs/month a single JSON file would be reloaded and rewritten
9
+ in full on every access. sqlite keys by URL and stays O(1). The cache is keyed by
10
+ normalised URL (fragment dropped); the egress scope is irrelevant to the *content*
11
+ so it isn't part of the key.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ import os
17
+ import sqlite3
18
+ import threading
19
+ import time
20
+ from urllib.parse import urlsplit, urlunsplit
21
+
22
logger = logging.getLogger(__name__)

_DEFAULT_STATE_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "state")
_STATE_DIR = os.getenv("SCRAPER_STATE_DIR", _DEFAULT_STATE_DIR)
DB_PATH = os.path.join(_STATE_DIR, "content_cache.db")


def _ttl_from_env(raw: str | None) -> float:
    """Parse SCRAPER_CONTENT_TTL_S into seconds; 0.0 means the cache is off.

    Unset, blank, or non-numeric values yield 0.0 (cache disabled) rather than
    raising while the module is being imported.
    """
    if raw is None or not raw.strip():
        return 0.0
    try:
        return float(raw)
    except ValueError:
        logger.warning(
            "content_cache: SCRAPER_CONTENT_TTL_S=%r is not a number; "
            "cache disabled", raw)
        return 0.0


_TTL_S = _ttl_from_env(os.getenv("SCRAPER_CONTENT_TTL_S"))  # 0 = disabled

_LOCK = threading.Lock()
_CONN: sqlite3.Connection | None = None


def enabled() -> bool:
    """Return True when a positive TTL is configured, i.e. the cache is in use."""
    return _TTL_S > 0
37
+
38
+
39
def _norm(url: str) -> str:
    """Cache key for *url*: the same URL with its fragment removed.

    Scheme, host, path and query are all kept, because query strings select
    content.
    """
    return urlsplit(url)._replace(fragment="").geturl()
44
+
45
+
46
def _conn() -> sqlite3.Connection:
    """Return the shared sqlite connection, opening it on first use.

    The first call creates the state directory, opens DB_PATH and creates the
    ``cache`` table if needed; later calls return the same connection. Opened
    with ``check_same_thread=False`` so it can be used from any thread —
    callers serialise their own access with _LOCK.

    Raises:
        OSError / sqlite3.Error: if the state directory or database cannot be
            created or opened. Nothing is cached in that case, so the next
            call tries again.
    """
    global _CONN
    if _CONN is not None:
        return _CONN
    with _LOCK:
        # Re-check under the lock: another thread may have opened the
        # connection while this one was waiting.
        if _CONN is None:
            os.makedirs(_STATE_DIR, exist_ok=True)
            c = sqlite3.connect(DB_PATH, check_same_thread=False)
            try:
                c.execute("CREATE TABLE IF NOT EXISTS cache ("
                          "url TEXT PRIMARY KEY, markdown TEXT, source_method TEXT, ts REAL)")
                c.commit()
            except Exception:
                # Don't leak a half-initialised connection if schema setup fails.
                c.close()
                raise
            _CONN = c
        return _CONN
59
+
60
+
61
def get(url: str) -> tuple[str, str] | None:
    """Return ``(markdown, source_method)`` for a fresh cache hit, else None.

    None covers every non-hit: cache disabled, URL not stored, entry older
    than the TTL, or any failure opening/reading the database (logged as a
    warning — the cache is best-effort and never fails the scrape).
    """
    if not enabled():
        return None
    try:
        # NB: _conn() acquires _LOCK itself on first use, so it must be called
        # before — not inside — the locked section below. It sits inside the
        # try so a database that cannot be opened degrades to a cache miss.
        conn = _conn()
        with _LOCK:
            row = conn.execute(
                "SELECT markdown, source_method, ts FROM cache WHERE url=?",
                (_norm(url),)).fetchone()
    except Exception as e:
        logger.warning(f"content_cache: read failed: {e}")
        return None
    if not row:
        return None
    markdown, source_method, ts = row
    # A row without a timestamp cannot be proven fresh; treat it as stale.
    if ts is None or time.time() - ts > _TTL_S:
        return None
    return markdown, source_method
80
+
81
+
82
def put(url: str, markdown: str, source_method: str) -> None:
    """Store a successful scrape under the normalised URL. No-op when disabled.

    An existing entry for the same URL is replaced and its timestamp reset to
    now. Failures opening or writing the database are logged as a warning and
    otherwise ignored — the cache is best-effort.
    """
    if not enabled():
        return
    try:
        # NB: _conn() acquires _LOCK itself on first use, so it must be called
        # before — not inside — the locked section below.
        conn = _conn()
        # The connection's context manager commits on success and rolls back
        # on error, so a failed write never leaves a transaction open.
        with _LOCK, conn:
            conn.execute("INSERT OR REPLACE INTO cache (url, markdown, source_method, ts) "
                         "VALUES (?, ?, ?, ?)",
                         (_norm(url), markdown, source_method, time.time()))
    except Exception as e:
        logger.warning(f"content_cache: write failed: {e}")
switchback/egress.py ADDED
@@ -0,0 +1,108 @@
1
+ """Egress configuration — proxy wiring shared across tiers.
2
+
3
+ Two optional env vars:
4
+
5
+ SCRAPER_PROXY applied to *every* request (all tiers, all URLs).
6
+ SCRAPER_EGRESS_PROXY applied only while a request is in the "egress scope" —
7
+ i.e. for a host the policy flagged needs_egress. This is
8
+ the cost-scoped lever: the easy majority that already
9
+ succeeds free at the datacenter IP stays direct, and only
10
+ the hard, walled hosts spend the (often metered)
11
+ residential proxy bandwidth.
12
+
13
+ The orchestrator opens an egress scope around the cascade for needs_egress hosts
14
+ (see ``egress_scope``); the per-tier helpers resolve the right proxy for the
15
+ current scope. Both shapes are returned — the requests/curl_cffi ``proxies`` dict
16
+ and the Playwright/Camoufox ``proxy`` dict. Nothing set / not in scope → None and
17
+ the tier runs on the direct connection.
18
+ """
19
+ from __future__ import annotations
20
+
21
+ import os
22
+ import threading
23
+ from contextlib import contextmanager
24
+ from urllib.parse import urlsplit
25
+
26
+ # Per-thread scope flag. Threading-safe by construction: each worker thread in a
27
+ # ThreadPoolExecutor gets its own instance (default unset → False), and the
28
+ # orchestrator sets/reads it within the same thread that runs the tier fetch.
29
_scope = threading.local()


@contextmanager
def egress_scope(enabled: bool):
    """Run the ``with`` body with the per-thread egress flag set to *enabled*.

    While the flag is on and SCRAPER_EGRESS_PROXY is set, the proxy helpers
    return that proxy. Whatever value the flag had before is put back on exit,
    whether the body returns early or raises.
    """
    saved = getattr(_scope, "egress", False)
    _scope.egress = True if enabled else False
    try:
        yield
    finally:
        _scope.egress = saved
43
+
44
+
45
def add_wire_bytes(n: int) -> None:
    """Add *n* to this thread's running count of bytes moved over the network.

    The count covers the current URL's cascade: HTTP tiers add the response
    size, browser tiers add each loaded resource. The orchestrator reads it to
    cost residential bandwidth on real wire bytes, not the cleaned markdown.
    """
    running = getattr(_scope, "wire_bytes", 0)
    _scope.wire_bytes = running + int(n)
51
+
52
+
53
def take_wire_bytes() -> int:
    """Hand back this thread's wire-byte count and zero it (call once per URL)."""
    try:
        total = _scope.wire_bytes
    except AttributeError:
        # Nothing has been tallied on this thread yet.
        total = 0
    _scope.wire_bytes = 0
    return total
58
+
59
+
60
def has_egress_proxy() -> bool:
    """Report whether SCRAPER_EGRESS_PROXY (the residential/escalation proxy)
    is set to a non-empty value."""
    configured = os.environ.get("SCRAPER_EGRESS_PROXY")
    return bool(configured)
63
+
64
+
65
def in_egress_scope() -> bool:
    """Return the raw per-thread egress flag (False when never set).

    Exposed so a tier that hands its blocking call to a worker thread can
    re-apply the scope there — thread-locals don't inherit.
    """
    try:
        return _scope.egress
    except AttributeError:
        return False
69
+
70
+
71
def scope_label() -> str:
    """Name the route the current request takes: 'egress' or 'direct'.

    'egress' requires both the per-thread egress flag and a configured
    SCRAPER_EGRESS_PROXY; anything else is 'direct'. cf_clearance is IP-bound,
    so the session cache keys on this label to never replay a cookie across
    the direct/proxy boundary.
    """
    via_egress = getattr(_scope, "egress", False) and os.getenv("SCRAPER_EGRESS_PROXY")
    return "egress" if via_egress else "direct"
78
+
79
+
80
def _active_proxy_url() -> str | None:
    """Pick the proxy URL that applies to the current scope.

    Inside an egress scope with SCRAPER_EGRESS_PROXY set, that proxy wins.
    Otherwise the global SCRAPER_PROXY is used, or None when it is unset or
    empty.
    """
    if getattr(_scope, "egress", False):
        egress_proxy = os.getenv("SCRAPER_EGRESS_PROXY")
        if egress_proxy:
            return egress_proxy
    global_proxy = os.getenv("SCRAPER_PROXY")
    return global_proxy if global_proxy else None
86
+
87
+
88
def requests_proxies() -> dict | None:
    """Proxy mapping for curl_cffi / requests / cloudscraper.

    Returns {"http": url, "https": url} for the active proxy, or None when no
    proxy applies to the current scope.
    """
    proxy_url = _active_proxy_url()
    if not proxy_url:
        return None
    return dict.fromkeys(("http", "https"), proxy_url)
92
+
93
+
94
def _playwright_proxy_from_url(url: str) -> dict[str, str]:
    """Split a proxy URL into the {"server", "username"?, "password"?} shape."""
    from urllib.parse import unquote

    parts = urlsplit(url)
    host = parts.hostname
    if not host:
        # No scheme://host to take apart (e.g. the short form "host:3128"):
        # pass the value through untouched rather than emit "scheme://None".
        return {"server": url}
    if ":" in host:
        # urlsplit strips the brackets from an IPv6 literal; put them back so
        # the port stays unambiguous.
        host = f"[{host}]"
    server = f"{parts.scheme}://{host}"
    if parts.port:
        server += f":{parts.port}"
    cfg: dict[str, str] = {"server": server}
    # Credentials inside a URL are percent-encoded, but here they are handed
    # over as separate plain fields, so decode them.
    if parts.username:
        cfg["username"] = unquote(parts.username)
    if parts.password:
        cfg["password"] = unquote(parts.password)
    return cfg


def playwright_proxy() -> dict | None:
    """Proxy settings for patchright / camoufox.

    Returns:
        {"server": "scheme://host[:port]"} plus "username" / "password" when
        the active proxy URL carries credentials, or None when no proxy
        applies to the current scope.
    """
    url = _active_proxy_url()
    if not url:
        return None
    return _playwright_proxy_from_url(url)
switchback/extract.py ADDED
@@ -0,0 +1,56 @@
1
+ """Per-domain extraction preferences — remember how to carve each site.
2
+
3
+ Markdown of the whole page is the default. Some sites need scoping (drop the
4
+ mega-nav, keep the article) or specific elements pulled out. Rather than hardcode
5
+ that per tier, declare it once per host in ``config/extraction.json`` and every
6
+ tier's normalize step picks it up:
7
+
8
+ {
9
+ "www.example.com": {"selector": "article.main", "drop": [".ad", ".related"]},
10
+ "blog.example.org": {"selector": "main .post-body"}
11
+ }
12
+
13
+ selector CSS selector to scope to (first match wins); page minus the rest.
14
+ drop extra CSS selectors to remove before converting (ads, share bars).
15
+
16
+ Matching is by exact host (FQDN), consistent with the botwall policy. Absent /
17
+ unparseable config → no prefs, default whole-page markdown. Override the path
18
+ with SCRAPER_EXTRACTION_FILE.
19
+ """
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ import logging
24
+ import os
25
+
26
+ from .policy.gates import host_of
27
+
28
logger = logging.getLogger(__name__)

_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
_DEFAULT_FILE = os.path.join(_PROJECT_ROOT, "config", "extraction.json")
PREFS_FILE = os.getenv("SCRAPER_EXTRACTION_FILE", _DEFAULT_FILE)

_PREFS: dict | None = None


def _load() -> dict:
    """Return the host → prefs mapping from PREFS_FILE, reading it at most once.

    The result is cached in _PREFS for the life of the process. A missing file
    yields {} silently; an unreadable or unparseable file, or a JSON document
    that is not an object, yields {} with a warning — so callers can always
    use ``.get`` on the result.
    """
    global _PREFS
    if _PREFS is not None:
        return _PREFS
    prefs: dict = {}
    try:
        with open(PREFS_FILE, encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        pass  # no config file is the normal case: whole-page markdown
    except Exception as e:
        logger.warning(f"extract: could not read {PREFS_FILE}: {e}")
    else:
        if isinstance(loaded, dict):
            prefs = loaded
        elif loaded is not None:
            # e.g. a top-level list: valid JSON, but not a host → prefs map.
            logger.warning(
                "extract: %s must contain a JSON object, got %s; ignoring",
                PREFS_FILE, type(loaded).__name__)
    _PREFS = prefs
    return _PREFS
50
+
51
+
52
def prefs_for(url: str | None) -> dict:
    """Look up the extraction prefs configured for *url*'s host.

    Returns {} when *url* is empty/None or the host has no entry.
    """
    return _load().get(host_of(url), {}) if url else {}
switchback/flags.py ADDED
@@ -0,0 +1,96 @@
1
+ """Periodic flagging — surface the things worth a human glance, on a schedule.
2
+
3
+ Requirement 13: "the system should flag some of these details from time to
4
+ time." This isn't a daemon — it's a single pass over the metrics rollup that
5
+ emits a digest to the logger (which ships to the OTLP backend when setup_logs() is on).
6
+ Run it from cron / the /loop skill / any scheduler:
7
+
8
+ python -m switchback.flags # text digest to stdout + logs
9
+ python -m switchback.flags --minutes 60 # only the last hour
10
+ python -m switchback.flags --json # machine-readable digest
11
+
12
+ What it flags:
13
+ • domains still landing on paid Firecrawl (winning_tier == tier4_firecrawl)
14
+ • domains escalated to residential egress (needs_egress)
15
+ • domains throwing the most bot-wall challenges (by vendor)
16
+ • low coverage / negative cost savings in the window
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import logging
23
+ from datetime import datetime, timedelta, timezone
24
+
25
+ from .reporting import build_report
26
+ from .tracing import setup_logs
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ # A domain is "stuck" if its winning tier is the paid one — these are the hosts
31
+ # that still cost money and are the prime targets for a new tier / cookie / rule.
32
_PAID_TIER = "tier4_firecrawl"


def build_digest(minutes: int | None = None) -> dict:
    """Condense the metrics report into the handful of things worth flagging.

    Args:
        minutes: Only consider the last N minutes; None (or 0) means no
            window is passed to build_report.

    Returns:
        A dict with ``window_minutes``, the report's ``coverage`` and ``cost``
        sections, ``stuck_on_firecrawl`` (hosts whose winning tier is the paid
        one, most attempts first), ``needs_egress`` (hosts flagged
        needs_egress) and ``top_challenged`` (up to ten hosts with the most
        challenges, as ``{"host", "challenges"}`` entries).
    """
    since = None
    if minutes:
        since = datetime.now(timezone.utc) - timedelta(minutes=minutes)
    rep = build_report(since=since)
    domains = rep["domains"]

    stuck = [host for host, info in domains.items()
             if info.get("winning_tier") == _PAID_TIER]
    stuck.sort(key=lambda host: -domains[host]["attempts"])

    egress = [host for host, info in domains.items() if info.get("needs_egress")]

    challenged = [(host, info["challenges"]) for host, info in domains.items()
                  if info.get("challenges")]
    challenged.sort(key=lambda pair: -sum(pair[1].values()))
    top_challenged = [{"host": host, "challenges": counts}
                      for host, counts in challenged[:10]]

    return {
        "window_minutes": minutes,
        "coverage": rep["coverage"],
        "cost": rep["cost"],
        "stuck_on_firecrawl": stuck,
        "needs_egress": egress,
        "top_challenged": top_challenged,
    }
57
+
58
+
59
def emit(digest: dict) -> None:
    """Write the digest to the logger, one line per finding.

    The coverage/cost summary and the per-host challenge counts go out at
    INFO. Hosts still on paid Firecrawl, hosts escalated to residential
    egress, and negative savings go out at WARNING so they surface in the
    OTLP backend. Host lists are capped at 20 names per line.
    """
    cov = digest["coverage"]
    cost = digest["cost"]
    logger.info("flags: coverage %.1f%% (%d/%d urls), savings $%.4f (%.1f%%)",
                cov["success_pct"], cov["succeeded"], cov["unique_urls"],
                cost["savings_usd"], cost["savings_pct"])

    stuck = digest["stuck_on_firecrawl"]
    if stuck:
        logger.warning("flags: %d domain(s) still on paid Firecrawl: %s",
                       len(stuck), ", ".join(stuck[:20]))

    escalated = digest["needs_egress"]
    if escalated:
        logger.warning("flags: %d domain(s) escalated to residential egress: %s",
                       len(escalated), ", ".join(escalated[:20]))

    for entry in digest["top_challenged"]:
        logger.info("flags: %s challenges %s", entry["host"], entry["challenges"])

    savings = cost["savings_usd"]
    if savings < 0:
        logger.warning("flags: NEGATIVE savings ($%.4f) — engine cost exceeds the "
                       "Firecrawl baseline in this window", savings)
77
+
78
+
79
def main() -> None:
    """Command-line entry: build the digest, then print it as JSON or log it.

    ``--minutes N`` limits the window to the last N minutes. With ``--json``
    the digest is printed to stdout as indented JSON; without it the digest
    is handed to emit().
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--minutes", type=int, help="Only consider the last N minutes")
    parser.add_argument("--json", action="store_true", help="Emit the digest as JSON")
    opts = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    setup_logs()  # ship the digest to the OTLP backend too when configured
    digest = build_digest(opts.minutes)
    if not opts.json:
        emit(digest)
        return
    print(json.dumps(digest, indent=2))


if __name__ == "__main__":
    main()
@@ -0,0 +1,81 @@
1
+ """Shared content normalization — HTML→Markdown and PDF→text.
2
+
3
+ Ported from musings-by-hermes/scripts/muse_helpers.py (the most mature version):
4
+ strips boilerplate, promotes lazy-loaded images, resolves relative URLs.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import io
9
+ import logging
10
+
11
logger = logging.getLogger(__name__)

UA = ("Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

# Lazy-load attributes, in the order they are tried.
_LAZY_SRC_ATTRS = ("data-src", "data-original", "data-lazy-src")


def _pick_img_src(img) -> str | None:
    """Choose the URL an <img> should end up with in ``src``.

    *img* is anything with a mapping-style ``.get(name)`` for attributes.
    A real ``src`` wins. When ``src`` is missing or is an inline ``data:`` URI,
    the lazy-load attributes and then the first ``srcset`` candidate are
    tried, because a ``data:`` src next to those is a placeholder. A ``data:``
    src with no alternative is returned unchanged; None means the tag carries
    no usable source.
    """
    src = img.get("src")
    if src and not src.lstrip().lower().startswith("data:"):
        return src
    for attr in _LAZY_SRC_ATTRS:
        lazy = img.get(attr)
        if lazy:
            return lazy
    srcset = img.get("srcset")
    if srcset:
        # First candidate is "url [descriptor]"; keep just the URL.
        candidate = srcset.split(",")[0].split()
        if candidate:
            return candidate[0]
    return src or None


def html_to_markdown(html: str, base_url: str | None = None) -> str:
    """HTML → Markdown, preserving images/blockquotes/code.

    - Strips script/style/noscript/nav/header/footer/aside/form/iframe
      boilerplate.
    - Applies any per-domain extraction prefs (scope selector / extra drops),
      see switchback.extract.
    - Promotes lazy-load attrs (data-src, data-original, data-lazy-src,
      srcset) to src, including when src only holds a ``data:`` placeholder.
    - Resolves relative image/link URLs against base_url.

    Args:
        html: The page markup; None/empty is treated as an empty document.
        base_url: The page URL, used for per-domain prefs and for resolving
            relative URLs. Without it, URLs are left as they are.

    Returns:
        The stripped Markdown. If the pre-clean step fails, the raw HTML is
        converted as-is; if the conversion itself fails (or markdownify is
        unavailable), the stripped input HTML is returned instead.
    """
    try:
        from markdownify import markdownify
        try:
            from bs4 import BeautifulSoup
            from urllib.parse import urljoin

            from .extract import prefs_for
            prefs = prefs_for(base_url)

            soup = BeautifulSoup(html or "", "html.parser")
            for tag in soup(["script", "style", "noscript", "nav", "header",
                             "footer", "aside", "form", "iframe"]):
                tag.decompose()
            # Per-domain: remove configured noise, then scope to the content node.
            for sel in prefs.get("drop", []):
                for tag in soup.select(sel):
                    tag.decompose()
            selector = prefs.get("selector")
            if selector:
                node = soup.select_one(selector)
                if node is not None:
                    soup = BeautifulSoup(str(node), "html.parser")
                else:
                    logger.debug(f"extract: selector {selector!r} matched nothing for {base_url}")
            for img in soup.find_all("img"):
                src = _pick_img_src(img)
                if src:
                    if base_url:
                        src = urljoin(base_url, src)
                    img["src"] = src
            if base_url:
                for a in soup.find_all("a", href=True):
                    a["href"] = urljoin(base_url, a["href"])
            html = str(soup)
        except Exception as e:
            # Pre-clean is best-effort: fall through and convert the raw HTML.
            logger.debug(f"soup pre-clean skipped: {e}")
        md = markdownify(html, heading_style="ATX", code_language="",
                         bullets="-", strip=["script", "style"])
        return (md or "").strip()
    except Exception as e:
        logger.warning(f"markdownify failed: {e}")
        return (html or "").strip()
71
+
72
+
73
def pdf_bytes_to_text(data: bytes) -> str:
    """Pull the text out of a PDF held in memory — nothing is written to disk.

    Each page's text (empty when a page yields none) is joined with a blank
    line between pages, and the result is stripped.
    """
    from pypdf import PdfReader
    with io.BytesIO(data) as stream:
        page_texts = [(page.extract_text() or "") for page in PdfReader(stream).pages]
        return "\n\n".join(page_texts).strip()