switchback 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,393 @@
1
+ """Botwall v2 — adaptive per-host + per-URL routing policy.
2
+
3
+ Ported from musings' botwall, with one addition: it records *which tier wins*
4
+ per host (not just failures), so the orchestrator can start a known-hard host at
5
+ its winning tier instead of replaying tiers that always miss.
6
+
7
+ Skip granularity
8
+ ----------------
9
+ - Host-level skip: only for seeded hard-block domains and manual overrides.
10
+ Auto-promotion never elevates an entire domain to skip.
11
+ - URL-level skip: individual articles/paths are excluded after PROMOTE_URL_AFTER
12
+ consecutive hard failures (botwall hit or short content), so one bad URL never
13
+ taints its whole domain.
14
+
15
+ State files (all in SCRAPER_STATE_DIR):
16
+ botwall_db.json — host + URL records (authoritative state)
17
+ botwall_events.jsonl — every scrape outcome (machine-readable audit trail)
18
+ botwall_excluded.jsonl — every URL-level exclusion event (machine-readable)
19
+ botwall_excluded.log — same, human-readable one-liner per exclusion
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import json
24
+ import logging
25
+ import os
26
+ import threading
27
+ from datetime import datetime, timezone
28
+
29
+ from .gates import host_of
30
+
31
logger = logging.getLogger(__name__)

# Held by save_db() around the temp-file write and the rename onto DB_PATH.
_DB_WRITE_LOCK = threading.Lock()

# Three directory levels above this module file.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# State directory: SCRAPER_STATE_DIR wins, otherwise <project root>/state.
_DEFAULT_STATE_DIR = os.path.join(_PROJECT_ROOT, "state")
_STATE_DIR = os.environ.get("SCRAPER_STATE_DIR", _DEFAULT_STATE_DIR)
os.makedirs(_STATE_DIR, exist_ok=True)  # NOTE: runs at import time

# The four state files described in the module docstring.
DB_PATH, EVENTS_PATH, EXCLUDED_JSONL, EXCLUDED_LOG = (
    os.path.join(_STATE_DIR, _state_file)
    for _state_file in (
        "botwall_db.json",
        "botwall_events.jsonl",
        "botwall_excluded.jsonl",
        "botwall_excluded.log",
    )
)

# Manual URL skip list: SCRAPER_BOTWALL_SKIP_URLS_FILE wins, otherwise
# <project root>/config/botwall_skip_urls.txt.
_DEFAULT_SKIP_URLS_FILE = os.path.join(_PROJECT_ROOT, "config", "botwall_skip_urls.txt")
SKIP_URLS_FILE = os.environ.get("SCRAPER_BOTWALL_SKIP_URLS_FILE", _DEFAULT_SKIP_URLS_FILE)
46
+
47
# Hard-blocking hosts. load_db() inserts each one as a "skip" record whenever
# it is absent from the database; the value is the recorded reason.
SEED_HOSTS = {
    **dict.fromkeys(
        ("www.sciencedirect.com", "sciencedirect.com"),
        "seed: Cloudflare 1015",
    ),
    "linkinghub.elsevier.com": "seed: redirects to sciencedirect",
    "onlinelibrary.wiley.com": "seed: Cloudflare",
    **dict.fromkeys(
        ("www.tandfonline.com", "tandfonline.com"),
        "seed: Cloudflare",
    ),
}
56
+
57
+ # ── Config (all overridable via env vars) ─────────────────────────────────────
58
+ #
59
+ # SCRAPER_BOTWALL_URL_SKIP_AFTER int ≥0, default 2
60
+ # Hard failures (botwall/short-content) on the *same URL* before that URL is
61
+ # excluded. Set to 0 to disable URL-level auto-exclusion entirely.
62
+ #
63
+ # SCRAPER_BOTWALL_DOMAIN_SKIP_AFTER int ≥0, default 0 (disabled)
64
+ # Hard failures across *any* URLs on the same domain before the whole domain is
65
+ # skip-listed. 0 (default) means domains are never auto-skipped — only seeded
66
+ # hard-block domains and manual overrides are domain-level skips.
67
+ #
68
+ # SCRAPER_BOTWALL_COUNT_FIRECRAWL bool default false
69
+ # When true, each time Firecrawl is invoked for a host it counts as a failure
70
+ # toward the domain skip threshold (original v1 behaviour). No effect if
71
+ # SCRAPER_BOTWALL_DOMAIN_SKIP_AFTER is 0.
72
+
73
def _int_env(name: str, default: int) -> int:
    """Read environment variable *name* as an integer.

    Returns *default* when the variable is unset, and also (after logging a
    warning) when its value cannot be parsed by ``int()``.
    """
    raw = os.getenv(name, str(default))
    try:
        value = int(raw)
    except ValueError:
        logger.warning(f"botwall: invalid {name}; using {default}")
        value = default
    return value
79
+
80
# Per-URL and per-domain promotion thresholds (see the config notes above).
PROMOTE_URL_AFTER = _int_env("SCRAPER_BOTWALL_URL_SKIP_AFTER", 2)
PROMOTE_DOMAIN_AFTER = _int_env("SCRAPER_BOTWALL_DOMAIN_SKIP_AFTER", 0)
COUNT_FIRECRAWL = (
    os.getenv("SCRAPER_BOTWALL_COUNT_FIRECRAWL", "").lower() in {"1", "true", "yes"}
)

# Cooldown, in hours, of an auto-skipped URL. While it runs the URL counts as
# skipped; afterwards is_url_skipped() reports it as not skipped so it gets
# re-tested. A failed re-test re-stamps the cooldown and flags the host
# needs_egress. 0 keeps auto-skips permanent (legacy behaviour).
URL_SKIP_COOLDOWN_H = _int_env("SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H", 24)

# Number of egress-worthy failures at non-residential tiers after which a host
# is flagged needs_egress. 0 disables the escalation.
PROMOTE_EGRESS_AFTER = _int_env("SCRAPER_BOTWALL_EGRESS_AFTER", 2)

# Outcomes that count toward excluding a single URL. rate_limited (429) is
# left out on purpose: it is transient, so it may escalate egress but never
# contributes to skipping the URL.
_URL_SKIP_OUTCOMES = ("botwall", "short_content", "http_block")

# Outcomes that point at the IP/identity and therefore count toward egress
# escalation: the URL-skip outcomes plus the transient rate limit.
_EGRESS_OUTCOMES = _URL_SKIP_OUTCOMES + ("rate_limited",)
103
+
104
+
105
def _now() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    stamp = datetime.now(tz=timezone.utc)
    return stamp.isoformat()
107
+
108
+
109
+ def _age_seconds(ts: str | None) -> float | None:
110
+ """Seconds since an ISO timestamp, or None if missing/unparseable."""
111
+ if not ts:
112
+ return None
113
+ try:
114
+ return (datetime.now(timezone.utc) - datetime.fromisoformat(ts)).total_seconds()
115
+ except Exception:
116
+ return None
117
+
118
+
119
def _new_record(reason="", status="allow") -> dict:
    """Build a fresh host record with zeroed counters.

    ``first_seen`` and ``last_event`` share one timestamp; ``tier_stats`` maps
    a tier name to its ``{ok, miss}`` counts and starts empty.
    """
    stamp = _now()
    return dict(
        status=status,
        reason=reason,
        winning_tier=None,
        tier_stats={},
        total_attempts=0,
        successes=0,
        first_seen=stamp,
        last_event=stamp,
        manual_override=None,
    )
128
+
129
+
130
+
131
def _parse_skip_urls_file(path: str) -> dict[str, str]:
    """Return ``{url: reason}`` parsed from a skip-URLs config file.

    Format: one URL per line; blank lines and lines starting with ``#`` are
    ignored; an optional trailing `` # reason`` supplies the reason, otherwise
    a default manual reason is used. A missing file yields an empty dict. A
    read or decode error is logged and whatever was parsed so far is returned.

    The file is decoded as UTF-8 (a leading BOM is tolerated) instead of the
    platform default, so results do not depend on the host locale and a BOM
    cannot turn a leading comment line into a bogus URL entry.
    """
    out: dict[str, str] = {}
    try:
        with open(path, encoding="utf-8-sig") as f:
            for raw in f:
                line = raw.strip()
                if not line or line.startswith("#"):
                    continue
                if " #" in line:
                    url, _, reason = line.partition(" #")
                    out[url.strip()] = reason.strip()
                else:
                    out[line] = "manual: botwall_skip_urls.txt"
    except FileNotFoundError:
        # No skip file configured: nothing to exclude.
        pass
    except (OSError, UnicodeError) as e:
        logger.warning(f"botwall: could not read {path}: {e}")
    return out
150
+
151
+
152
def load_db() -> dict:
    """Load the botwall database from DB_PATH, seed it, and return it.

    Starts from an empty v2 skeleton when the file is missing, unreadable, or
    does not hold a JSON object. A ``hosts`` / ``urls`` section that is missing
    or is not an object is replaced by an empty dict. Hosts in SEED_HOSTS and
    URLs from SKIP_URLS_FILE are then inserted as "skip" records when absent.
    The database is written back when anything was added or repaired, or when
    no file exists yet.

    NOTE: when an unreadable file is replaced by a fresh database, the seeding
    step triggers a save that overwrites the original file.
    """
    db = {"version": 2, "updated_at": "", "hosts": {}, "urls": {}}
    changed = False
    if os.path.exists(DB_PATH):
        try:
            with open(DB_PATH, encoding="utf-8") as f:
                loaded = json.load(f)
        except Exception as e:
            logger.error(f"botwall: load failed ({e}); starting fresh")
        else:
            if isinstance(loaded, dict):
                db = loaded
            else:
                # Valid JSON with the wrong root type (e.g. [] or null) used to
                # crash on db.setdefault() below.
                logger.error(
                    "botwall: load failed (expected a JSON object, got "
                    f"{type(loaded).__name__}); starting fresh")
    for section in ("hosts", "urls"):
        if section not in db:
            db[section] = {}
        elif not isinstance(db[section], dict):
            logger.error(f"botwall: '{section}' is not an object; resetting it")
            db[section] = {}
            changed = True
    hosts = db["hosts"]
    urls = db["urls"]
    for host, reason in SEED_HOSTS.items():
        if host not in hosts:
            hosts[host] = _new_record(reason=reason, status="skip")
            changed = True
    for url, reason in _parse_skip_urls_file(SKIP_URLS_FILE).items():
        if url not in urls:
            now = _now()
            urls[url] = {"status": "skip", "reason": f"seed: {reason}",
                         "failures": 0, "first_seen": now, "last_event": now}
            changed = True
    if changed or not os.path.exists(DB_PATH):
        save_db(db)
    return db
176
+
177
+
178
def save_db(db: dict) -> None:
    """Stamp ``updated_at`` and write *db* to DB_PATH via a temp file.

    The JSON is written to a temp file whose name embeds the process id and
    thread id, then renamed over DB_PATH with ``os.replace``; both steps run
    under _DB_WRITE_LOCK. If serialisation or the rename fails, the temp file
    is removed before the error propagates (it used to be left behind).

    Raises:
        Whatever ``open`` / ``json.dump`` / ``os.replace`` raise.
    """
    db["updated_at"] = _now()
    tmp = f"{DB_PATH}.tmp.{os.getpid()}.{threading.get_ident()}"
    with _DB_WRITE_LOCK:
        try:
            with open(tmp, "w", encoding="utf-8") as f:
                json.dump(db, f, indent=2, sort_keys=True)
            os.replace(tmp, DB_PATH)
        except BaseException:
            # Do not leak a partial temp file next to the database.
            try:
                os.unlink(tmp)
            except OSError:
                pass
            raise
185
+
186
+
187
def is_skipped(host: str, db: dict) -> bool:
    """Tell whether *host* is skip-listed at host level.

    A ``manual_override`` of "allow" or "skip" takes precedence over the
    record's ``status``; a host with no record is never skipped.
    """
    entry = db.get("hosts", {}).get(host)
    if not entry:
        return False
    override = entry.get("manual_override")
    if override in ("allow", "skip"):
        return override == "skip"
    return entry.get("status") == "skip"
196
+
197
+
198
+ def winning_tier(host: str, db: dict) -> str | None:
199
+ rec = db.get("hosts", {}).get(host)
200
+ return rec.get("winning_tier") if rec else None
201
+
202
+
203
def needs_egress(host: str, db: dict) -> bool:
    """Report whether *host* carries a truthy ``needs_egress`` flag.

    The flag marks hosts that the local tiers have repeatedly failed on, so
    later attempts should escalate to the residential-egress tier.
    """
    entry = db.get("hosts", {}).get(host)
    if not entry:
        return False
    return bool(entry.get("needs_egress"))
208
+
209
+
210
def _log_event(url, tier, outcome, md_len=None, error=None, latency_ms=None,
               status_code=None, challenge=None) -> None:
    """Append one scrape-outcome event to EVENTS_PATH as a JSON line.

    Best effort: a serialisation or write failure is logged as a warning and
    never raised.
    """
    event = {
        "ts": _now(),
        "url": url,
        "host": host_of(url),
        "tier": tier,
        "outcome": outcome,
        "md_len": md_len,
        "latency_ms": latency_ms,
        "error": error,
        "status_code": status_code,
        "challenge": challenge,
    }
    try:
        with open(EVENTS_PATH, "a") as sink:
            sink.write(f"{json.dumps(event)}\n")
    except Exception as e:
        logger.warning(f"botwall: event log failed: {e}")
220
+
221
+
222
def log_final(url: str, outcome: str, latency_ms=None, error=None,
              status_code=None) -> None:
    """Log one aggregate event for the final result of a URL's cascade.

    Intended for outcomes such as all_failed / deadline_exceeded / *_skipped,
    so the reason a URL ultimately failed is a row of its own rather than
    something rebuilt from per-tier events. The tier is written as
    '<cascade>' to tell these rows apart.
    """
    _log_event(
        url,
        "<cascade>",
        outcome,
        latency_ms=latency_ms,
        error=error,
        status_code=status_code,
    )
230
+
231
+
232
def is_url_skipped(url: str, db: dict) -> bool:
    """Tell whether *url* is excluded right now.

    Seeded / manual skips (reason starting with "seed:" or "manual:") are
    permanent, as is every skip when URL_SKIP_COOLDOWN_H is 0. Any other skip
    holds only while the record's ``last_event`` is younger than the cooldown;
    a missing or unparseable ``last_event`` counts as expired.
    """
    entry = db.get("urls", {}).get(url)
    if not entry or entry.get("status") != "skip":
        return False
    why = str(entry.get("reason", ""))
    permanent = why.startswith(("seed:", "manual:")) or not URL_SKIP_COOLDOWN_H
    if permanent:
        return True
    elapsed = _age_seconds(entry.get("last_event"))
    if elapsed is None:
        return False
    return elapsed < URL_SKIP_COOLDOWN_H * 3600
246
+
247
+
248
def _log_exclusion(url: str, reason: str) -> None:
    """Record an exclusion in the JSONL file and in the one-line text log.

    Best effort: a write failure is logged as a warning and never raised.
    """
    stamp = _now()
    entry = {"ts": stamp, "url": url, "host": host_of(url), "reason": reason}
    try:
        with open(EXCLUDED_JSONL, "a") as jsonl:
            jsonl.write(json.dumps(entry) + "\n")
        with open(EXCLUDED_LOG, "a") as text_log:
            text_log.write(f"{stamp} EXCLUDED {url} [{reason}]\n")
    except Exception as e:
        logger.warning(f"botwall: exclusion log failed: {e}")
258
+
259
+
260
def _track_url_failure(url: str, outcome: str, db: dict) -> None:
    """Count a hard failure for *url* and exclude the URL at the threshold.

    Only outcomes in _URL_SKIP_OUTCOMES are counted; anything else returns
    without touching the database. Every counted failure re-stamps
    ``last_event``, which is what the skip cooldown is measured from.

    A URL that is already skipped and fails again has its host flagged
    needs_egress. Otherwise the URL is excluded once its failure count reaches
    PROMOTE_URL_AFTER. A threshold of 0 (or below) disables auto-exclusion, as
    the configuration notes document; previously ``failures >= 0`` was always
    true, so 0 excluded every URL on its first failure.
    """
    if outcome not in _URL_SKIP_OUTCOMES:
        return
    urls = db.setdefault("urls", {})
    rec = urls.get(url)
    if rec is None:
        rec = {"status": "allow", "failures": 0, "reason": "",
               "first_seen": _now(), "last_event": _now()}
        urls[url] = rec
    # .get(): tolerate a record that carries no failure counter.
    rec["failures"] = rec.get("failures", 0) + 1
    rec["last_event"] = _now()  # re-stamp: extends the cooldown window
    if rec.get("status") == "skip":
        # Failed again while already skipped: stay skipped (cooldown re-stamped
        # above) and escalate the host's egress for the next attempt.
        _mark_needs_egress(host_of(url), db)
        return
    if PROMOTE_URL_AFTER > 0 and rec["failures"] >= PROMOTE_URL_AFTER:
        rec["status"] = "skip"
        rec["reason"] = f"auto: {rec['failures']}× {outcome}"
        logger.info(f"botwall excluded URL: {url} ({rec['reason']})")
        _log_exclusion(url, rec["reason"])
288
+
289
+
290
def _mark_needs_egress(host: str, db: dict) -> None:
    """Set ``needs_egress`` on the record of *host* and log it.

    Does nothing when the host has no record or is already flagged.
    """
    rec = db.get("hosts", {}).get(host)
    if rec is None or rec.get("needs_egress"):
        return
    rec["needs_egress"] = True
    logger.info(f"botwall: host flagged needs_egress (local tiers walled): {host}")
296
+
297
+
298
def _track_egress(host: str, tier: str, outcome: str, db: dict) -> None:
    """Count an egress-worthy failure and flag the host at the threshold.

    Ignored when escalation is disabled (PROMOTE_EGRESS_AFTER is 0), when the
    outcome is not in _EGRESS_OUTCOMES, when the failing tier is
    "tier_residential" (its own misses would be circular), or when the host
    has no record or is already flagged. Hard HTTP blocks (http_block /
    rate_limited) are included so hosts that block the local IP escalate too.
    """
    escalation_off = not PROMOTE_EGRESS_AFTER
    if escalation_off or outcome not in _EGRESS_OUTCOMES or tier == "tier_residential":
        return
    rec = db.get("hosts", {}).get(host)
    if rec is None or rec.get("needs_egress"):
        return
    strikes = rec.get("egress_failures", 0) + 1
    rec["egress_failures"] = strikes
    if strikes >= PROMOTE_EGRESS_AFTER:
        _mark_needs_egress(host, db)
316
+
317
+
318
def _clear_url_skip(url: str, db: dict) -> None:
    """Self-heal after a success: un-skip *url* and reset its failure streak.

    A skipped URL goes back to "allow" with a zeroed counter, an emptied
    reason and a fresh ``last_event``. A URL that is not skipped but has
    accumulated failures gets its counter zeroed as well, so only
    *consecutive* failures count toward exclusion; previously an old failure
    survived the success and one later failure could exclude the URL.
    """
    rec = db.get("urls", {}).get(url)
    if not rec:
        return
    if rec.get("status") == "skip":
        rec.update(status="allow", failures=0, reason="", last_event=_now())
        logger.info(f"botwall: URL skip cleared after success: {url}")
    elif rec.get("failures"):
        # The success breaks the failure streak.
        rec["failures"] = 0
324
+
325
+
326
def _track_domain_failure(host: str, outcome: str, db: dict) -> None:
    """Optionally skip-list a whole domain after repeated hard failures.

    Inactive while PROMOTE_DOMAIN_AFTER is 0. Counts botwall / short_content
    outcomes, plus firecrawl_used when COUNT_FIRECRAWL is set. Hosts without a
    record, with a manual override, or already skipped are left alone.
    """
    if not PROMOTE_DOMAIN_AFTER:
        return
    hard_failure = outcome in ("botwall", "short_content")
    firecrawl_strike = COUNT_FIRECRAWL and outcome == "firecrawl_used"
    if not (hard_failure or firecrawl_strike):
        return
    rec = db["hosts"].get(host)
    if not rec:
        return
    if rec.get("manual_override") or rec.get("status") == "skip":
        return
    strikes = rec.get("domain_failures", 0) + 1
    rec["domain_failures"] = strikes
    if strikes < PROMOTE_DOMAIN_AFTER:
        return
    rec["status"] = "skip"
    rec["reason"] = f"auto: {strikes}× domain {outcome}"
    logger.info(f"botwall domain skip-listed: {host} ({rec['reason']})")
    _log_exclusion(f"domain:{host}", rec["reason"])
348
+
349
+
350
def record(db: dict, url: str, tier: str, outcome: str, md_len=None, error=None,
           latency_ms=None, status_code=None, challenge=None) -> None:
    """Fold one tier attempt into the database and append it to the event log.

    Updates the host's attempt counters, per-tier ok/miss stats and, on
    success, ``winning_tier``; then runs the URL-, egress- and (optional)
    domain-level failure trackers.

    outcome ∈ {ok, short_content, botwall, http_block, rate_limited, timeout,
    connection, http_error, error, firecrawl_used}.

    `challenge` names the bot-wall vendor (cloudflare / datadome / akamai / …)
    when one was served; occurrences are counted per host. The host key is the
    full hostname, so the counts are per subdomain. URLs with no parsable host
    are ignored entirely (no counters, no event).
    """
    host = host_of(url)
    if not host:
        return

    all_hosts = db.setdefault("hosts", {})
    host_rec = all_hosts.get(host) or _new_record()
    host_rec["total_attempts"] += 1
    host_rec["last_event"] = _now()

    if challenge:
        seen = host_rec.setdefault("challenge_counts", {})
        seen[challenge] = seen.get(challenge, 0) + 1

    per_tier = host_rec.setdefault("tier_stats", {})
    tier_counts = per_tier.setdefault(tier, {"ok": 0, "miss": 0})
    if outcome != "ok":
        tier_counts["miss"] += 1
    else:
        host_rec["successes"] += 1
        tier_counts["ok"] += 1
        host_rec["winning_tier"] = tier
        # A success clears the egress flag and its escalation counter.
        host_rec["needs_egress"] = False
        host_rec["egress_failures"] = 0
        _clear_url_skip(url, db)  # self-heal a previously-excluded URL

    # Store the record before the trackers run: they look the host up in db.
    all_hosts[host] = host_rec

    _track_url_failure(url, outcome, db)
    _track_egress(host, tier, outcome, db)
    _track_domain_failure(host, outcome, db)

    _log_event(
        url, tier, outcome,
        md_len=md_len, error=error, latency_ms=latency_ms,
        status_code=status_code, challenge=challenge,
    )
@@ -0,0 +1,173 @@
1
+ """Quality gates — minimum acceptable content length per host.
2
+
3
+ A page that renders to a few hundred chars of nav is a failure, not a success;
4
+ the gate makes a tier "fall through" instead of returning junk.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import re
9
+ from urllib.parse import urlparse
10
+
11
# Default minimum content length for any host without an override below.
MIN_MD_LEN = 2000

# Per-host overrides for sites whose pages are legitimately short
# (API stubs, curated explainers).
MIN_MD_LEN_PER_HOST = dict(
    [
        ("arxiv.org", 500),
        ("export.arxiv.org", 500),
        ("en.wikipedia.org", 1000),
        ("www.metmuseum.org", 500),
    ]
)
20
+
21
+
22
def host_of(url: str) -> str:
    """Return the lower-cased hostname of *url*, or "" when there is none.

    Any parsing error also yields "".
    """
    try:
        hostname = urlparse(url).hostname
    except Exception:
        return ""
    return hostname.lower() if hostname else ""
27
+
28
+
29
def min_len_for(url: str) -> int:
    """Return the length floor for *url*: its host override or MIN_MD_LEN."""
    host = host_of(url)
    return MIN_MD_LEN_PER_HOST.get(host, MIN_MD_LEN)
31
+
32
+
33
+ # High-precision bot-wall / block-page markers, each tagged with the vendor that
34
+ # serves it. A page can clear the length gate yet be a Cloudflare "Just a
35
+ # moment..." interstitial (long but worthless), so the length floor alone isn't
36
+ # enough. These are scanned ONLY in the head of the content (the title /
37
+ # first-heading region) — a long article that merely mentions one of these
38
+ # phrases in its body won't trip the gate. Keep this list narrow: a false
39
+ # positive (rejecting a real page) is worse than missing an exotic wall.
40
+ #
41
+ # The vendor tag is what lets the policy *learn which wall* a host serves
42
+ # (recorded per host in botwall_db.json), so dashboards can show challenges by
43
+ # domain and routing can adapt. Order matters: the first match wins, so put the
44
+ # vendor-specific phrases before the generic ones.
45
# (marker phrase, vendor) pairs, grouped by vendor. Both the vendor groups and
# the phrases inside them keep their order because the first match wins.
_BOTWALL_MARKERS = tuple(
    (phrase, vendor)
    for vendor, phrases in (
        ("cloudflare", (
            "just a moment...",
            "checking your browser",
            "attention required! | cloudflare",
            "verifying you are human",  # Turnstile newer copy
            "verify you are human",
            "enable javascript and cookies to continue",
        )),
        ("incapsula", ("request unsuccessful. incapsula",)),  # Imperva Incapsula
        ("perimeterx", (
            "pardon our interruption",  # PerimeterX / HUMAN
            "press & hold",  # PerimeterX challenge
        )),
        ("datadome", ("humans only",)),  # DataDome (e.g. Glassdoor)
        ("akamai", ("access denied",)),  # Akamai / generic 403
        ("google", ("unusual traffic from your computer",)),  # Google bot interstitial
        ("generic", (
            "are you a human",
            "ddos protection by",  # generic CDN challenge
        )),
    )
    for phrase in phrases
)

# Only this many leading characters of the content are scanned for markers.
_BOTWALL_HEAD_CHARS = 600
62
+
63
+
64
+ def classify_botwall(md: str | None) -> str | None:
65
+ """Return the vendor of the bot-wall in the head of `md` (cloudflare /
66
+ incapsula / perimeterx / datadome / akamai / google / generic), or None if
67
+ the content doesn't look like a wall. First marker match wins."""
68
+ if not md:
69
+ return None
70
+ head = md[:_BOTWALL_HEAD_CHARS].lower()
71
+ for marker, vendor in _BOTWALL_MARKERS:
72
+ if marker in head:
73
+ return vendor
74
+ return None
75
+
76
+
77
def _looks_like_botwall(md: str) -> bool:
    """Return True when classify_botwall() finds a wall marker in `md`."""
    vendor = classify_botwall(md)
    return vendor is not None
79
+
80
+
81
+ # A Cloudflare *JS challenge* specifically — the thing cloudscraper (Tier 2) can
82
+ # actually solve. Distinct from a generic block: a Cloudflare WAF 1020 / DataDome
83
+ # / origin 403 is served-by-CF-or-not but un-solvable, so it must NOT match here.
84
# Lower-case phrases that identify a Cloudflare JS challenge page; matched
# against the lower-cased head of the response body by is_cf_challenge().
_CF_CHALLENGE_MARKERS = (
    "just a moment", "checking your browser",
    "verifying you are human", "enable javascript and cookies to continue",
)
90
+
91
+
92
+ def is_cf_challenge(headers, body: str | None) -> bool:
93
+ """True when an (often 403/503) response is a Cloudflare JS challenge that
94
+ Tier 2 can clear — served by Cloudflare AND carrying a challenge signal."""
95
+ h = {str(k).lower(): str(v) for k, v in dict(headers or {}).items()}
96
+ by_cf = h.get("server", "").lower() == "cloudflare" or "cf-ray" in h
97
+ if not by_cf:
98
+ return False
99
+ if h.get("cf-mitigated", "").lower() == "challenge":
100
+ return True
101
+ head = (body or "")[:_BOTWALL_HEAD_CHARS].lower()
102
+ return any(m in head for m in _CF_CHALLENGE_MARKERS)
103
+
104
+
105
+ def _status_of(exc: BaseException) -> int | None:
106
+ """Best-effort HTTP status from an exception: a response object if the
107
+ library attached one (requests/cloudscraper/curl_cffi), else the first 4xx/5xx
108
+ found in the message (curl_cffi/urllib render it as text, e.g. 'HTTP Error 403')."""
109
+ resp = getattr(exc, "response", None)
110
+ code = getattr(resp, "status_code", None)
111
+ if isinstance(code, int):
112
+ return code
113
+ m = re.search(r"\b([45]\d\d)\b", str(exc))
114
+ return int(m.group(1)) if m else None
115
+
116
+
117
def classify_error(exc: BaseException) -> tuple[str, int | None]:
    """Translate a tier exception into ``(error_class, status_code)``.

    error_class ∈ {http_block, rate_limited, timeout, connection, http_error,
    error}. Checks run in this order: 401/403 → http_block; 429 →
    rate_limited; a timeout in the exception type name or message → timeout;
    DNS / connect / TLS wording in the message → connection; any other
    4xx/5xx → http_error; everything else → error. The status is whatever
    _status_of() extracted and may be None.
    """
    status = _status_of(exc)
    type_name = type(exc).__name__.lower()
    text = str(exc).lower()

    if status in (401, 403):
        return "http_block", status
    if status == 429:
        return "rate_limited", status

    timed_out = "timeout" in type_name or "timed out" in text or "timeout" in text
    if timed_out:
        return "timeout", status

    connection_hints = (
        "could not resolve", "name or service not known", "getaddrinfo",
        "connection refused", "connection reset", "failed to connect",
        "ssl", "certificate",
    )
    for hint in connection_hints:
        if hint in text:
            return "connection", status

    if status and 400 <= status < 600:
        return "http_error", status
    return "error", status
141
+
142
+
143
def check(url: str, md: str | None) -> str:
    """Return `md` unchanged when it passes both quality gates.

    Raises:
        BotWall: a bot-wall marker was found in the head of `md`; the
            exception carries the vendor.
        ShortContent: `md` is shorter than the floor for the URL's host
            (missing content counts as length 0).
    """
    wall = classify_botwall(md)
    if wall:
        raise BotWall(f"bot-wall / block page detected ({wall})", vendor=wall)
    floor = min_len_for(url)
    length = 0 if not md else len(md)
    if length < floor:
        raise ShortContent(f"body too short: {length} < {floor}")
    return md
153
+
154
+
155
class ShortContent(RuntimeError):
    """Raised when fetched content falls below the length gate.

    Treated as a tier miss rather than a success.
    """
157
+
158
+
159
+ class BotWall(RuntimeError):
160
+ """Content fetched but it's a bot-wall / block interstitial (e.g. Cloudflare
161
+ "Just a moment...") rather than the real page — treated as a tier miss so the
162
+ cascade falls through to a stealthier tier. `vendor` names the wall
163
+ (cloudflare / datadome / akamai / …) when known, so the policy can learn
164
+ which challenge a host serves."""
165
+
166
+ def __init__(self, *args, vendor: str | None = None):
167
+ super().__init__(*args)
168
+ self.vendor = vendor
169
+
170
+
171
class RateLimited(RuntimeError):
    """Raised when a tier hits an upstream rate or quota limit (e.g. HTTP 429).

    Kept distinct from an ordinary failure so limit pressure shows up in the
    dashboard.
    """
switchback/py.typed ADDED
File without changes