switchback 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- switchback/__init__.py +12 -0
- switchback/__main__.py +4 -0
- switchback/api.py +81 -0
- switchback/concurrency.py +37 -0
- switchback/content_cache.py +94 -0
- switchback/egress.py +108 -0
- switchback/extract.py +56 -0
- switchback/flags.py +96 -0
- switchback/normalize.py +81 -0
- switchback/orchestrator.py +343 -0
- switchback/policy/__init__.py +0 -0
- switchback/policy/botwall.py +393 -0
- switchback/policy/gates.py +173 -0
- switchback/py.typed +0 -0
- switchback/reporting.py +236 -0
- switchback/search.py +39 -0
- switchback/server.py +114 -0
- switchback/session_cache.py +274 -0
- switchback/session_trace.py +96 -0
- switchback/tiers/__init__.py +24 -0
- switchback/tiers/_browser.py +50 -0
- switchback/tiers/tier0_apis.py +77 -0
- switchback/tiers/tier1_http.py +65 -0
- switchback/tiers/tier2_cloudscraper.py +135 -0
- switchback/tiers/tier3_browser.py +59 -0
- switchback/tiers/tier3b_camoufox.py +89 -0
- switchback/tiers/tier4_firecrawl.py +48 -0
- switchback/tiers/tier_residential.py +57 -0
- switchback/tracing.py +152 -0
- switchback-0.1.0.dist-info/METADATA +325 -0
- switchback-0.1.0.dist-info/RECORD +36 -0
- switchback-0.1.0.dist-info/WHEEL +5 -0
- switchback-0.1.0.dist-info/entry_points.txt +3 -0
- switchback-0.1.0.dist-info/licenses/LICENSE +21 -0
- switchback-0.1.0.dist-info/licenses/NOTICE +34 -0
- switchback-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
"""Botwall v2 — adaptive per-host + per-URL routing policy.
|
|
2
|
+
|
|
3
|
+
Ported from musings' botwall, with one addition: it records *which tier wins*
|
|
4
|
+
per host (not just failures), so the orchestrator can start a known-hard host at
|
|
5
|
+
its winning tier instead of replaying tiers that always miss.
|
|
6
|
+
|
|
7
|
+
Skip granularity
|
|
8
|
+
----------------
|
|
9
|
+
- Host-level skip: only for seeded hard-block domains and manual overrides.
|
|
10
|
+
  By default auto-promotion never elevates an entire domain to skip; that only
  happens when SCRAPER_BOTWALL_DOMAIN_SKIP_AFTER is set above 0.
|
|
11
|
+
- URL-level skip: individual articles/paths are excluded after PROMOTE_URL_AFTER
|
|
12
|
+
consecutive hard failures (botwall hit or short content), so one bad URL never
|
|
13
|
+
taints its whole domain.
|
|
14
|
+
|
|
15
|
+
State files (all in SCRAPER_STATE_DIR):
|
|
16
|
+
botwall_db.json — host + URL records (authoritative state)
|
|
17
|
+
botwall_events.jsonl — every scrape outcome (machine-readable audit trail)
|
|
18
|
+
botwall_excluded.jsonl — every URL-level exclusion event (machine-readable)
|
|
19
|
+
botwall_excluded.log — same, human-readable one-liner per exclusion
|
|
20
|
+
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import json
|
|
24
|
+
import logging
|
|
25
|
+
import os
|
|
26
|
+
import threading
|
|
27
|
+
from datetime import datetime, timezone
|
|
28
|
+
|
|
29
|
+
from .gates import host_of
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)

# Held by save_db() while it writes and swaps in botwall_db.json.
_DB_WRITE_LOCK = threading.Lock()

# Three directory levels above this file; the default state dir and the default
# skip-URLs file both hang off it, so compute it once.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

_DEFAULT_STATE_DIR = os.path.join(_PROJECT_ROOT, "state")
# An empty SCRAPER_STATE_DIR is treated as unset rather than as the path "".
_STATE_DIR = os.getenv("SCRAPER_STATE_DIR") or _DEFAULT_STATE_DIR
try:
    os.makedirs(_STATE_DIR, exist_ok=True)
except OSError as e:
    # Importing the package must not fail just because the state directory
    # cannot be created (e.g. a read-only install location). Later writes to
    # the paths below will report their own errors.
    logger.warning(f"botwall: could not create state dir {_STATE_DIR}: {e}")

DB_PATH = os.path.join(_STATE_DIR, "botwall_db.json")
EVENTS_PATH = os.path.join(_STATE_DIR, "botwall_events.jsonl")
EXCLUDED_JSONL = os.path.join(_STATE_DIR, "botwall_excluded.jsonl")
EXCLUDED_LOG = os.path.join(_STATE_DIR, "botwall_excluded.log")

_DEFAULT_SKIP_URLS_FILE = os.path.join(_PROJECT_ROOT, "config", "botwall_skip_urls.txt")
SKIP_URLS_FILE = os.getenv("SCRAPER_BOTWALL_SKIP_URLS_FILE", _DEFAULT_SKIP_URLS_FILE)
|
|
46
|
+
|
|
47
|
+
# Domains known to hard-block. load_db() inserts any that are missing from the
# DB as host-level "skip" records, using the value as the recorded reason.
SEED_HOSTS = {
    **dict.fromkeys(
        ("www.sciencedirect.com", "sciencedirect.com"),
        "seed: Cloudflare 1015",
    ),
    "linkinghub.elsevier.com": "seed: redirects to sciencedirect",
    "onlinelibrary.wiley.com": "seed: Cloudflare",
    **dict.fromkeys(
        ("www.tandfonline.com", "tandfonline.com"),
        "seed: Cloudflare",
    ),
}
|
|
56
|
+
|
|
57
|
+
# ── Config (all overridable via env vars) ─────────────────────────────────────
|
|
58
|
+
#
|
|
59
|
+
# SCRAPER_BOTWALL_URL_SKIP_AFTER       int ≥0, default 2
|
|
60
|
+
# Hard failures (botwall/short-content) on the *same URL* before that URL is
|
|
61
|
+
# excluded. Set to 0 to disable URL-level auto-exclusion entirely.
|
|
62
|
+
#
|
|
63
|
+
# SCRAPER_BOTWALL_DOMAIN_SKIP_AFTER    int ≥0, default 0 (disabled)
|
|
64
|
+
# Hard failures across *any* URLs on the same domain before the whole domain is
|
|
65
|
+
# skip-listed. 0 (default) means domains are never auto-skipped — only seeded
|
|
66
|
+
# hard-block domains and manual overrides are domain-level skips.
|
|
67
|
+
#
|
|
68
|
+
# SCRAPER_BOTWALL_COUNT_FIRECRAWL bool default false
|
|
69
|
+
# When true, each time Firecrawl is invoked for a host it counts as a failure
|
|
70
|
+
# toward the domain skip threshold (original v1 behaviour). No effect if
|
|
71
|
+
# SCRAPER_BOTWALL_DOMAIN_SKIP_AFTER is 0.
|
|
72
|
+
|
|
73
|
+
def _int_env(name: str, default: int) -> int:
|
|
74
|
+
try:
|
|
75
|
+
return int(os.getenv(name, str(default)))
|
|
76
|
+
except ValueError:
|
|
77
|
+
logger.warning(f"botwall: invalid {name}; using {default}")
|
|
78
|
+
return default
|
|
79
|
+
|
|
80
|
+
PROMOTE_URL_AFTER = _int_env("SCRAPER_BOTWALL_URL_SKIP_AFTER", 2)
PROMOTE_DOMAIN_AFTER = _int_env("SCRAPER_BOTWALL_DOMAIN_SKIP_AFTER", 0)
COUNT_FIRECRAWL = (
    os.environ.get("SCRAPER_BOTWALL_COUNT_FIRECRAWL", "").lower()
    in ("1", "true", "yes")
)

# Cooldown, in hours, applied to auto-skipped URLs: once it has elapsed
# is_url_skipped() stops reporting the URL as skipped so it gets re-tested.
# 0 makes auto-skips permanent (the legacy behaviour). A failure on the re-test
# re-stamps the cooldown and flags the host needs_egress, so the following
# attempt can route through the residential tier instead of giving up.
URL_SKIP_COOLDOWN_H = _int_env("SCRAPER_BOTWALL_URL_SKIP_COOLDOWN_H", 24)

# Number of egress-worthy failures at the *local* tiers after which a host is
# flagged needs_egress (future attempts route to the residential tier).
# 0 turns the escalation off.
PROMOTE_EGRESS_AFTER = _int_env("SCRAPER_BOTWALL_EGRESS_AFTER", 2)

# Outcomes that count toward excluding a *single URL* (deterministic per-URL
# failures). 429 / rate_limited is left out on purpose: it is transient, so it
# escalates egress but never contributes to skipping the URL.
_URL_SKIP_OUTCOMES = ("botwall", "short_content", "http_block")

# Outcomes read as "this IP/identity is the problem", which escalate the host
# to residential egress. Same as above plus the transient 429, since a
# different IP dodges the rate limit.
_EGRESS_OUTCOMES = _URL_SKIP_OUTCOMES + ("rate_limited",)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _now() -> str:
|
|
106
|
+
return datetime.now(timezone.utc).isoformat()
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _age_seconds(ts: str | None) -> float | None:
|
|
110
|
+
"""Seconds since an ISO timestamp, or None if missing/unparseable."""
|
|
111
|
+
if not ts:
|
|
112
|
+
return None
|
|
113
|
+
try:
|
|
114
|
+
return (datetime.now(timezone.utc) - datetime.fromisoformat(ts)).total_seconds()
|
|
115
|
+
except Exception:
|
|
116
|
+
return None
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _new_record(reason="", status="allow") -> dict:
    """Build a fresh host record with zeroed counters.

    first_seen and last_event share one timestamp taken at creation time;
    tier_stats starts empty and later maps tier -> {"ok": n, "miss": n}.
    """
    stamp = _now()
    rec = dict(
        status=status,
        reason=reason,
        winning_tier=None,
        tier_stats={},
        total_attempts=0,
        successes=0,
        first_seen=stamp,
        last_event=stamp,
        manual_override=None,
    )
    return rec
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _parse_skip_urls_file(path: str) -> dict[str, str]:
|
|
132
|
+
"""Return {url: reason} from a skip-urls config file."""
|
|
133
|
+
out: dict[str, str] = {}
|
|
134
|
+
if not os.path.exists(path):
|
|
135
|
+
return out
|
|
136
|
+
try:
|
|
137
|
+
with open(path) as f:
|
|
138
|
+
for raw in f:
|
|
139
|
+
line = raw.strip()
|
|
140
|
+
if not line or line.startswith("#"):
|
|
141
|
+
continue
|
|
142
|
+
if " #" in line:
|
|
143
|
+
url, _, reason = line.partition(" #")
|
|
144
|
+
out[url.strip()] = reason.strip()
|
|
145
|
+
else:
|
|
146
|
+
out[line] = "manual: botwall_skip_urls.txt"
|
|
147
|
+
except Exception as e:
|
|
148
|
+
logger.warning(f"botwall: could not read {path}: {e}")
|
|
149
|
+
return out
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def load_db() -> dict:
    """Load the botwall DB from DB_PATH, seed it, and return it.

    Starts from an empty v2 skeleton when the file is missing, unreadable, or
    does not contain a JSON object. Any SEED_HOSTS entry and any URL from
    SKIP_URLS_FILE not yet present is inserted with status "skip". The DB is
    written back when something was seeded or when no DB file exists yet.
    """
    db: dict = {"version": 2, "updated_at": "", "hosts": {}, "urls": {}}
    if os.path.exists(DB_PATH):
        try:
            with open(DB_PATH, encoding="utf-8") as f:
                loaded = json.load(f)
        except Exception as e:
            # Best effort: a corrupt DB must not stop scraping.
            logger.error(f"botwall: load failed ({e}); starting fresh")
        else:
            if isinstance(loaded, dict):
                db = loaded
            else:
                # Valid JSON, but e.g. a list or null: unusable as a DB.
                logger.error(
                    f"botwall: load failed (top-level JSON is "
                    f"{type(loaded).__name__}, not an object); starting fresh")

    # Replace missing or wrong-typed sections so the membership tests and
    # item assignments below cannot fail on a hand-edited file.
    for section in ("hosts", "urls"):
        if not isinstance(db.get(section), dict):
            db[section] = {}
    hosts = db["hosts"]
    urls = db["urls"]

    changed = False
    for host, reason in SEED_HOSTS.items():
        if host not in hosts:
            hosts[host] = _new_record(reason=reason, status="skip")
            changed = True
    for url, reason in _parse_skip_urls_file(SKIP_URLS_FILE).items():
        if url not in urls:
            now = _now()
            urls[url] = {"status": "skip", "reason": f"seed: {reason}",
                         "failures": 0, "first_seen": now, "last_event": now}
            changed = True
    if changed or not os.path.exists(DB_PATH):
        save_db(db)
    return db
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def save_db(db: dict) -> None:
    """Stamp *db* with the current time and write it to DB_PATH.

    The JSON is written to a temp file unique to this process and thread, then
    moved over DB_PATH with os.replace, all while holding _DB_WRITE_LOCK. If
    writing or replacing fails, the temp file is removed and the original
    exception propagates.
    """
    db["updated_at"] = _now()
    tmp = f"{DB_PATH}.tmp.{os.getpid()}.{threading.get_ident()}"
    with _DB_WRITE_LOCK:
        try:
            with open(tmp, "w", encoding="utf-8") as f:
                json.dump(db, f, indent=2, sort_keys=True)
            os.replace(tmp, DB_PATH)
        except BaseException:
            # Don't leave a half-written temp file next to the DB.
            try:
                os.remove(tmp)
            except OSError:
                pass  # never created, or already gone
            raise
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def is_skipped(host: str, db: dict) -> bool:
    """Tell whether *host* is skipped at host level.

    A manual_override of "allow" or "skip" takes precedence over the record's
    status; an unknown host is never skipped.
    """
    rec = db.get("hosts", {}).get(host)
    if not rec:
        return False
    override = rec.get("manual_override")
    if override == "allow":
        return False
    return override == "skip" or rec.get("status") == "skip"
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def winning_tier(host: str, db: dict) -> str | None:
    """Return the winning_tier stored on *host*'s record, or None when the
    host has no record."""
    rec = db.get("hosts", {}).get(host)
    if not rec:
        return None
    return rec.get("winning_tier")
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def needs_egress(host: str, db: dict) -> bool:
    """Report whether *host*'s record carries a truthy needs_egress flag, i.e.
    the local tiers were walled often enough that future attempts should
    escalate to the residential-egress tier."""
    rec = db.get("hosts", {}).get(host)
    if not rec:
        return False
    return bool(rec.get("needs_egress"))
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _log_event(url, tier, outcome, md_len=None, error=None, latency_ms=None,
               status_code=None, challenge=None) -> None:
    """Append one scrape outcome to EVENTS_PATH as a JSON line.

    Best effort: any failure while writing is logged as a warning and
    swallowed.
    """
    ev = {
        "ts": _now(),
        "url": url,
        "host": host_of(url),
        "tier": tier,
        "outcome": outcome,
        "md_len": md_len,
        "latency_ms": latency_ms,
        "error": error,
        "status_code": status_code,
        "challenge": challenge,
    }
    try:
        with open(EVENTS_PATH, "a") as f:
            f.write(f"{json.dumps(ev)}\n")
    except Exception as e:
        logger.warning(f"botwall: event log failed: {e}")
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def log_final(url: str, outcome: str, latency_ms=None, error=None,
              status_code=None) -> None:
    """Log a single aggregate event for the final result of a URL's cascade
    (all_failed / deadline_exceeded / *_skipped).

    The event uses the pseudo-tier '<cascade>' so the final verdict is its own
    row, distinct from the per-tier events, rather than something to be
    reconstructed by grouping them.
    """
    _log_event(
        url,
        "<cascade>",
        outcome,
        error=error,
        latency_ms=latency_ms,
        status_code=status_code,
    )
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def is_url_skipped(url: str, db: dict) -> bool:
    """Report whether *url* is excluded right now.

    Skips whose reason starts with "seed:" or "manual:" are permanent, as is
    every skip when URL_SKIP_COOLDOWN_H is 0. Other skips expire: once
    URL_SKIP_COOLDOWN_H hours have passed since the record's last_event the URL
    counts as not skipped, so it gets re-tested. A skip whose last_event is
    missing or unparseable also counts as not skipped.
    """
    rec = db.get("urls", {}).get(url)
    if not rec or rec.get("status") != "skip":
        return False
    why = str(rec.get("reason", ""))
    permanent = why.startswith(("seed:", "manual:")) or not URL_SKIP_COOLDOWN_H
    if permanent:
        return True
    elapsed = _age_seconds(rec.get("last_event"))
    if elapsed is None:
        return False
    return elapsed < URL_SKIP_COOLDOWN_H * 3600
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _log_exclusion(url: str, reason: str) -> None:
    """Record an exclusion in both EXCLUDED_JSONL and the human-readable
    EXCLUDED_LOG.

    *url* is either a real URL or a "domain:<host>" pseudo-URL, which is what
    _track_domain_failure passes for domain-level skips. The two files are
    written independently, so a failure on one does not suppress the other;
    each failure is logged as a warning and swallowed.
    """
    # host_of() finds no hostname in "domain:<host>", so take it from the
    # prefix instead of logging an empty host.
    prefix = "domain:"
    host = url[len(prefix):] if url.startswith(prefix) else host_of(url)
    ev = {"ts": _now(), "url": url, "host": host, "reason": reason}

    try:
        with open(EXCLUDED_JSONL, "a", encoding="utf-8") as f:
            f.write(json.dumps(ev) + "\n")
    except Exception as e:
        logger.warning(f"botwall: exclusion log failed: {e}")

    try:
        # Auto-generated reasons contain "×", so don't rely on the locale's
        # default encoding being able to represent it.
        with open(EXCLUDED_LOG, "a", encoding="utf-8") as f:
            f.write(f"{ev['ts']}  EXCLUDED  {url}  [{reason}]\n")
    except Exception as e:
        logger.warning(f"botwall: exclusion log failed: {e}")
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _track_url_failure(url: str, outcome: str, db: dict) -> None:
    """Count a hard failure against *url* and exclude it at the threshold.

    Only outcomes in _URL_SKIP_OUTCOMES (botwall / short_content / http_block)
    are counted; everything else returns immediately and never accumulates
    toward an exclusion.

    For a counted outcome the URL's record is created if needed, its failure
    counter is incremented and last_event is re-stamped. Then:

    - if the URL is already skipped (a re-test that failed again), it stays
      skipped and its host is flagged needs_egress;
    - otherwise it is excluded once the counter reaches PROMOTE_URL_AFTER.
      A PROMOTE_URL_AFTER of 0 or less disables this promotion, as the config
      documentation promises.
    """
    if outcome not in _URL_SKIP_OUTCOMES:
        return
    urls = db.setdefault("urls", {})
    rec = urls.get(url)
    if rec is None:
        rec = {"status": "allow", "failures": 0, "reason": "",
               "first_seen": _now(), "last_event": _now()}
        urls[url] = rec
    # .get(): records written by hand or by an older version may lack the key.
    rec["failures"] = rec.get("failures", 0) + 1
    rec["last_event"] = _now()  # re-stamp: extends the cooldown window
    if rec.get("status") == "skip":
        # Failed again on a re-test: the local tiers can't clear this. Stay
        # skipped (cooldown re-stamped above) and escalate the host's egress
        # so the next attempt routes through the residential tier.
        _mark_needs_egress(host_of(url), db)
        return
    # Without the "> 0" guard a threshold of 0 would exclude on the very
    # first failure instead of disabling auto-exclusion.
    if PROMOTE_URL_AFTER > 0 and rec["failures"] >= PROMOTE_URL_AFTER:
        rec["status"] = "skip"
        rec["reason"] = f"auto: {rec['failures']}× {outcome}"
        logger.info(f"botwall excluded URL: {url} ({rec['reason']})")
        _log_exclusion(url, rec["reason"])
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def _mark_needs_egress(host: str, db: dict) -> None:
|
|
291
|
+
"""Flag a host so future attempts start at the residential-egress tier."""
|
|
292
|
+
rec = db.get("hosts", {}).get(host)
|
|
293
|
+
if rec is not None and not rec.get("needs_egress"):
|
|
294
|
+
rec["needs_egress"] = True
|
|
295
|
+
logger.info(f"botwall: host flagged needs_egress (local tiers walled): {host}")
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _track_egress(host: str, tier: str, outcome: str, db: dict) -> None:
    """Count an egress-worthy failure for *host* and flag it needs_egress once
    PROMOTE_EGRESS_AFTER such failures have accumulated.

    Ignored entirely when PROMOTE_EGRESS_AFTER is 0, when *outcome* is not in
    _EGRESS_OUTCOMES, when the failing tier is "tier_residential" (counting the
    residential tier's own misses would be circular), when the host has no
    record, or when it is already flagged.

    Because _EGRESS_OUTCOMES includes http_block and rate_limited, hard
    403/401/429 responses escalate too, not only marker pages and short
    content.
    """
    if not PROMOTE_EGRESS_AFTER:
        return
    if outcome not in _EGRESS_OUTCOMES or tier == "tier_residential":
        return
    rec = db.get("hosts", {}).get(host)
    if rec is None or rec.get("needs_egress"):
        return
    misses = rec.get("egress_failures", 0) + 1
    rec["egress_failures"] = misses
    if misses >= PROMOTE_EGRESS_AFTER:
        _mark_needs_egress(host, db)
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def _clear_url_skip(url: str, db: dict) -> None:
|
|
319
|
+
"""Self-heal: a previously-excluded URL just succeeded, so un-skip it."""
|
|
320
|
+
rec = db.get("urls", {}).get(url)
|
|
321
|
+
if rec and rec.get("status") == "skip":
|
|
322
|
+
rec.update(status="allow", failures=0, reason="", last_event=_now())
|
|
323
|
+
logger.info(f"botwall: URL skip cleared after success: {url}")
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _track_domain_failure(host: str, outcome: str, db: dict) -> None:
    """Optionally skip-list a whole domain after PROMOTE_DOMAIN_AFTER hard
    failures.

    Inactive unless PROMOTE_DOMAIN_AFTER is greater than 0. Counted outcomes
    are botwall and short_content, plus firecrawl_used when COUNT_FIRECRAWL is
    on. Hosts without a record, with any manual_override, or already skipped
    are left untouched. When the counter reaches the threshold the host's
    status becomes "skip" and the exclusion is logged as "domain:<host>".
    """
    # "<= 0" rather than "not": a negative value must disable the feature,
    # not skip-list the domain on its first failure.
    if PROMOTE_DOMAIN_AFTER <= 0:
        return
    counts_as_failure = outcome in ("botwall", "short_content") or (
        COUNT_FIRECRAWL and outcome == "firecrawl_used"
    )
    if not counts_as_failure:
        return
    # .get() like the sibling helpers, so a DB without "hosts" is a no-op
    # rather than a KeyError.
    rec = db.get("hosts", {}).get(host)
    if not rec or rec.get("manual_override") or rec.get("status") == "skip":
        return
    rec["domain_failures"] = rec.get("domain_failures", 0) + 1
    if rec["domain_failures"] >= PROMOTE_DOMAIN_AFTER:
        rec["status"] = "skip"
        rec["reason"] = f"auto: {rec['domain_failures']}× domain {outcome}"
        logger.info(f"botwall domain skip-listed: {host} ({rec['reason']})")
        _log_exclusion(f"domain:{host}", rec["reason"])
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def record(db: dict, url: str, tier: str, outcome: str, md_len=None, error=None,
           latency_ms=None, status_code=None, challenge=None) -> None:
    """Record one tier attempt for *url* in *db* and in the event log.

    Updates the host's attempt counter, per-tier ok/miss stats and — on
    success — winning_tier; then feeds the outcome to the URL-, egress- and
    (optional) domain-level failure trackers and appends an event.

    outcome ∈ {ok, short_content, botwall, http_block, rate_limited, timeout,
    connection, http_error, error, firecrawl_used}. Anything other than "ok"
    is counted as a miss for *tier*.

    `challenge` names the bot-wall vendor (cloudflare / datadome / akamai / …)
    when one was served; it is tallied per host in challenge_counts. The host
    key is whatever host_of() returns for the URL. URLs for which host_of()
    returns an empty string are ignored entirely (nothing is recorded or
    logged).

    Counters are read with .get(..., 0) so a host record that lacks them
    (hand-edited, or written by an older version) is repaired instead of
    raising KeyError.
    """
    host = host_of(url)
    if not host:
        return
    hosts = db.setdefault("hosts", {})
    rec = hosts.get(host) or _new_record()
    rec["total_attempts"] = rec.get("total_attempts", 0) + 1
    rec["last_event"] = _now()

    if challenge:
        counts = rec.setdefault("challenge_counts", {})
        counts[challenge] = counts.get(challenge, 0) + 1

    stats = rec.setdefault("tier_stats", {}).setdefault(tier, {"ok": 0, "miss": 0})
    if outcome == "ok":
        rec["successes"] = rec.get("successes", 0) + 1
        stats["ok"] = stats.get("ok", 0) + 1
        rec["winning_tier"] = tier
        # NOTE(review): a success at *any* tier, the residential one included,
        # clears needs_egress — confirm the orchestrator keeps routing such
        # hosts via winning_tier rather than replaying the local tiers.
        rec["needs_egress"] = False   # host recovered
        rec["egress_failures"] = 0    # reset the escalation counter
        _clear_url_skip(url, db)      # self-heal a previously-excluded URL
    else:
        stats["miss"] = stats.get("miss", 0) + 1

    # Store the record before running the trackers: they look the host up in
    # db["hosts"] and silently do nothing for a host without a record.
    hosts[host] = rec

    _track_url_failure(url, outcome, db)
    _track_egress(host, tier, outcome, db)
    _track_domain_failure(host, outcome, db)

    _log_event(url, tier, outcome, md_len=md_len, error=error,
               latency_ms=latency_ms, status_code=status_code, challenge=challenge)
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""Quality gates — minimum acceptable content length per host.
|
|
2
|
+
|
|
3
|
+
A page that renders to a few hundred chars of nav is a failure, not a success;
|
|
4
|
+
the gate makes a tier "fall through" instead of returning junk.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
from urllib.parse import urlparse
|
|
10
|
+
|
|
11
|
+
# Default minimum content length; min_len_for() falls back to it for any host
# not listed below.
MIN_MD_LEN = 2000

# Lower floors for hosts whose articles are legitimately short (API stubs,
# curated explainers).
MIN_MD_LEN_PER_HOST = {
    **dict.fromkeys(("arxiv.org", "export.arxiv.org"), 500),
    "en.wikipedia.org": 1000,
    "www.metmuseum.org": 500,
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def host_of(url: str) -> str:
    """Return the lower-cased hostname of *url*, or "" when the URL has no
    hostname or cannot be parsed."""
    try:
        hostname = urlparse(url).hostname
        return hostname.lower() if hostname else ""
    except Exception:
        return ""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def min_len_for(url: str) -> int:
    """Return the minimum acceptable content length for *url*: its host's
    entry in MIN_MD_LEN_PER_HOST, or MIN_MD_LEN when the host has none."""
    host = host_of(url)
    return MIN_MD_LEN_PER_HOST.get(host, MIN_MD_LEN)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# High-precision bot-wall / block-page markers, each tagged with the vendor that
|
|
34
|
+
# serves it. A page can clear the length gate yet be a Cloudflare "Just a
|
|
35
|
+
# moment..." interstitial (long but worthless), so the length floor alone isn't
|
|
36
|
+
# enough. These are scanned ONLY in the head of the content (the title /
|
|
37
|
+
# first-heading region) — a long article that merely mentions one of these
|
|
38
|
+
# phrases in its body won't trip the gate. Keep this list narrow: a false
|
|
39
|
+
# positive (rejecting a real page) is worse than missing an exotic wall.
|
|
40
|
+
#
|
|
41
|
+
# The vendor tag is what lets the policy *learn which wall* a host serves
|
|
42
|
+
# (recorded per host in botwall_db.json), so dashboards can show challenges by
|
|
43
|
+
# domain and routing can adapt. Order matters: the first match wins, so put the
|
|
44
|
+
# vendor-specific phrases before the generic ones.
|
|
45
|
+
_BOTWALL_MARKERS = (
|
|
46
|
+
("just a moment...", "cloudflare"),
|
|
47
|
+
("checking your browser", "cloudflare"),
|
|
48
|
+
("attention required! | cloudflare", "cloudflare"),
|
|
49
|
+
("verifying you are human", "cloudflare"), # Turnstile newer copy
|
|
50
|
+
("verify you are human", "cloudflare"),
|
|
51
|
+
("enable javascript and cookies to continue", "cloudflare"),
|
|
52
|
+
("request unsuccessful. incapsula", "incapsula"), # Imperva Incapsula
|
|
53
|
+
("pardon our interruption", "perimeterx"), # PerimeterX / HUMAN
|
|
54
|
+
("press & hold", "perimeterx"), # PerimeterX challenge
|
|
55
|
+
("humans only", "datadome"), # DataDome (e.g. Glassdoor)
|
|
56
|
+
("access denied", "akamai"), # Akamai / generic 403
|
|
57
|
+
("unusual traffic from your computer", "google"), # Google bot interstitial
|
|
58
|
+
("are you a human", "generic"),
|
|
59
|
+
("ddos protection by", "generic"), # generic CDN challenge
|
|
60
|
+
)
|
|
61
|
+
_BOTWALL_HEAD_CHARS = 600
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def classify_botwall(md: str | None) -> str | None:
    """Name the bot-wall vendor whose marker appears in the head of `md`.

    Only the first _BOTWALL_HEAD_CHARS characters are inspected,
    case-insensitively. The result is one of cloudflare / incapsula /
    perimeterx / datadome / akamai / google / generic — the vendor of the
    first entry in _BOTWALL_MARKERS that matches — or None when `md` is empty
    or nothing matches.
    """
    if not md:
        return None
    head = md[:_BOTWALL_HEAD_CHARS].lower()
    return next(
        (vendor for marker, vendor in _BOTWALL_MARKERS if marker in head),
        None,
    )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _looks_like_botwall(md: str) -> bool:
    """True when classify_botwall() recognises a wall in `md`."""
    vendor = classify_botwall(md)
    return vendor is not None
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# A Cloudflare *JS challenge* specifically — the thing cloudscraper (Tier 2) can
|
|
82
|
+
# actually solve. Distinct from a generic block: a Cloudflare WAF 1020 / DataDome
|
|
83
|
+
# / origin 403 is served-by-CF-or-not but un-solvable, so it must NOT match here.
|
|
84
|
+
_CF_CHALLENGE_MARKERS = (
|
|
85
|
+
"just a moment",
|
|
86
|
+
"checking your browser",
|
|
87
|
+
"verifying you are human",
|
|
88
|
+
"enable javascript and cookies to continue",
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def is_cf_challenge(headers, body: str | None) -> bool:
    """Decide whether a response is a Cloudflare JS challenge Tier 2 can clear.

    Two conditions must hold. The response was served by Cloudflare (a
    `server: cloudflare` header or any `cf-ray` header; header names are
    matched case-insensitively). And it carries a challenge signal: either
    `cf-mitigated: challenge`, or one of _CF_CHALLENGE_MARKERS within the
    first _BOTWALL_HEAD_CHARS characters of the body.
    """
    h = {}
    for key, value in dict(headers or {}).items():
        h[str(key).lower()] = str(value)

    served_by_cf = "cf-ray" in h or h.get("server", "").lower() == "cloudflare"
    if not served_by_cf:
        return False
    if h.get("cf-mitigated", "").lower() == "challenge":
        return True

    head = (body or "")[:_BOTWALL_HEAD_CHARS].lower()
    for marker in _CF_CHALLENGE_MARKERS:
        if marker in head:
            return True
    return False
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _status_of(exc: BaseException) -> int | None:
|
|
106
|
+
"""Best-effort HTTP status from an exception: a response object if the
|
|
107
|
+
library attached one (requests/cloudscraper/curl_cffi), else the first 4xx/5xx
|
|
108
|
+
found in the message (curl_cffi/urllib render it as text, e.g. 'HTTP Error 403')."""
|
|
109
|
+
resp = getattr(exc, "response", None)
|
|
110
|
+
code = getattr(resp, "status_code", None)
|
|
111
|
+
if isinstance(code, int):
|
|
112
|
+
return code
|
|
113
|
+
m = re.search(r"\b([45]\d\d)\b", str(exc))
|
|
114
|
+
return int(m.group(1)) if m else None
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def classify_error(exc: BaseException) -> tuple[str, int | None]:
    """Translate a raised tier exception into (error_class, status_code).

    error_class ∈ {http_block, rate_limited, timeout, connection, http_error,
    error}. Checks run in this order and the first hit wins:

    1. status 401/403          -> http_block
    2. status 429              -> rate_limited
    3. "timeout" in the exception type name, or "timed out"/"timeout" in the
       message                 -> timeout
    4. a DNS / connect / TLS phrase in the message -> connection
    5. any other 4xx/5xx status -> http_error
    6. otherwise               -> error

    Classifying hard 403/401 and 429 explicitly is what lets the policy treat
    them as egress-worthy: the cheaper tiers raise these instead of returning a
    marker page.
    """
    status = _status_of(exc)
    exc_name = type(exc).__name__.lower()
    text = str(exc).lower()

    if status in (401, 403):
        return "http_block", status
    if status == 429:
        return "rate_limited", status

    timed_out = "timeout" in exc_name or "timed out" in text or "timeout" in text
    if timed_out:
        return "timeout", status

    connection_hints = (
        "could not resolve", "name or service not known", "getaddrinfo",
        "connection refused", "connection reset", "failed to connect",
        "ssl", "certificate",
    )
    for hint in connection_hints:
        if hint in text:
            return "connection", status

    if status and 400 <= status < 600:
        return "http_error", status
    return "error", status
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def check(url: str, md: str | None) -> str:
    """Run the quality gates on `md` and return it unchanged when it passes.

    Raises:
        BotWall: a bot-wall marker was found in the head of `md`; the
            exception carries the vendor.
        ShortContent: `md` is shorter than the floor min_len_for(url) gives
            (missing/empty content counts as length 0).
    """
    vendor = classify_botwall(md)
    if vendor:
        raise BotWall(f"bot-wall / block page detected ({vendor})", vendor=vendor)

    floor = min_len_for(url)
    length = len(md) if md else 0
    if length < floor:
        raise ShortContent(f"body too short: {length} < {floor}")
    return md
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
class ShortContent(RuntimeError):
    """Raised when content was fetched but is shorter than the quality gate
    allows; handled as a tier miss."""
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class BotWall(RuntimeError):
    """Raised when the fetched content is a bot-wall / block interstitial
    (e.g. Cloudflare "Just a moment...") instead of the real page.

    Handled as a tier miss so the cascade falls through to a stealthier tier.
    `vendor` names the wall (cloudflare / datadome / akamai / …) when known,
    which lets the policy learn which challenge a host serves.
    """

    def __init__(self, *args, vendor: str | None = None):
        # vendor is keyword-only; positional args go to RuntimeError as usual.
        self.vendor = vendor
        super().__init__(*args)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class RateLimited(RuntimeError):
    """Raised when a tier hits an upstream rate/quota limit (e.g. HTTP 429).

    Kept separate from ordinary failures so limit pressure shows up distinctly
    in traces and on the dashboard.
    """
|
switchback/py.typed
ADDED
|
File without changes
|