veriscrape 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
veriscrape/__init__.py ADDED
@@ -0,0 +1,64 @@
1
+ """veriscrape: fetch, but it tells you the truth.
2
+
3
+ A verified-fetch primitive: every fetch returns the bytes *plus* a portable
4
+ trust verdict (OK / BLOCKED / CHALLENGE / HONEYPOT / SOFT_404 / LOGIN_WALL /
5
+ EMPTY_SHELL), so you know the moment your data is silently wrong, not three
6
+ days later through a downstream discrepancy.
7
+
8
+ >>> import veriscrape
9
+ >>> r = veriscrape.get("https://example.com")
10
+ >>> r.verdict, r.confidence
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import time
16
+
17
+ from .classify import classify
18
+ from .record import FetchRecord, Verdict
19
+
20
+ __all__ = ["get", "FetchRecord", "Verdict", "classify", "__version__"]
21
+ __version__ = "0.1.0"
22
+
23
+ _DEFAULT_IMPERSONATE = "chrome"
24
+
25
+
26
def get(
    url: str,
    *,
    impersonate: str = _DEFAULT_IMPERSONATE,
    timeout: float = 30.0,
    **kwargs,
) -> FetchRecord:
    """Fetch ``url`` and wrap the outcome in a :class:`FetchRecord`.

    Intended as a stand-in for ``requests.get``. The request goes through
    curl_cffi with the ``impersonate`` profile and ``timeout``; any extra
    keyword arguments are forwarded to curl_cffi unchanged. The response's
    status, headers and text are then handed to :func:`classify`, and its
    verdict, cause, confidence and evidence are stored on the returned record
    together with the wall-clock duration of the request in milliseconds.
    """
    # Deferred import: `import veriscrape` itself must not need the network stack.
    from curl_cffi import requests as curl_requests

    began = time.perf_counter()
    # curl_cffi's stub types `impersonate` as a Literal; at runtime it is a free-form profile string.
    response = curl_requests.get(url, impersonate=impersonate, timeout=timeout, **kwargs)  # type: ignore[arg-type]
    elapsed_ms = (time.perf_counter() - began) * 1000.0

    # Keep only the headers that actually carry a value.
    headers = {}
    for name, value in dict(response.headers).items():
        if value is not None:
            headers[name] = value

    body = response.text
    status = response.status_code
    verdict, cause, confidence, evidence = classify(status=status, headers=headers, body=body)

    return FetchRecord(
        url=url,
        status=status,
        verdict=verdict,
        cause=cause,
        tactic=f"curl_cffi:{impersonate}",
        confidence=confidence,
        evidence=evidence,
        headers=headers,
        text=body,
        elapsed_ms=elapsed_ms,
    )
veriscrape/adapters.py ADDED
@@ -0,0 +1,90 @@
1
+ """Drop-in adapters: add a trust verdict to the fetcher you already use.
2
+
3
+ `veriscrape.get()` is the drop-in for `requests.get`. But if you already have a Scrapy
4
+ spider, a Playwright page, or a raw `requests`/`httpx` response, you don't have to switch
5
+ fetchers, just classify what you already have:
6
+
7
+ from veriscrape.adapters import from_requests, from_response
8
+
9
+ resp = requests.get(url)
10
+ record = from_requests(resp) # -> FetchRecord with .verdict / .cause / .ok
11
+
12
+ record = from_response(status, headers, body, url=url) # any stack
13
+
14
+ Scrapy: add ``veriscrape.adapters.VeriscrapeMiddleware`` to ``DOWNLOADER_MIDDLEWARES`` and
15
+ read ``response.meta["veriscrape"]``.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from . import FetchRecord, classify
21
+
22
+
23
def from_response(
    status: int | None,
    headers,
    body: str | None,
    *,
    url: str = "",
    tactic: str | None = None,
    elapsed_ms: float | None = None,
) -> FetchRecord:
    """Build a FetchRecord from a status, headers and body produced by any fetcher.

    ``headers`` is copied into a plain dict (a falsy value becomes ``{}``). The
    classifier receives ``""`` when ``body`` is falsy, while the record keeps
    ``body`` exactly as it was passed in. ``url``, ``tactic`` and ``elapsed_ms``
    are stored on the record unchanged.
    """
    header_map = dict(headers or {})
    classifier_body = body if body else ""
    verdict, cause, confidence, evidence = classify(
        status=status,
        headers=header_map,
        body=classifier_body,
    )
    return FetchRecord(
        url=url,
        status=status,
        verdict=verdict,
        cause=cause,
        confidence=confidence,
        evidence=evidence,
        headers=header_map,
        text=body,
        tactic=tactic,
        elapsed_ms=elapsed_ms,
    )
39
+
40
+
41
def from_requests(response) -> FetchRecord:
    """Classify a ``requests.Response``-like object.

    Reads ``status_code``, ``headers``, ``text`` and ``url`` with ``getattr``, so
    any object exposing some or all of those attributes is accepted; missing or
    falsy values fall back to ``None`` / ``{}`` / ``""``. The resulting record is
    tagged with the tactic ``"requests"``.
    """
    status = getattr(response, "status_code", None)
    headers = getattr(response, "headers", {}) or {}
    body = getattr(response, "text", "") or ""
    url = str(getattr(response, "url", "") or "")
    return from_response(
        status=status,
        headers=headers,
        body=body,
        url=url,
        tactic="requests",
    )
50
+
51
+
52
+ def _normalize_scrapy_headers(headers) -> dict[str, str]:
53
+ """Best-effort: Scrapy headers are bytes-keyed and multi-valued. Flatten to str:str."""
54
+ out: dict[str, str] = {}
55
+ try:
56
+ items = headers.items()
57
+ except Exception:
58
+ return out
59
+ for key, value in items:
60
+ k = key.decode() if isinstance(key, bytes) else str(key)
61
+ if isinstance(value, (list, tuple)):
62
+ value = value[0] if value else b""
63
+ v = value.decode() if isinstance(value, bytes) else str(value)
64
+ out[k] = v
65
+ return out
66
+
67
+
68
class VeriscrapeMiddleware:
    """Scrapy downloader middleware: attaches a veriscrape verdict to every response.

    In ``settings.py``::

        DOWNLOADER_MIDDLEWARES = {"veriscrape.adapters.VeriscrapeMiddleware": 900}

    Then in a spider: ``response.meta["veriscrape"].verdict``.

    The record is written to ``request.meta``. Scrapy documents ``response.meta``
    as a shortcut to ``response.request.meta`` and assigns ``response.request``
    only after the downloader middlewares have run, so writing through the
    response here would fail; the request's meta dict is what the spider later
    sees as ``response.meta``.
    """

    def process_response(self, request, response, spider):
        """Classify ``response``, store the record under ``meta["veriscrape"]``, return it unchanged."""
        record = from_response(
            status=getattr(response, "status", None),
            headers=_normalize_scrapy_headers(getattr(response, "headers", {})),
            # The getattr default also covers responses whose text attribute is unavailable.
            body=getattr(response, "text", "") or "",
            url=str(getattr(response, "url", "") or ""),
            tactic="scrapy",
        )
        # Request first (the dict Scrapy exposes to spiders), then the response for
        # response-like objects that carry their own meta mapping.
        for carrier in (request, response):
            try:
                carrier.meta["veriscrape"] = record
            except (AttributeError, TypeError):
                # Never break the pipeline over a missing or unusable meta dict.
                continue
        return response