veriscrape 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- veriscrape/__init__.py +64 -0
- veriscrape/adapters.py +90 -0
- veriscrape/classify.py +736 -0
- veriscrape/cli.py +61 -0
- veriscrape/record.py +62 -0
- veriscrape-0.1.0.dist-info/METADATA +334 -0
- veriscrape-0.1.0.dist-info/RECORD +11 -0
- veriscrape-0.1.0.dist-info/WHEEL +4 -0
- veriscrape-0.1.0.dist-info/entry_points.txt +2 -0
- veriscrape-0.1.0.dist-info/licenses/LICENSE +202 -0
- veriscrape-0.1.0.dist-info/licenses/NOTICE +4 -0
veriscrape/__init__.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""veriscrape: fetch, but it tells you the truth.
|
|
2
|
+
|
|
3
|
+
A verified-fetch primitive: every fetch returns the bytes *plus* a portable
|
|
4
|
+
trust verdict (OK / BLOCKED / CHALLENGE / HONEYPOT / SOFT_404 / LOGIN_WALL /
|
|
5
|
+
EMPTY_SHELL), so you know the moment your data is silently wrong, not three
|
|
6
|
+
days later through a downstream discrepancy.
|
|
7
|
+
|
|
8
|
+
>>> import veriscrape
|
|
9
|
+
>>> r = veriscrape.get("https://example.com")
|
|
10
|
+
>>> r.verdict, r.confidence
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import time
|
|
16
|
+
|
|
17
|
+
from .classify import classify
|
|
18
|
+
from .record import FetchRecord, Verdict
|
|
19
|
+
|
|
20
|
+
# Names exported by ``from veriscrape import *``.
__all__ = ["get", "FetchRecord", "Verdict", "classify", "__version__"]
|
|
21
|
+
__version__ = "0.1.0"
|
|
22
|
+
|
|
23
|
+
# Default value of the ``impersonate`` argument of ``get()``, which passes it on to curl_cffi.
_DEFAULT_IMPERSONATE = "chrome"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get(
    url: str,
    *,
    impersonate: str = _DEFAULT_IMPERSONATE,
    timeout: float = 30.0,
    **kwargs,
) -> FetchRecord:
    """Fetch ``url`` with curl_cffi and wrap the outcome in a :class:`FetchRecord`.

    The response status, headers and decoded body are handed to ``classify``;
    the resulting verdict, cause, confidence and evidence are stored on the
    record next to the raw response data and the measured request duration.

    Args:
        url: Address to request.
        impersonate: Profile string forwarded to ``curl_cffi.requests.get`` and
            recorded in the record's ``tactic`` as ``curl_cffi:<profile>``.
        timeout: Timeout forwarded to ``curl_cffi.requests.get``.
        **kwargs: Any further keyword arguments, forwarded unchanged to
            ``curl_cffi.requests.get``.

    Returns:
        A ``FetchRecord`` describing the response and its trust verdict.
    """
    # Deferred import: importing the package itself must not pull in curl_cffi.
    from curl_cffi import requests as cffi

    began = time.perf_counter()
    # The stub types `impersonate` as a Literal; at runtime any profile string is accepted.
    response = cffi.get(url, impersonate=impersonate, timeout=timeout, **kwargs)  # type: ignore[arg-type]
    duration_ms = (time.perf_counter() - began) * 1000.0

    # Keep only headers that actually carry a value.
    present_headers = {}
    for name, value in dict(response.headers).items():
        if value is not None:
            present_headers[name] = value

    text = response.text
    status_code = response.status_code
    verdict, cause, confidence, evidence = classify(
        status=status_code,
        headers=present_headers,
        body=text,
    )

    return FetchRecord(
        url=url,
        status=status_code,
        verdict=verdict,
        cause=cause,
        tactic=f"curl_cffi:{impersonate}",
        confidence=confidence,
        evidence=evidence,
        headers=present_headers,
        text=text,
        elapsed_ms=duration_ms,
    )
|
veriscrape/adapters.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Drop-in adapters: add a trust verdict to the fetcher you already use.
|
|
2
|
+
|
|
3
|
+
`veriscrape.get()` is the drop-in for `requests.get`. But if you already have a Scrapy
|
|
4
|
+
spider, a Playwright page, or a raw `requests`/`httpx` response, you don't have to switch
|
|
5
|
+
fetchers, just classify what you already have:
|
|
6
|
+
|
|
7
|
+
from veriscrape.adapters import from_requests, from_response
|
|
8
|
+
|
|
9
|
+
resp = requests.get(url)
|
|
10
|
+
record = from_requests(resp) # -> FetchRecord with .verdict / .cause / .ok
|
|
11
|
+
|
|
12
|
+
record = from_response(status, headers, body, url=url) # any stack
|
|
13
|
+
|
|
14
|
+
Scrapy: add ``veriscrape.adapters.VeriscrapeMiddleware`` to ``DOWNLOADER_MIDDLEWARES`` and
|
|
15
|
+
read ``response.meta["veriscrape"]``.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from . import FetchRecord, classify
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def from_response(
    status: int | None,
    headers,
    body: str | None,
    *,
    url: str = "",
    tactic: str | None = None,
    elapsed_ms: float | None = None,
) -> FetchRecord:
    """Build a FetchRecord from the raw pieces of a response, whatever fetched it.

    Args:
        status: Status code of the response, or ``None``.
        headers: Anything ``dict()`` accepts; a falsy value is treated as no headers.
        body: Response text. A falsy body is classified as the empty string,
            while the record's ``text`` keeps the value that was passed in.
        url: Stored on the record as-is.
        tactic: Stored on the record as-is.
        elapsed_ms: Stored on the record as-is.

    Returns:
        A ``FetchRecord`` carrying the classifier's verdict, cause, confidence
        and evidence together with the supplied response data.
    """
    header_map = dict(headers) if headers else {}
    classified_body = body if body else ""

    verdict, cause, confidence, evidence = classify(
        status=status,
        headers=header_map,
        body=classified_body,
    )

    return FetchRecord(
        url=url,
        status=status,
        verdict=verdict,
        cause=cause,
        confidence=confidence,
        evidence=evidence,
        headers=header_map,
        text=body,
        tactic=tactic,
        elapsed_ms=elapsed_ms,
    )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def from_requests(response) -> FetchRecord:
    """Turn a ``requests.Response``-like object into a FetchRecord.

    Reads ``status_code``, ``headers``, ``text`` and ``url`` from ``response``.
    A missing or falsy attribute falls back to ``None`` (status), an empty dict
    (headers) or an empty string (text, url). The record's tactic is
    ``"requests"``.
    """
    status_code = getattr(response, "status_code", None)
    header_source = getattr(response, "headers", {}) or {}
    text = getattr(response, "text", "") or ""
    # The URL may be a non-str object, so it is coerced with str().
    address = str(getattr(response, "url", "") or "")

    return from_response(
        status=status_code,
        headers=header_source,
        body=text,
        url=address,
        tactic="requests",
    )
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _normalize_scrapy_headers(headers) -> dict[str, str]:
|
|
53
|
+
"""Best-effort: Scrapy headers are bytes-keyed and multi-valued. Flatten to str:str."""
|
|
54
|
+
out: dict[str, str] = {}
|
|
55
|
+
try:
|
|
56
|
+
items = headers.items()
|
|
57
|
+
except Exception:
|
|
58
|
+
return out
|
|
59
|
+
for key, value in items:
|
|
60
|
+
k = key.decode() if isinstance(key, bytes) else str(key)
|
|
61
|
+
if isinstance(value, (list, tuple)):
|
|
62
|
+
value = value[0] if value else b""
|
|
63
|
+
v = value.decode() if isinstance(value, bytes) else str(value)
|
|
64
|
+
out[k] = v
|
|
65
|
+
return out
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class VeriscrapeMiddleware:
    """Scrapy downloader middleware: attaches a veriscrape verdict to every response.

    In ``settings.py``::

        DOWNLOADER_MIDDLEWARES = {"veriscrape.adapters.VeriscrapeMiddleware": 900}

    Then in a spider: ``response.meta["veriscrape"].verdict``.
    """

    def process_response(self, request, response, spider):
        """Classify ``response`` and store the resulting record under ``meta["veriscrape"]``.

        Args:
            request: The request that produced ``response``.
            response: The downloaded response; returned unchanged.
            spider: Unused; present because it is part of the middleware signature.

        Returns:
            The same ``response`` object, so the middleware chain continues.
        """
        record = from_response(
            status=getattr(response, "status", None),
            headers=_normalize_scrapy_headers(getattr(response, "headers", {})),
            body=getattr(response, "text", "") or "",
            url=str(getattr(response, "url", "") or ""),
            tactic="scrapy",
        )
        # Scrapy assigns ``response.request`` only after all downloader
        # middlewares have run, so ``response.meta`` normally raises here.
        # ``request.meta`` is the dict the spider later reads as
        # ``response.meta``, so the record is stored there first. The response
        # is tried as well, for responses that are already tied to a request.
        for holder in (request, response):
            try:
                holder.meta["veriscrape"] = record
            except (AttributeError, TypeError):
                # No usable meta dict on this object; never break the pipeline over it.
                pass
        return response
|