crawlerkit-core 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlerkit/core/__init__.py +6 -0
- crawlerkit/core/base_crawler.py +126 -0
- crawlerkit/core/base_parser.py +77 -0
- crawlerkit/core/captcha/__init__.py +34 -0
- crawlerkit/core/captcha/base.py +90 -0
- crawlerkit/core/captcha/govbr.py +40 -0
- crawlerkit/core/captcha/llm_image.py +46 -0
- crawlerkit/core/captcha/mcaptcha.py +142 -0
- crawlerkit/core/captcha/token_adapters.py +77 -0
- crawlerkit/core/captcha/turnstile.py +40 -0
- crawlerkit/core/cookies.py +40 -0
- crawlerkit/core/errors.py +48 -0
- crawlerkit/core/identity.py +108 -0
- crawlerkit/core/proxy.py +96 -0
- crawlerkit/core/tls.py +113 -0
- crawlerkit/core/transport.py +76 -0
- crawlerkit_core-0.1.0.dist-info/METADATA +80 -0
- crawlerkit_core-0.1.0.dist-info/RECORD +21 -0
- crawlerkit_core-0.1.0.dist-info/WHEEL +5 -0
- crawlerkit_core-0.1.0.dist-info/licenses/LICENSE +21 -0
- crawlerkit_core-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""BaseCrawler — the crawl stage. A new target fills one hook: flow()."""
|
|
2
|
+
|
|
3
|
+
import random
|
|
4
|
+
import time
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
|
|
8
|
+
import structlog
|
|
9
|
+
from bs4 import BeautifulSoup
|
|
10
|
+
|
|
11
|
+
from .captcha.base import CaptchaRegistry, Challenge, default_registry
|
|
12
|
+
from .errors import BlockedError, PermanentError, TransientError
|
|
13
|
+
from .identity import Profile, pick
|
|
14
|
+
from .proxy import NullProxyProvider, ProxyProvider
|
|
15
|
+
from .transport import Transport
|
|
16
|
+
|
|
17
|
+
log = structlog.get_logger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class RawResponse:
    """Unparsed outcome of a crawl, handed from the crawl stage to the parse stage."""

    # URL the response belongs to.
    url: str
    # Numeric response status.
    status: int
    # Response body as text.
    text: str
    # Response headers; every instance gets its own fresh dict by default.
    headers: dict = field(default_factory=dict)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class BaseCrawler(ABC):
    """Crawl stage: bundles transport, identity, proxy and captcha handling.

    A concrete crawler implements only :meth:`flow`. No business logic and no
    parsing live here — the job is to crawl and return the raw response.
    """

    # Known challenge (e.g. a sitekey) to use when the widget isn't inline in the page.
    captcha_hint: Challenge | None = None

    def __init__(
        self,
        *,
        proxy_provider: ProxyProvider | None = None,
        registry: CaptchaRegistry | None = None,
        verify: bool = True,
        profile: Profile | None = None,
        client_cert: str | None = None,
        max_attempts: int = 3,
    ):
        """Store the configuration and build the first transport.

        Args:
            proxy_provider: source of proxy leases; ``NullProxyProvider()`` when omitted.
            registry: captcha registry; ``default_registry()`` when omitted.
            verify: forwarded to ``Transport`` as ``verify``.
            profile: fixed identity; when omitted a new one is picked on every (re)build.
            client_cert: forwarded to ``Transport`` as ``client_cert``.
            max_attempts: how many times :meth:`run` calls :meth:`flow` at most.
        """
        self._proxy_provider = proxy_provider or NullProxyProvider()
        self._verify = verify
        self._client_cert = client_cert
        self._fixed_profile = profile
        self.max_attempts = max_attempts
        self.registry = registry or default_registry()
        self._build_transport()

    def _build_transport(self) -> None:
        """(Re)create identity, proxy lease and transport — at init and on every rotation."""
        self.profile = self._fixed_profile or pick()
        self.proxy = self._proxy_provider.lease()
        self.transport = Transport(
            self.profile,
            self.proxy,
            verify=self._verify,
            client_cert=self._client_cert,
        )

    def _rotate(self) -> None:
        """Log the rotation and rebuild identity + proxy + transport."""
        log.info("rotate_identity_proxy")
        self._build_transport()

    # --- helpers exposed to flow() ---
    def get(self, url: str, **kw):
        """Issue a GET through the current transport."""
        return self.transport.get(url, **kw)

    def post(self, url: str, **kw):
        """Issue a POST through the current transport."""
        return self.transport.post(url, **kw)

    def solve_captcha(self, source) -> str | None:
        """Detect and solve a captcha in ``source`` via the registry.

        Returns the solved token, or ``None`` when the registry reports nothing to
        solve. Whatever the registry raises (e.g. UnsupportedCaptcha) propagates.
        """
        solved = self.registry.solve(source, self.transport, hint=self.captcha_hint)
        if solved:
            return solved.token
        return None

    def hidden_fields(self, html: str) -> dict:
        """Collect postback-state inputs (JSF ViewState / WebForms __VIEWSTATE).

        Looks inside the first ``<form>`` (or the whole document when there is no
        form) and returns ``{name: value}`` for every named input that is either
        ``type="hidden"`` or whose name mentions ViewState.
        """
        try:
            soup = BeautifulSoup(html, "lxml")
        except Exception:  # noqa: BLE001
            # Fall back to the stdlib parser when the lxml parse attempt fails.
            soup = BeautifulSoup(html, "html.parser")

        first_form = soup.find("form") if soup else None
        root = first_form or soup
        if not root:
            return {}

        collected: dict[str, str] = {}
        for input_tag in root.find_all("input"):
            input_name = input_tag.get("name")
            if not input_name:
                continue
            is_hidden = input_tag.get("type") == "hidden"
            if is_hidden or "ViewState" in input_name or "VIEWSTATE" in input_name.upper():
                collected[input_name] = input_tag.get("value", "")
        return collected

    # --- the only required hook ---
    @abstractmethod
    def flow(self, params: dict) -> RawResponse:
        """Perform the target-specific crawl and return its raw response."""
        ...

    def run(self, params: dict) -> RawResponse:
        """Call :meth:`flow` with retry and rotation, at most ``max_attempts`` times.

        * ``PermanentError`` is re-raised immediately.
        * ``BlockedError`` rotates identity + proxy, backs off, then retries.
        * ``TransientError`` backs off and retries with the same identity.

        Rotation and back-off are skipped after the last attempt. When attempts
        run out, the last captured error is raised.
        """
        last_error: Exception | None = None
        final_attempt = self.max_attempts
        for attempt in range(1, final_attempt + 1):
            try:
                log.info("crawl_start", crawler=type(self).__name__, attempt=attempt)
                raw = self.flow(params)
                log.info("crawl_done", status=raw.status, bytes=len(raw.text))
                return raw
            except PermanentError:
                raise
            except (BlockedError, TransientError) as exc:
                last_error = exc
                was_blocked = isinstance(exc, BlockedError)
                event = "blocked" if was_blocked else "transient"
                log.warning(event, attempt=attempt, error=str(exc))
                if attempt < self.max_attempts:
                    if was_blocked:
                        self._rotate()
                    self._backoff(attempt)
        raise last_error or RuntimeError("crawl failed with no captured error")

    @staticmethod
    def _backoff(attempt: int, cap: float = 30.0) -> None:
        """Sleep ``2**attempt`` seconds plus up to one second of jitter, capped at ``cap``."""
        delay = 2.0**attempt + random.uniform(0, 1)
        time.sleep(min(delay, cap))
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""BaseParser — the parse stage. A new target fills one hook: parse().
|
|
2
|
+
|
|
3
|
+
Pure + item-local: no network beyond fetching static assets for the optional PDF, no
|
|
4
|
+
cross-item state. Operates on the RawResponse the crawler returned (or a replayed one).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Generic, TypeVar
|
|
9
|
+
from urllib.parse import urlparse
|
|
10
|
+
|
|
11
|
+
import structlog
|
|
12
|
+
|
|
13
|
+
from .base_crawler import RawResponse
|
|
14
|
+
|
|
15
|
+
log = structlog.get_logger(__name__)
|
|
16
|
+
|
|
17
|
+
#: What ``parse()`` yields — your own model, a dataclass, a ``dict``, anything.
|
|
18
|
+
#: crawlerkit-core stays dependency-free: it never dictates the output type.
|
|
19
|
+
T = TypeVar("T")
|
|
20
|
+
|
|
21
|
+
# Print fixups: hide leftover form inputs, landscape, fit wide tables.
|
|
22
|
+
_PDF_FIXUP_CSS = """
|
|
23
|
+
input { display: none !important; }
|
|
24
|
+
@page { size: A4 landscape; margin: 1.2cm; }
|
|
25
|
+
table { font-size: 9px; table-layout: fixed; width: 100%; }
|
|
26
|
+
td, th { overflow-wrap: anywhere; }
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def render_pdf(html: str, base_url: str) -> bytes:
    """Render ``html`` to PDF bytes with WeasyPrint (no browser, no `requests`).

    http(s) assets are fetched with curl_cffi, verified against the CA bundle that
    ``crawlerkit.core.tls`` builds for the asset's host. An asset that fails to
    download is logged and replaced by an empty body so the PDF still renders.
    Every other URL scheme goes to WeasyPrint's default fetcher. The module's
    print-fixup stylesheet is applied on top of the document's own styles.
    """
    from curl_cffi import requests as cffi
    from weasyprint import CSS, HTML, default_url_fetcher

    from . import tls

    def fetcher(url: str, **kw):
        # Only http(s) is handled here; everything else is WeasyPrint's business.
        if not url.startswith(("http://", "https://")):
            return default_url_fetcher(url, **kw)

        host = urlparse(url).hostname or ""
        try:
            response = cffi.get(
                url,
                verify=tls.build_ca_bundle(host),
                timeout=30,
                impersonate="chrome131",
            )
            content_type = response.headers.get("content-type", "")
            fetched = {"string": response.content, "redirected_url": str(response.url)}
            # Keep just the media type, dropping parameters such as charset.
            mime = content_type.partition(";")[0].strip()
            if mime:
                fetched["mime_type"] = mime
            return fetched
        except Exception as e:  # noqa: BLE001 — a missing asset must not kill the PDF
            log.warning("pdf_asset_skipped", url=url, error=str(e))
            return {"string": b"", "mime_type": "text/plain"}

    document = HTML(string=html, base_url=base_url, url_fetcher=fetcher)
    return document.write_pdf(stylesheets=[CSS(string=_PDF_FIXUP_CSS)])
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class BaseParser(ABC, Generic[T]):
    """Parse stage; subclasses pick their own item type.

    Declare it as ``class MyParser(BaseParser[MyModel])`` (or ``BaseParser[dict]``).
    ``parse()`` returns ``list[T]`` — the item type belongs to you, not to the library.
    """

    # Set to False on a subclass/instance to skip PDF rendering in pdf()/run().
    render_pdf_enabled: bool = True

    @abstractmethod
    def parse(self, raw: RawResponse) -> list[T]:
        """Turn the raw response into a list of items."""
        ...

    def pdf(self, raw: RawResponse) -> bytes | None:
        """Render the response body to PDF, or return ``None`` when rendering is disabled."""
        if self.render_pdf_enabled:
            return render_pdf(raw.text, base_url=raw.url)
        return None

    def run(self, raw: RawResponse) -> tuple[list[T], bytes | None]:
        """Parse ``raw``, log the item count, and return ``(items, pdf_bytes_or_None)``."""
        items = self.parse(raw)
        log.info("parse_done", count=len(items))
        rendered = self.pdf(raw)
        return items, rendered
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from .base import (
|
|
2
|
+
CaptchaRegistry,
|
|
3
|
+
CaptchaServiceError,
|
|
4
|
+
Challenge,
|
|
5
|
+
Solved,
|
|
6
|
+
UnsupportedCaptcha,
|
|
7
|
+
default_registry,
|
|
8
|
+
)
|
|
9
|
+
from .govbr import GovBrSolver
|
|
10
|
+
from .llm_image import LlmImageSolver
|
|
11
|
+
from .mcaptcha import McaptchaPowSolver, mcaptcha_hint
|
|
12
|
+
from .token_adapters import HcaptchaSolver, RecaptchaV2Solver, RecaptchaV3Solver, TokenProvider
|
|
13
|
+
from .turnstile import TurnstileSolver
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"Challenge",
|
|
17
|
+
"Solved",
|
|
18
|
+
"UnsupportedCaptcha",
|
|
19
|
+
"CaptchaServiceError",
|
|
20
|
+
"CaptchaRegistry",
|
|
21
|
+
"default_registry",
|
|
22
|
+
# own solvers
|
|
23
|
+
"McaptchaPowSolver",
|
|
24
|
+
"mcaptcha_hint",
|
|
25
|
+
"LlmImageSolver",
|
|
26
|
+
# browserless stubs (TODO crack)
|
|
27
|
+
"TurnstileSolver",
|
|
28
|
+
"GovBrSolver",
|
|
29
|
+
# optional token-adapters (opt-in)
|
|
30
|
+
"TokenProvider",
|
|
31
|
+
"RecaptchaV2Solver",
|
|
32
|
+
"RecaptchaV3Solver",
|
|
33
|
+
"HcaptchaSolver",
|
|
34
|
+
]
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Captcha detection + a registry of our own solvers.
|
|
2
|
+
|
|
3
|
+
Three outcomes when a source (HTML or response) is checked:
|
|
4
|
+
- no challenge -> registry.solve returns None
|
|
5
|
+
- challenge + solver -> Solved{token, expires_at}
|
|
6
|
+
- challenge, no solver -> raise UnsupportedCaptcha
|
|
7
|
+
|
|
8
|
+
A solver produces a token; the backend (compute / LLM-image / JS-runtime) is its own business.
|
|
9
|
+
Tokens are single-use and solved on submit (never pre-solved).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Optional, Protocol, runtime_checkable
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class Challenge:
    """A detected (or hinted) captcha: which kind it is plus solver-specific parameters."""

    # Identifier matched against a solver's ``kind`` in the registry.
    kind: str
    # Free-form solver inputs; a fresh dict per instance by default.
    params: dict = field(default_factory=dict)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
|
|
23
|
+
class Solved:
|
|
24
|
+
token: str
|
|
25
|
+
expires_at: float | None = None # absolute epoch seconds, from the challenge's own ttl
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class UnsupportedCaptcha(Exception):
    """Raised when a challenge was found but no solver is registered for its kind."""

    def __init__(self, kind: str):
        message = f"no solver registered for captcha kind: {kind}"
        super().__init__(message)
        # Keep the kind available to callers that want to branch on it.
        self.kind = kind
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class CaptchaServiceError(Exception):
    """A captcha backend answered with an error or an unexpected payload.

    Often transient, so callers may reasonably retry.
    """
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@runtime_checkable
class Solver(Protocol):
    """Structural interface every captcha solver in the registry satisfies."""

    # Registry key; matched against ``Challenge.kind``.
    kind: str

    @classmethod
    def detect(cls, text: str) -> Optional[Challenge]:
        """Return a Challenge when ``text`` contains this solver's captcha, else None."""

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Solve ``challenge`` (using ``transport`` for any requests) and return the result."""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class CaptchaRegistry:
    """Maps captcha kinds to solvers and runs the detect -> solve sequence.

    Solvers are tried for detection in registration order; registering a second
    solver with the same ``kind`` replaces the first.
    """

    def __init__(self) -> None:
        self._solvers: dict[str, Solver] = {}

    def register(self, solver: "Solver") -> "CaptchaRegistry":
        """Register ``solver`` under ``solver.kind`` and return ``self`` for chaining."""
        self._solvers[solver.kind] = solver
        return self

    def detect(self, source) -> "Optional[Challenge]":
        """Return the first challenge any registered solver detects in ``source``.

        ``source`` is either the text itself or any object with a ``text``
        attribute; a missing/empty ``text`` is treated as an empty string.
        """
        text = source if isinstance(source, str) else getattr(source, "text", "") or ""
        for solver in self._solvers.values():
            found = solver.detect(text)
            if found is not None:
                return found
        return None

    @staticmethod
    def _combine(detected, hint):
        """Pick the challenge to solve from a detected one and an optional hint.

        * Only one of them present -> that one.
        * Different kinds -> the detected challenge wins, the hint is ignored.
        * Same kind -> a copy of the detected challenge whose params are the
          hint's params overlaid with the detected ones. A detected value of
          ``None`` does not overwrite a value the hint supplies.

        Neither input object is mutated.
        """
        if detected is None:
            return hint
        if hint is None or hint.kind != detected.kind:
            return detected

        from dataclasses import replace

        merged = dict(hint.params or {})
        for key, value in (detected.params or {}).items():
            if value is not None or key not in merged:
                merged[key] = value
        return replace(detected, params=merged)

    def solve(self, source, transport, *, hint: "Optional[Challenge]" = None) -> "Optional[Solved]":
        """Detect a challenge in ``source`` and solve it.

        The hint is used on its own when nothing is detected. When detection
        succeeds and the hint has the same kind, the hint fills in params that
        detection cannot know (for example the page ``url`` a token provider
        needs); previously such a hint was dropped entirely.

        Returns:
            The solver's ``Solved`` result, or ``None`` when there is no
            challenge (nothing detected and no hint).

        Raises:
            UnsupportedCaptcha: a challenge exists but no solver is registered
                for its kind.
        """
        challenge = self._combine(self.detect(source), hint)
        if challenge is None:
            return None
        solver = self._solvers.get(challenge.kind)
        if solver is None:
            raise UnsupportedCaptcha(challenge.kind)
        return solver.solve(challenge, transport)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def default_registry() -> CaptchaRegistry:
    """Build a registry holding the built-in own-solvers.

    Registered, in this order: mCaptcha PoW (working), then the Turnstile and
    gov.br browserless stubs (detect works, solve raises NotImplementedError
    until cracked). The optional token-adapters (reCAPTCHA/hCaptcha) and the
    LLM image solver are opt-in — register them yourself when configured.
    """
    from .govbr import GovBrSolver
    from .mcaptcha import McaptchaPowSolver
    from .turnstile import TurnstileSolver

    registry = CaptchaRegistry()
    registry = registry.register(McaptchaPowSolver())
    registry = registry.register(TurnstileSolver())
    registry = registry.register(GovBrSolver())
    return registry
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""gov.br (sso.acesso.gov.br) — BROWSERLESS solver scaffold.
|
|
2
|
+
|
|
3
|
+
gov.br SSO is the fleet's most common gate (~79 repos) and is browser-only in atlas today.
|
|
4
|
+
`detect()` works now; `solve()` is a TODO for a manual, browserless crack — fails loudly.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
from .base import Challenge, Solved
|
|
10
|
+
|
|
11
|
+
_SIGNATURE = re.compile(r"sso\.acesso\.gov\.br|acesso\.gov\.br|\bgovbr\b", re.I)
|
|
12
|
+
_SITEKEY_RE = re.compile(r'data-sitekey=["\']([0-9A-Za-z_-]{8,})["\']')
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class GovBrSolver:
    """gov.br SSO gate: detection works, solving is an unimplemented scaffold."""

    kind = "govbr"

    @classmethod
    def detect(cls, text: str):
        """Return a ``govbr`` Challenge when the page carries a gov.br signature, else None.

        The challenge's ``sitekey`` param is the embedded widget's key, or None
        when no ``data-sitekey`` attribute is found.
        """
        page = text or ""
        if _SIGNATURE.search(page) is None:
            return None
        key_match = _SITEKEY_RE.search(page)  # gov.br embeds hCaptcha/reCAPTCHA
        sitekey = key_match.group(1) if key_match else None
        return Challenge(kind=cls.kind, params={"sitekey": sitekey})

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Not implemented yet — always raises NotImplementedError (fails loudly)."""
        # TODO(crawlerkit): implement the BROWSERLESS gov.br SSO authentication.
        # gov.br (sso.acesso.gov.br) is JS-heavy and gated by a captcha (hCaptcha/reCAPTCHA) plus
        # fingerprint checks. Browserless approach to fill in here:
        #   1. Drive the SSO step sequence with the verified curl_cffi transport, carrying cookies
        #      across redirects (login -> authorize -> callback).
        #   2. Solve the embedded captcha via the registry (hCaptcha/reCAPTCHA token solver) OR a
        #      JS-runtime crack of the gov.br challenge script (QuickJS/Node + DOM shim seeded from
        #      the active Profile + proxy IP).
        #   3. Complete the OAuth/SSO redirect; return Solved(token=<session cookie / SSO assertion>).
        # Note: some gov.br services accept ICP-Brasil mutual-TLS client certs — see crawlerkit.core.tls.
        message = (
            f"browserless gov.br solve is a TODO (params={challenge.params!r}) "
            "— implement the SSO/JS-runtime crack"
        )
        raise NotImplementedError(message)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Own image-captcha solver: fetch the challenge image over the verified transport, classify with
|
|
2
|
+
a pluggable vision LLM, return the answer/token. Provider-agnostic — inject a `classify` callable
|
|
3
|
+
`(image_bytes, prompt) -> str`. Prompts ported from atlas's GPT solver.
|
|
4
|
+
|
|
5
|
+
Image captchas are target-specific, so the crawler builds the Challenge with the image location
|
|
6
|
+
(`params["image_url"]` or `params["image_bytes"]`); `detect()` returns None.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .base import CaptchaServiceError, Challenge, Solved
|
|
10
|
+
|
|
11
|
+
OCR_PROMPT = (
|
|
12
|
+
"This image is a CAPTCHA. Read the characters exactly. Respond with ONLY the characters "
|
|
13
|
+
"(letters/digits), no spaces, no explanation."
|
|
14
|
+
)
|
|
15
|
+
# 3x3 / 4x4 grid-selection prompts (hCaptcha / reCAPTCHA) are available for grid challenges;
|
|
16
|
+
# port the full set from atlas chatgpt_captcha_solver.py when wiring a grid flow.
|
|
17
|
+
GRID_3X3_PROMPT = (
|
|
18
|
+
"A reference image sits above a 3x3 grid (tiles numbered 1-9, left-to-right, top-to-bottom). "
|
|
19
|
+
"Return the tile numbers that clearly and fully match the reference, separated by '/', e.g. '2/5/9'. "
|
|
20
|
+
"If none match, return 'none'. No other text."
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class LlmImageSolver:
    """Image-captcha solver backed by an injected vision classifier.

    The crawler builds the Challenge itself and puts the image in
    ``params["image_bytes"]`` or its location in ``params["image_url"]``.
    """

    kind = "image"

    def __init__(self, classify, *, prompt: str = OCR_PROMPT):
        """Store the classifier and default prompt.

        Args:
            classify: ``Callable[[bytes, str], str]`` — ``(image_bytes, prompt) -> answer``.
            prompt: prompt used when the challenge does not carry its own.
        """
        self._classify = classify
        self._prompt = prompt

    @classmethod
    def detect(cls, text: str):
        """Always None: the crawler constructs the image Challenge explicitly."""
        return None

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Classify the challenge image and return the stripped answer as the token.

        Raises:
            CaptchaServiceError: neither image bytes nor an image URL was given,
                or the classifier returned an empty answer.
        """
        params = challenge.params
        image = params.get("image_bytes")
        if image is None:
            image_url = params.get("image_url")
            if not image_url:
                raise CaptchaServiceError("LlmImageSolver needs params['image_url'] or ['image_bytes']")
            # Download through the crawler's transport.
            image = transport.get(image_url, timeout=30).content

        prompt = params.get("prompt", self._prompt)
        raw_answer = self._classify(image, prompt)
        answer = (raw_answer or "").strip()
        if not answer:
            raise CaptchaServiceError("vision LLM returned an empty answer")
        return Solved(token=answer)
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""mCaptcha proof-of-work solver (compute backend, no third-party service).
|
|
2
|
+
|
|
3
|
+
Ported from the Detran POC. Byte layout confirmed empirically against a captured
|
|
4
|
+
(salt, string, nonce) -> result oracle (guarded by self_test). No sitekey secret needed.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import hashlib
|
|
8
|
+
import re
|
|
9
|
+
import struct
|
|
10
|
+
import time
|
|
11
|
+
|
|
12
|
+
from .base import CaptchaServiceError, Challenge, Solved
|
|
13
|
+
|
|
14
|
+
U128_MAX = (1 << 128) - 1
|
|
15
|
+
|
|
16
|
+
# Captured oracle (a real, self-consistent challenge) — proves the byte layout.
|
|
17
|
+
_ORACLE_SALT = "8c7b6a5d4e3f2a1b0c9d8e7f6a5b4c3d2e1f0a9b8c7d6e5f4a3b2c1d0e9f8a7b"
|
|
18
|
+
_ORACLE_STRING = "7KoiRWIqZk3qFy7C8Jt96E9KUSGfdbVL"
|
|
19
|
+
_ORACLE_DIFFICULTY = 4_000_000
|
|
20
|
+
_ORACLE_NONCE = 3_539_967
|
|
21
|
+
_ORACLE_RESULT = 340282365527686933810834880601832247926
|
|
22
|
+
|
|
23
|
+
# matches the mCaptcha widget iframe URL embedded in a page
|
|
24
|
+
_WIDGET_RE = re.compile(r"https?://([\w.-]+)/widget\?sitekey=([\w-]+)")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def mcaptcha_hint(host: str, sitekey: str) -> Challenge:
    """Build an ``mcaptcha`` Challenge from a known host + sitekey.

    Useful when the widget isn't inline in the GET HTML. The PoW API base is
    derived from the host as ``https://<host>/api/v1/pow``.
    """
    params = {
        "host": host,
        "sitekey": sitekey,
        "api_base": f"https://{host}/api/v1/pow",
    }
    return Challenge(kind="mcaptcha", params=params)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _headers(challenge: Challenge) -> dict:
    """Request headers for the mCaptcha API: JSON content type plus the widget's Origin/Referer."""
    params = challenge.params
    host, sitekey = params["host"], params["sitekey"]
    origin = f"https://{host}"
    return {
        "Content-Type": "application/json",
        "Accept": "*/*",
        "Origin": origin,
        "Referer": f"{origin}/widget?sitekey={sitekey}",
    }
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class McaptchaPowSolver:
    """mCaptcha proof-of-work solver: fetch config, grind a nonce, verify, return the token."""

    kind = "mcaptcha"

    # How many times the whole config -> pow -> verify round is attempted.
    _ATTEMPTS = 3

    # ---- detection ----
    @classmethod
    def detect(cls, text: str):
        """Return an mcaptcha Challenge when ``text`` embeds the widget URL, else None."""
        m = _WIDGET_RE.search(text or "")
        if not m:
            return None
        return mcaptcha_hint(host=m.group(1), sitekey=m.group(2))

    # ---- proof-of-work math ----
    @staticmethod
    def _prefix(salt: str, string: str):
        """SHA-256 state fed with: salt, little-endian u64 length of ``string``, ``string``."""
        h = hashlib.sha256()
        h.update(salt.encode())
        sb = string.encode()
        h.update(struct.pack("<Q", len(sb)))  # bincode fixint LE u64 length prefix
        h.update(sb)
        return h

    @classmethod
    def compute_result(cls, salt: str, string: str, nonce: int) -> int:
        """Hash prefix + decimal-ASCII nonce; return the first 16 digest bytes as a big-endian int."""
        h = cls._prefix(salt, string)
        h.update(str(nonce).encode())  # nonce as decimal ASCII
        return int.from_bytes(h.digest()[:16], "big")  # first 16 bytes, big-endian

    @staticmethod
    def threshold(difficulty: int) -> int:
        """Smallest result that satisfies ``difficulty``: ``U128_MAX - U128_MAX // difficulty``."""
        return U128_MAX - U128_MAX // difficulty

    @classmethod
    def solve_pow(cls, salt, string, difficulty, max_iters=2_000_000_000, max_seconds=120):
        """Try nonces 1, 2, 3, ... until the result reaches the threshold.

        Returns:
            ``(nonce, result_as_decimal_string, elapsed_milliseconds)``.

        Raises:
            TimeoutError: more than ``max_seconds`` elapsed (checked every 0x40000 nonces).
            RuntimeError: ``max_iters`` nonces were tried without success.
        """
        thr = cls.threshold(difficulty)
        base = cls._prefix(salt, string)
        start = time.perf_counter()
        nonce = 0
        while nonce < max_iters:
            nonce += 1
            h = base.copy()  # reuse the hashed prefix; only the nonce changes
            h.update(str(nonce).encode())
            result = int.from_bytes(h.digest()[:16], "big")
            if result >= thr:
                # Same digest compute_result() would produce — no need to hash twice.
                elapsed_ms = int((time.perf_counter() - start) * 1000)
                return nonce, str(result), elapsed_ms
            if (nonce & 0x3FFFF) == 0 and (time.perf_counter() - start) > max_seconds:
                raise TimeoutError(f"PoW unsolved in {max_seconds}s (difficulty {difficulty})")
        raise RuntimeError("PoW unsolved within iteration cap")

    @classmethod
    def self_test(cls) -> None:
        """Oracle gate: assert the layout reproduces the captured result. Instant."""
        got = cls.compute_result(_ORACLE_SALT, _ORACLE_STRING, _ORACLE_NONCE)
        if got != _ORACLE_RESULT:
            raise AssertionError(f"mCaptcha layout self-test FAILED: {got} != {_ORACLE_RESULT}")

    # ---- solve (config -> pow -> verify -> token) ----
    def solve(self, challenge: "Challenge", transport) -> "Solved":
        """Run config -> pow -> verify, retrying on CaptchaServiceError.

        mCaptcha's actor backend is intermittently flaky ("Actor mailbox error")
        on both /config and /verify, so the whole round is retried a few times
        with a growing pause between attempts. No pause follows the final
        attempt — its error is raised straight away.
        """
        last: Exception | None = None
        for attempt in range(self._ATTEMPTS):
            try:
                return self._solve_once(challenge, transport)
            except CaptchaServiceError as e:
                last = e
                if attempt + 1 < self._ATTEMPTS:
                    time.sleep(1.0 * (attempt + 1))
        raise last  # type: ignore[misc]

    @staticmethod
    def _post_json(transport, url: str, payload: dict, headers: dict):
        """POST ``payload`` as JSON and decode the JSON reply.

        A body that is not valid JSON is reported as CaptchaServiceError so the
        retry loop in solve() treats it like any other backend hiccup.
        """
        response = transport.post(url, json=payload, headers=headers)
        try:
            return response.json()
        except ValueError as exc:  # json.JSONDecodeError is a ValueError
            raise CaptchaServiceError(f"mcaptcha {url} returned a non-JSON body") from exc

    def _solve_once(self, challenge: "Challenge", transport) -> "Solved":
        """One config -> pow -> verify round; raises CaptchaServiceError on a bad reply."""
        api = challenge.params["api_base"]
        key = challenge.params["sitekey"]
        headers = _headers(challenge)

        cfg = self._post_json(transport, f"{api}/config", {"key": key}, headers)
        # All three fields are read below; a partial config is a service error, not a KeyError.
        required = ("salt", "string", "difficulty_factor")
        if not (isinstance(cfg, dict) and all(field_name in cfg for field_name in required)):
            raise CaptchaServiceError(f"mcaptcha /config returned no challenge: {cfg!r}")

        nonce, result, elapsed_ms = self.solve_pow(
            cfg["salt"], cfg["string"], cfg["difficulty_factor"]
        )
        verify = self._post_json(
            transport,
            f"{api}/verify",
            {
                "key": key,
                "nonce": nonce,
                "result": result,
                "string": cfg["string"],
                "time": elapsed_ms,
                "worker_type": "wasm",
            },
            headers,
        )
        token = verify.get("token") if isinstance(verify, dict) else None
        if not token:
            raise CaptchaServiceError(f"mcaptcha /verify returned no token: {verify!r}")
        return Solved(token=token, expires_at=None)
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""OPTIONAL token-captcha adapters: reCAPTCHA v2/v3 + hCaptcha via a third-party token provider.
|
|
2
|
+
|
|
3
|
+
Browserless (POST site_key + url -> token; no DOM). These reintroduce a paid third party, so they
|
|
4
|
+
are OPT-IN — NOT registered in `default_registry()`. Wire them only when configured:
|
|
5
|
+
|
|
6
|
+
reg.register(RecaptchaV2Solver(provider)).register(HcaptchaSolver(provider))
|
|
7
|
+
|
|
8
|
+
`provider` is any object implementing `TokenProvider` (e.g. an adapter around atlas's
|
|
9
|
+
AntiCaptcha/2Captcha solvers). `detect()` finds the sitekey; the crawler supplies params["url"]
|
|
10
|
+
(the page URL the provider needs) via a hint.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
from typing import Protocol, runtime_checkable
|
|
15
|
+
|
|
16
|
+
from .base import CaptchaServiceError, Challenge, Solved
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@runtime_checkable
class TokenProvider(Protocol):
    """Structural interface of a third-party token service: site key + page URL in, token out."""

    def solve_recaptcha_v2(self, site_key: str, url: str, **kw) -> str:
        """Return a reCAPTCHA v2 token for ``site_key`` on page ``url``."""
        ...

    def solve_recaptcha_v3(self, site_key: str, url: str, **kw) -> str:
        """Return a reCAPTCHA v3 token for ``site_key`` on page ``url``."""
        ...

    def solve_hcaptcha(self, site_key: str, url: str, **kw) -> str:
        """Return an hCaptcha token for ``site_key`` on page ``url``."""
        ...
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _sitekey(text: str, marker: str) -> str | None:
|
|
27
|
+
if marker not in (text or "").lower():
|
|
28
|
+
return None
|
|
29
|
+
m = re.search(r'(?:data-sitekey|sitekey|render)["\']?\s*[:=]\s*["\']?([0-9A-Za-z_-]{20,})', text or "")
|
|
30
|
+
return m.group(1) if m else None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class _BaseTokenSolver:
    """Shared plumbing for the provider-backed token solvers.

    Subclasses set ``kind`` (registry key) and ``_marker`` (substring that must
    appear in the page for detection) and implement ``solve``.
    """

    kind = ""
    _marker = ""

    def __init__(self, provider: TokenProvider):
        self._p = provider

    @classmethod
    def detect(cls, text: str):
        """Return a Challenge carrying the page's site key, or None when no key is found."""
        found_key = _sitekey(text, cls._marker)
        if not found_key:
            return None
        return Challenge(kind=cls.kind, params={"sitekey": found_key})

    def _require(self, challenge: Challenge) -> tuple[str, str]:
        """Return ``(sitekey, url)`` from the challenge params.

        Raises:
            CaptchaServiceError: either value is missing or empty.
        """
        params = challenge.params
        site_key = params.get("sitekey")
        page_url = params.get("url")  # supplied by the crawler (provider needs the page URL)
        if site_key and page_url:
            return site_key, page_url
        raise CaptchaServiceError(f"{self.kind} needs params['sitekey'] and ['url']")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class RecaptchaV2Solver(_BaseTokenSolver):
    """reCAPTCHA v2 via the injected token provider."""

    kind = "recaptcha_v2"
    _marker = "recaptcha"

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Ask the provider for a v2 token; ``transport`` is not used."""
        site_key, page_url = self._require(challenge)
        token = self._p.solve_recaptcha_v2(site_key, page_url)
        return Solved(token=token)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class RecaptchaV3Solver(_BaseTokenSolver):
    """reCAPTCHA v3 via the injected token provider."""

    kind = "recaptcha_v3"
    _marker = "recaptcha"

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Ask the provider for a v3 token; ``transport`` is not used.

        ``params["action"]`` is forwarded as ``action`` (None when absent).
        """
        site_key, page_url = self._require(challenge)
        action = challenge.params.get("action")
        token = self._p.solve_recaptcha_v3(site_key, page_url, action=action)
        return Solved(token=token)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class HcaptchaSolver(_BaseTokenSolver):
    """hCaptcha via the injected token provider."""

    kind = "hcaptcha"
    _marker = "hcaptcha"

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Ask the provider for an hCaptcha token; ``transport`` is not used."""
        site_key, page_url = self._require(challenge)
        token = self._p.solve_hcaptcha(site_key, page_url)
        return Solved(token=token)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Cloudflare Turnstile — BROWSERLESS solver scaffold.
|
|
2
|
+
|
|
3
|
+
`detect()` works today (finds the widget + sitekey). `solve()` is a TODO for a manual,
|
|
4
|
+
browserless crack — it fails loudly (NotImplementedError), never silently.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
from .base import Challenge, Solved
|
|
10
|
+
|
|
11
|
+
_SIGNATURE = re.compile(r"challenges\.cloudflare\.com/turnstile|cf-turnstile|turnstile\.render", re.I)
|
|
12
|
+
_SITEKEY_RE = re.compile(r'(?:data-sitekey|sitekey)["\']?\s*[:=]\s*["\']([0-9A-Za-z_-]{8,})["\']')
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TurnstileSolver:
    """Cloudflare Turnstile: detection works, solving is an unimplemented scaffold."""

    kind = "turnstile"

    @classmethod
    def detect(cls, text: str):
        """Return a ``turnstile`` Challenge when the page carries a Turnstile signature, else None.

        The challenge's ``sitekey`` param is the widget's key, or None when no
        key is found in the page.
        """
        page = text or ""
        if _SIGNATURE.search(page) is None:
            return None
        key_match = _SITEKEY_RE.search(page)
        sitekey = key_match.group(1) if key_match else None
        return Challenge(kind=cls.kind, params={"sitekey": sitekey})

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Not implemented yet — always raises NotImplementedError (fails loudly)."""
        # TODO(crawlerkit): implement the BROWSERLESS Cloudflare Turnstile solve.
        # Turnstile mints a `cf-turnstile-response` token by running obfuscated,
        # fingerprint-bearing widget JS. Browserless approach to fill in here:
        #   1. First try the PASSIVE path — a clean impersonated identity (the active Profile +
        #      proxy IP) often receives a token with no interactive challenge. Attempt that first.
        #   2. Otherwise fetch the widget bundle (turnstile/v0/api.js + the challenge for
        #      params["sitekey"]) and execute its JS in a JS runtime (QuickJS via py-mini-racer,
        #      or a Node subprocess) with a minimal DOM/navigator shim seeded from the active
        #      Profile (UA, sec-ch-ua, screen, languages) and the leased proxy IP.
        #   3. return Solved(token=<cf-turnstile-response>, expires_at=now+~300).
        message = (
            f"browserless Turnstile solve is a TODO (params={challenge.params!r}) "
            "— implement the passive/JS-runtime crack"
        )
        raise NotImplementedError(message)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Optional cookie-jar persistence across crawls.
|
|
2
|
+
|
|
3
|
+
curl_cffi keeps cookies within a Session automatically (a GET that sets JSESSIONID is reused by the
|
|
4
|
+
following POST — no action needed). These helpers add OPTIONAL cross-run warming: dump the jar to
|
|
5
|
+
disk after a crawl and reload it before the next. Best-effort; never raises on a malformed file.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def save_cookies(transport, path: str) -> int:
    """Write the transport's current cookies to ``path`` as a JSON list.

    Each entry holds ``name``, ``value``, ``domain`` and ``path``. Returns the
    number of cookies written. If the cookie jar cannot be read, nothing is
    written and 0 is returned.
    """
    try:
        snapshot = [
            {"name": c.name, "value": c.value, "domain": c.domain, "path": c.path}
            for c in transport._session.cookies.jar
        ]
    except Exception:  # noqa: BLE001 — cookie internals vary; best-effort
        return 0

    with open(path, "w", encoding="utf-8") as out:
        json.dump(snapshot, out)
    return len(snapshot)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def load_cookies(transport, path: str) -> int:
    """Load cookies from `path` into the transport's session.

    Best-effort and non-raising for bad files: returns 0 when the file is
    missing or unreadable, is not valid JSON, or holds JSON that is not a list.
    Individual entries that are malformed, or that the cookie jar rejects, are
    skipped.

    Returns the number of cookies actually set.
    """
    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: missing file, directory, permission problem.
        # ValueError: invalid JSON or undecodable bytes.
        return 0
    if not isinstance(data, list):
        # e.g. `42` or `null`: iterating these would raise TypeError.
        return 0
    n = 0
    for c in data:
        try:
            transport._session.cookies.set(
                c["name"], c["value"], domain=c.get("domain"), path=c.get("path", "/")
            )
            n += 1
        except Exception:  # noqa: BLE001 — skip a bad entry, keep loading the rest
            continue
    return n
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Crawl error taxonomy + a response block-detector.
|
|
2
|
+
|
|
3
|
+
`BaseCrawler.run()` reacts by class:
|
|
4
|
+
- `TransientError` -> back off, retry with the SAME identity (network blip / timeout / 5xx).
|
|
5
|
+
- `BlockedError` -> rotate identity + proxy, then retry (anti-bot block: 403/429/challenge page).
|
|
6
|
+
- `PermanentError` -> fail fast, no retry (bad input / unrecoverable).
|
|
7
|
+
Anything else propagates unchanged.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class CrawlerError(Exception):
    """Root of the crawl-stage exception hierarchy."""


class TransientError(CrawlerError):
    """Temporary failure (network blip, timeout, 5xx) — retry without changing anything."""


class PermanentError(CrawlerError):
    """Unrecoverable failure (bad input, hard 4xx) — retrying will not help."""


class BlockedError(CrawlerError):
    """Anti-bot block (403/429 or a challenge page) — rotate identity and proxy, then retry."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Common interstitial/anti-bot markers (Cloudflare, Akamai, Incapsula, generic).
|
|
30
|
+
_BLOCK_MARKERS = re.compile(
|
|
31
|
+
r"just a moment|attention required|access denied|/cdn-cgi/challenge|"
|
|
32
|
+
r"cf-error-details|akamai|incapsula|request unsuccessful",
|
|
33
|
+
re.I,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def raise_for_block(response) -> None:
    """Opt-in guard that turns a suspicious response into a crawl error.

    Intended to be called from `flow()` after a request whose success you do
    not fully trust. The status code and body are read with `getattr`, so a
    response lacking `status_code` or `text` is treated as status 0 / empty body.

    Raises:
        BlockedError: the status is 403 or 429, or one of the known challenge
            markers appears in the first 4000 characters of the body.
        TransientError: the status is 500 or above (and no block was detected).
    """
    code = getattr(response, "status_code", 0) or 0
    body = getattr(response, "text", "") or ""
    looks_blocked = code in (403, 429) or _BLOCK_MARKERS.search(body[:4000])
    if looks_blocked:
        raise BlockedError(f"anti-bot block detected (status={code})")
    if code < 500:
        return
    raise TransientError(f"server error (status={code})")
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Coherent browser identity via browserforge + curl_cffi impersonate.
|
|
2
|
+
|
|
3
|
+
curl_cffi's `impersonate` target owns the TLS/JA3 + HTTP2 fingerprint and MUST stay coherent with
|
|
4
|
+
the User-Agent — a UA that disagrees with the JA3 is worse than no spoofing. browserforge supplies a
|
|
5
|
+
realistic header SET + ORDER + locale; we pick the nearest supported impersonate target and SNAP the
|
|
6
|
+
UA/sec-ch-ua Chrome version to it, so UA <-> JA3 never drift. Each `generate()` randomizes (rotation),
|
|
7
|
+
and profiles are rotated together with the proxy.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
|
|
13
|
+
# curl_cffi impersonate targets we support, ascending by Chrome major.
|
|
14
|
+
_IMPERSONATE_BY_MAJOR: list[tuple[int, str]] = [
|
|
15
|
+
(120, "chrome120"),
|
|
16
|
+
(124, "chrome124"),
|
|
17
|
+
(131, "chrome131"),
|
|
18
|
+
(133, "chrome133a"),
|
|
19
|
+
]
|
|
20
|
+
DEFAULT_IMPERSONATE = "chrome131"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _impersonate_for_major(major: int) -> tuple[str, int]:
    """Pick the impersonate target for a generated Chrome major version.

    Returns ``(target, target_major)`` for the highest table entry whose major
    does not exceed ``major``. When ``major`` is below every supported entry,
    the lowest entry is returned instead. Relies on the table being sorted
    ascending by major.
    """
    eligible = [entry for entry in _IMPERSONATE_BY_MAJOR if entry[0] <= major]
    best_major, best_target = eligible[-1] if eligible else _IMPERSONATE_BY_MAJOR[0]
    return best_target, best_major
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _snap_version(headers: dict, gen_major: int, target_major: int) -> None:
|
|
33
|
+
"""Rewrite the UA + sec-ch-ua Chrome version from gen_major to target_major (in place)."""
|
|
34
|
+
if gen_major == target_major:
|
|
35
|
+
return
|
|
36
|
+
if ua := headers.get("User-Agent"):
|
|
37
|
+
headers["User-Agent"] = re.sub(r"Chrome/\d+", f"Chrome/{target_major}", ua)
|
|
38
|
+
if sch := headers.get("sec-ch-ua"): # only the Chrome/Chromium brands carry the major
|
|
39
|
+
headers["sec-ch-ua"] = sch.replace(f'v="{gen_major}"', f'v="{target_major}"')
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass(frozen=True)
class Profile:
    """One browser identity: a curl_cffi impersonate target plus its request headers."""

    # Name of the curl_cffi impersonate target, e.g. "chrome131".
    impersonate: str
    # Header name -> value mapping; read it through `headers()`.
    _headers: dict = field(default_factory=dict)

    @property
    def user_agent(self) -> str:
        """The ``User-Agent`` header value, or an empty string when it is not set."""
        agent = self._headers.get("User-Agent", "")
        return agent

    def headers(self) -> dict:
        """Return a fresh shallow copy of the headers, safe for the caller to mutate."""
        snapshot = dict(self._headers)
        return snapshot
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _fallback_profile() -> Profile:
    """Build the fixed profile used when browserforge cannot supply headers.

    The User-Agent advertises Chrome 131, in line with the default impersonate
    target, so the two stay coherent.
    """
    static_headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/131.0.0.0 Safari/537.36"
        ),
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,*/*;q=0.8"
        ),
        "Accept-Language": "pt-BR,pt;q=0.9,en;q=0.8",
        "Accept-Encoding": "gzip, deflate, br, zstd",  # curl_cffi decodes br/zstd natively
    }
    return Profile(impersonate=DEFAULT_IMPERSONATE, _headers=static_headers)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class ProfileGenerator:
    """Produce coherent Profiles from browserforge, snapped to a curl_cffi impersonate target."""

    def __init__(self, *, browser="chrome", os=("windows", "linux"), device="desktop", locale="pt-BR"):
        """Create the underlying browserforge ``HeaderGenerator``.

        All arguments are forwarded to ``HeaderGenerator`` unchanged. If
        browserforge cannot be imported or constructed, the generator is left
        unset and `generate()` serves the static fallback profile.
        """
        try:
            from browserforge.headers import HeaderGenerator

            generator = HeaderGenerator(browser=browser, os=os, device=device, locale=locale)
        except Exception:  # noqa: BLE001 — browserforge optional; fall back to a static profile
            generator = None
        self._hg = generator

    def generate(self) -> Profile:
        """Return a new Profile whose UA/sec-ch-ua major matches its impersonate target.

        Falls back to the static profile when no generator is available or when
        header generation raises.
        """
        if self._hg is None:
            return _fallback_profile()
        try:
            generated = dict(self._hg.generate())
        except Exception:  # noqa: BLE001
            return _fallback_profile()
        found = re.search(r"Chrome/(\d+)", generated.get("User-Agent", ""))
        # No Chrome token in the UA: assume major 131.
        gen_major = 131 if found is None else int(found.group(1))
        target, target_major = _impersonate_for_major(gen_major)
        _snap_version(generated, gen_major, target_major)
        return Profile(impersonate=target, _headers=generated)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# Lazily created, process-wide generator shared by every `pick()` call.
_DEFAULT_GEN: ProfileGenerator | None = None


def pick(pool=None, index: int = 0) -> Profile:
    """Return a freshly generated, coherent Profile.

    The shared default generator is created on first use. ``pool`` and
    ``index`` are accepted but not read by this implementation; each call
    simply asks the generator for a new profile.
    """
    global _DEFAULT_GEN
    generator = _DEFAULT_GEN
    if generator is None:
        generator = _DEFAULT_GEN = ProfileGenerator()
    return generator.generate()
|
crawlerkit/core/proxy.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Proxy leasing.
|
|
2
|
+
|
|
3
|
+
The leased egress is what the transport binds AND what any captcha solver uses — so a
|
|
4
|
+
risk-scored token is minted from the same IP that will submit it. This module ships Null, Static,
|
|
5
|
+
BrightData and DataImpulse (sticky-session) providers; VPN-as-proxy comes later.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
|
|
13
|
+
class ProxyLease:
|
|
14
|
+
url: str | None # e.g. "http://user:pass@host:port", or None for direct
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ProxyProvider:
    """Interface for proxy sources: hand out leases and optionally take them back."""

    def lease(self, key: str | None = None) -> ProxyLease:
        """Return a lease, optionally associated with ``key``. Subclasses must override."""
        raise NotImplementedError

    def release(self, lease: ProxyLease) -> None:  # noqa: B027 — optional hook
        """Give a lease back. The base implementation does nothing."""
        return None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class NullProxyProvider(ProxyProvider):
    """Provider for direct egress: every lease has no proxy URL."""

    def lease(self, key: str | None = None) -> ProxyLease:
        """Return a lease with ``url=None``; ``key`` is not used."""
        direct = ProxyLease(url=None)
        return direct
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class StaticProxyProvider(ProxyProvider):
    """Cycle through a fixed proxy list in round-robin order.

    The list comes from the ``proxies`` argument, or, when that is missing or
    empty, from the comma-separated ``CRAWLERKIT_PROXIES`` environment variable.
    """

    def __init__(self, proxies: list[str] | None = None):
        """Store the proxy list, reading the environment when none is supplied."""
        if not proxies:
            raw = os.environ.get("CRAWLERKIT_PROXIES", "")
            # Trim whitespace and drop blank items (e.g. from a trailing comma).
            proxies = [entry.strip() for entry in raw.split(",") if entry.strip()]
        self._proxies = proxies
        self._i = 0

    def lease(self, key: str | None = None) -> ProxyLease:
        """Return the next proxy in rotation, or a direct lease when the list is empty.

        ``key`` is not used by this provider.
        """
        pool = self._proxies
        if not pool:
            return ProxyLease(url=None)
        position = self._i % len(pool)
        self._i += 1
        return ProxyLease(url=pool[position])
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class BrightDataProxyProvider(ProxyProvider):
    """BrightData (Luminati) sticky-session proxy.

    The session id (seeded from `key`, e.g. the crawl item) pins a sticky egress
    IP so retries reuse it. Credentials come from the constructor arguments or
    from the env vars BRIGHT_DATA_USER/PASS/HOST/PORT.
    NOTE: the exact username param syntax varies by zone/plan.
    """

    def __init__(self, user=None, password=None, host=None, port=None, sticky=True):
        """Resolve credentials; each argument falls back to its environment variable."""
        self.user = user or os.environ.get("BRIGHT_DATA_USER", "")
        self.password = password or os.environ.get("BRIGHT_DATA_PASS", "")
        self.host = host or os.environ.get("BRIGHT_DATA_HOST", "")
        self.port = port or os.environ.get("BRIGHT_DATA_PORT", "22225")
        self.sticky = sticky
        self._n = 0  # counter used to mint session ids when no key is given

    def lease(self, key: str | None = None) -> ProxyLease:
        """Return a lease for the configured proxy, or a direct lease if user/host is unset.

        With `sticky` enabled, ``-session-<id>`` is appended to the username,
        where the id is `key` or, when `key` is empty, a per-instance counter.

        The username and password are percent-encoded before being placed in
        the URL, so characters such as ``@``, ``:``, ``/`` or spaces in the
        credentials or in `key` cannot break it. Characters that RFC 3986
        allows in userinfo are left alone, as is ``%`` so that credentials
        supplied already percent-encoded are not encoded twice; a literal ``%``
        must therefore be given as ``%25``.
        """
        from urllib.parse import quote

        if not (self.user and self.host):
            return ProxyLease(url=None)
        user = self.user
        if self.sticky:
            sid = key or f"s{self._n}"
            self._n += 1
            user = f"{self.user}-session-{sid}"
        # RFC 3986 sub-delims plus "%" (kept so pre-encoded input passes through).
        safe = "!$&'()*+,;=%"
        userinfo = f"{quote(str(user), safe=safe)}:{quote(str(self.password), safe=safe)}"
        return ProxyLease(url=f"http://{userinfo}@{self.host}:{self.port}")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class DataImpulseProxyProvider(ProxyProvider):
    """DataImpulse rotating/sticky proxy.

    Credentials come from the constructor arguments or from the env vars
    DATA_IMPULSE_USER/PASS/HOST/PORT; country targeting is optional. The sticky
    session id is seeded from `key`.
    """

    def __init__(self, user=None, password=None, host=None, port=None, country=None, sticky=True):
        """Resolve credentials; each argument falls back to its environment variable."""
        self.user = user or os.environ.get("DATA_IMPULSE_USER", "")
        self.password = password or os.environ.get("DATA_IMPULSE_PASS", "")
        self.host = host or os.environ.get("DATA_IMPULSE_HOST", "gw.dataimpulse.com")
        self.port = port or os.environ.get("DATA_IMPULSE_PORT", "823")
        self.country = country
        self.sticky = sticky
        self._n = 0  # counter used to mint session ids when no key is given

    def lease(self, key: str | None = None) -> ProxyLease:
        """Return a lease for the configured proxy, or a direct lease if user/host is unset.

        The username gets ``__cr.<country>`` appended when a country is set and
        ``;sess-<id>`` appended when `sticky` is enabled, where the id is `key`
        or, when `key` is empty, a per-instance counter.

        The username and password are percent-encoded before being placed in
        the URL, so characters such as ``@``, ``:``, ``/`` or spaces in the
        credentials or in `key` cannot break it. Characters that RFC 3986
        allows in userinfo (including the ``;`` separator) are left alone, as
        is ``%`` so that credentials supplied already percent-encoded are not
        encoded twice; a literal ``%`` must therefore be given as ``%25``.
        """
        from urllib.parse import quote

        if not (self.user and self.host):
            return ProxyLease(url=None)
        user = self.user
        if self.country:
            user += f"__cr.{self.country}"
        if self.sticky:
            sid = key or f"s{self._n}"
            self._n += 1
            user += f";sess-{sid}"
        # RFC 3986 sub-delims plus "%" (kept so pre-encoded input passes through).
        safe = "!$&'()*+,;=%"
        userinfo = f"{quote(str(user), safe=safe)}:{quote(str(self.password), safe=safe)}"
        return ProxyLease(url=f"http://{userinfo}@{self.host}:{self.port}")
|
crawlerkit/core/tls.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""Per-host CA bundle builder with AIA repair.
|
|
2
|
+
|
|
3
|
+
Some hosts (e.g. Detran) serve only their leaf certificate and omit the intermediate, so
|
|
4
|
+
Python TLS verification fails with "unable to get local issuer certificate". A browser papers
|
|
5
|
+
over this by fetching the missing intermediate from the leaf's AIA "CA Issuers" URL; we do the
|
|
6
|
+
same here, generically: read the leaf's AIA, fetch + follow intermediates up to a trusted root,
|
|
7
|
+
and concatenate with certifi's roots. Verification stays ON. Cached per host.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import socket
|
|
12
|
+
import ssl
|
|
13
|
+
import tempfile
|
|
14
|
+
import urllib.request
|
|
15
|
+
|
|
16
|
+
import certifi
|
|
17
|
+
from cryptography import x509
|
|
18
|
+
from cryptography.hazmat.primitives import serialization
|
|
19
|
+
from cryptography.x509.oid import AuthorityInformationAccessOID, ExtensionOID
|
|
20
|
+
|
|
21
|
+
_CACHE_DIR = os.environ.get(
|
|
22
|
+
"CRAWLERKIT_CA_DIR", os.path.join(tempfile.gettempdir(), "crawlerkit-ca")
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _leaf_cert(host: str, port: int = 443) -> x509.Certificate:
    """Connect to ``host:port`` and return the certificate the server presents.

    The handshake deliberately uses an unverified context: the aim is only to
    read the certificate, including from hosts whose chain does not validate.
    The connect timeout is 30 seconds.
    """
    unverified = ssl._create_unverified_context()
    with (
        socket.create_connection((host, port), timeout=30) as raw_sock,
        unverified.wrap_socket(raw_sock, server_hostname=host) as tls_sock,
    ):
        der_bytes = tls_sock.getpeercert(binary_form=True)
    return x509.load_der_x509_certificate(der_bytes)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _ca_issuer_urls(cert: x509.Certificate) -> list[str]:
    """Return the "CA Issuers" locations from ``cert``'s AIA extension.

    Returns an empty list when the certificate has no Authority Information
    Access extension. Entries with other access methods are ignored.
    """
    try:
        extension = cert.extensions.get_extension_for_oid(
            ExtensionOID.AUTHORITY_INFORMATION_ACCESS
        )
    except x509.ExtensionNotFound:
        return []
    locations = []
    for description in extension.value:
        if description.access_method == AuthorityInformationAccessOID.CA_ISSUERS:
            locations.append(description.access_location.value)
    return locations
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _fetch_cert(url: str) -> x509.Certificate:
    """Download a certificate from ``url`` and parse it.

    The payload is parsed as PEM when, ignoring leading whitespace, it starts
    with ``-----BEGIN``; otherwise as DER. The request timeout is 30 seconds.
    """
    with urllib.request.urlopen(url, timeout=30) as response:  # noqa: S310 (public CA cert, http ok)
        payload = response.read()
    is_pem = payload.lstrip().startswith(b"-----BEGIN")
    loader = x509.load_pem_x509_certificate if is_pem else x509.load_der_x509_certificate
    return loader(payload)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def build_ca_bundle(host: str, port: int = 443, *, force: bool = False, max_depth: int = 4) -> str:
    """Return a path to a CA bundle = trusted roots + any intermediates `host` omits.

    The bundle is cached as ``<host>_<port>.pem`` in the cache directory and
    reused on later calls unless `force` is true. Starting from the server's
    leaf certificate, up to `max_depth` issuer certificates are fetched via the
    first http(s) "CA Issuers" URL of each certificate, stopping at a
    self-signed certificate or a URL already visited.

    Best-effort: if AIA repair fails, falls back to certifi roots plus whatever
    intermediates were fetched before the failure.

    The bundle is written to a temporary file in the cache directory and then
    moved into place, so a concurrent caller never sees a partially written
    bundle. Because it is created through `tempfile.mkstemp`, the file is
    readable by the owning user only.
    """
    os.makedirs(_CACHE_DIR, exist_ok=True)
    path = os.path.join(_CACHE_DIR, f"{host}_{port}.pem")
    if os.path.exists(path) and not force:
        return path

    with open(certifi.where(), encoding="utf-8") as f:
        roots = f.read()
    extra: list[str] = []
    try:
        cert = _leaf_cert(host, port)
        seen: set[str] = set()
        for _ in range(max_depth):
            urls = [u for u in _ca_issuer_urls(cert) if u.startswith(("http://", "https://"))]
            if not urls or urls[0] in seen:
                break
            seen.add(urls[0])
            cert = _fetch_cert(urls[0])
            extra.append(cert.public_bytes(serialization.Encoding.PEM).decode())
            if cert.issuer == cert.subject:  # reached a self-signed root
                break
    except Exception:  # noqa: BLE001 — never let CA discovery crash a crawl; roots-only fallback
        pass

    # Write next to the final path so os.replace stays on one filesystem.
    fd, tmp_path = tempfile.mkstemp(dir=_CACHE_DIR, suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(roots if roots.endswith("\n") else roots + "\n")
            for pem in extra:
                f.write(pem if pem.endswith("\n") else pem + "\n")
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a stray temp file behind when the write fails.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
    return path
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def client_cert_from_pfx(pfx_path: str, password: str | bytes | None, out_path: str | None = None) -> str:
    """Convert an ICP-Brasil / PKCS#12 `.pfx` into a combined PEM for mutual TLS.

    The PEM holds the unencrypted private key, the certificate and any CA
    chain certificates, in that order, and is intended for curl_cffi's
    `cert=` option. Port of alexandria/pfx_to_pem via `cryptography` (no
    pyOpenSSL).

    Args:
        pfx_path: Path to the PKCS#12 file.
        password: Its password; a `str` is encoded to bytes. None for no password.
        out_path: Destination PEM path. Defaults to ``<pfx basename>.pem`` in
            the cache directory.

    Returns:
        The path of the written PEM. The file is created with mode 600 because
        it contains the private key.
    """
    # Import the submodule explicitly: it is not guaranteed to be available as
    # an attribute of `serialization` unless something has already imported it.
    from cryptography.hazmat.primitives.serialization import pkcs12

    if isinstance(password, str):
        password = password.encode()
    with open(pfx_path, "rb") as f:
        data = f.read()
    key, cert, extra = pkcs12.load_key_and_certificates(data, password)
    os.makedirs(_CACHE_DIR, exist_ok=True)
    out_path = out_path or os.path.join(_CACHE_DIR, os.path.basename(pfx_path) + ".pem")
    # Create the file as 0600 up front so the key is never readable by others,
    # then tighten again in case the file already existed with looser bits.
    fd = os.open(out_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "wb") as f:
        os.chmod(out_path, 0o600)
        if key is not None:
            f.write(key.private_bytes(
                serialization.Encoding.PEM,
                serialization.PrivateFormat.TraditionalOpenSSL,
                serialization.NoEncryption(),
            ))
        if cert is not None:
            f.write(cert.public_bytes(serialization.Encoding.PEM))
        for ca in (extra or []):
            f.write(ca.public_bytes(serialization.Encoding.PEM))
    return out_path
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Fingerprinted HTTP transport — the only HTTP path.
|
|
2
|
+
|
|
3
|
+
A `curl_cffi` Session bound to one Profile (TLS/JA3 + UA + header order) + a proxy lease +
|
|
4
|
+
per-host verified CA bundle (with AIA repair). TLS/JA3 fingerprint is a property of THIS client,
|
|
5
|
+
so it is the foundation, not a plugin. `requests` is intentionally not used (giveaway fingerprint).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import random
|
|
10
|
+
import time
|
|
11
|
+
from urllib.parse import urlparse
|
|
12
|
+
|
|
13
|
+
import structlog
|
|
14
|
+
from curl_cffi import requests as cffi
|
|
15
|
+
from curl_cffi.requests import exceptions as _cffi_exc
|
|
16
|
+
|
|
17
|
+
from . import tls
|
|
18
|
+
from .errors import TransientError
|
|
19
|
+
from .identity import Profile
|
|
20
|
+
from .proxy import ProxyLease
|
|
21
|
+
|
|
22
|
+
# Module-level structlog logger, named after this module.
log = structlog.get_logger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Transport:
    """HTTP client bound to one identity: a curl_cffi Session, a Profile and a proxy lease.

    Requests are verified against a per-host CA bundle (built with AIA repair)
    unless verification is disabled, and can be rate-limited with a minimum
    interval between calls.
    """

    def __init__(self, profile: Profile, proxy: ProxyLease, *, verify: bool = True,
                 client_cert: str | None = None, min_interval: float | None = None):
        """Create the session and apply the profile's headers and the proxy.

        Args:
            profile: Identity supplying the impersonate target and headers.
            proxy: Leased egress; a lease without a URL means a direct connection.
            verify: Set to False to disable TLS verification for every request.
            client_cert: PEM (cert+key) for ICP-Brasil mutual TLS, or None.
            min_interval: Minimum seconds between requests. When None, the
                CRAWLERKIT_MIN_INTERVAL env var is used (default 0 = off).
        """
        self.profile = profile
        self.proxy = proxy
        self.verify = verify
        self.client_cert = client_cert  # PEM (cert+key) for ICP-Brasil mutual TLS, or None
        # politeness: minimum seconds between requests (+ up to 25% jitter). 0/None = off.
        self.min_interval = float(
            min_interval if min_interval is not None else os.environ.get("CRAWLERKIT_MIN_INTERVAL", 0)
        )
        self._last = 0.0  # time.monotonic() of the last throttled request
        self._ca: dict[str, str] = {}  # "host:port" -> CA bundle path
        self._session = cffi.Session(impersonate=profile.impersonate)
        self._session.headers.update(profile.headers())
        if proxy.url:
            self._session.proxies = {"http": proxy.url, "https": proxy.url}

    def _verify_for(self, url: str):
        """Return the `verify` value for ``url``: False, or a CA bundle path.

        The bundle is built for the URL's host and cached per host and port.
        An explicit port on an https URL is honoured; anything else uses 443.
        """
        if self.verify is False:
            return False
        parsed = urlparse(url)
        host = parsed.hostname or ""
        port = 443
        if parsed.scheme == "https":
            try:
                port = parsed.port or 443
            except ValueError:  # malformed port in the URL; keep the default
                port = 443
        cache_key = f"{host}:{port}"
        if cache_key not in self._ca:
            self._ca[cache_key] = tls.build_ca_bundle(host, port)
        return self._ca[cache_key]

    def _throttle(self) -> None:
        """Sleep as needed so requests are at least `min_interval` seconds apart.

        When a wait is required, up to 25% of `min_interval` is added as random
        jitter. Does nothing when `min_interval` is zero or negative.
        """
        if self.min_interval <= 0:
            return
        wait = self.min_interval - (time.monotonic() - self._last)
        if wait > 0:
            time.sleep(wait + random.uniform(0, self.min_interval * 0.25))
        self._last = time.monotonic()

    def request(self, method: str, url: str, **kw):
        """Send a request through the session and return its response.

        Defaults are filled in for `verify`, `impersonate`, `timeout` (30) and,
        when a client certificate is configured, `cert`; values passed by the
        caller win.

        Raises:
            TransientError: the underlying curl_cffi request failed.
        """
        # Resolve the CA bundle only when needed: building one may open a
        # network connection, which is wasted if the caller supplied `verify`.
        if "verify" not in kw:
            kw["verify"] = self._verify_for(url)
        kw.setdefault("impersonate", self.profile.impersonate)
        kw.setdefault("timeout", 30)
        if self.client_cert:
            kw.setdefault("cert", self.client_cert)
        self._throttle()
        log.debug("http", method=method, url=url, proxy=bool(self.proxy.url))
        try:
            return self._session.request(method, url, **kw)
        except _cffi_exc.RequestsError as e:  # network/curl failure -> transient (retryable)
            raise TransientError(f"{method} {url}: {e}") from e

    def get(self, url: str, **kw):
        """Shorthand for `request("GET", url, **kw)`."""
        return self.request("GET", url, **kw)

    def post(self, url: str, **kw):
        """Shorthand for `request("POST", url, **kw)`."""
        return self.request("POST", url, **kw)
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: crawlerkit-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Browserless crawler base: curl_cffi transport, TLS/AIA, identity, proxy, captcha, BaseCrawler/BaseParser.
|
|
5
|
+
Author-email: Lucas Caovilla <lucasgrisac@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/lucascaovilla/crawlerkit
|
|
8
|
+
Project-URL: Repository, https://github.com/lucascaovilla/crawlerkit
|
|
9
|
+
Project-URL: Documentation, https://github.com/lucascaovilla/crawlerkit#readme
|
|
10
|
+
Project-URL: Issues, https://github.com/lucascaovilla/crawlerkit/issues
|
|
11
|
+
Keywords: crawler,scraping,curl_cffi,tls,fingerprint,captcha,browserless
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Python: >=3.11
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: curl_cffi>=0.7
|
|
26
|
+
Requires-Dist: browserforge>=1.2
|
|
27
|
+
Requires-Dist: cryptography>=42
|
|
28
|
+
Requires-Dist: certifi>=2024.0
|
|
29
|
+
Requires-Dist: selectolax>=0.3
|
|
30
|
+
Requires-Dist: lxml>=5.0
|
|
31
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
32
|
+
Requires-Dist: structlog>=24.1
|
|
33
|
+
Requires-Dist: tenacity>=8.2
|
|
34
|
+
Requires-Dist: weasyprint>=60
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
37
|
+
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
38
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
39
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
40
|
+
Requires-Dist: commitizen>=3.27; extra == "dev"
|
|
41
|
+
Provides-Extra: docs
|
|
42
|
+
Requires-Dist: mkdocs-material>=9.5; extra == "docs"
|
|
43
|
+
Requires-Dist: mkdocstrings[python]>=0.25; extra == "docs"
|
|
44
|
+
Dynamic: license-file
|
|
45
|
+
|
|
46
|
+
# crawlerkit-core
|
|
47
|
+
|
|
48
|
+
[PyPI](https://pypi.org/project/crawlerkit-core/)
|
|
49
|
+
[Python versions](https://pypi.org/project/crawlerkit-core/)
|
|
50
|
+
[CI](https://github.com/lucascaovilla/crawlerkit/actions/workflows/ci.yml)
|
|
51
|
+
[License: MIT](LICENSE)
|
|
52
|
+
|
|
53
|
+
A **standalone, browserless** crawler base (`crawlerkit.core`): fingerprinted **curl_cffi** transport,
|
|
54
|
+
per-host TLS with **AIA repair** + `.pfx` client certs, **browserforge** identity (UA snapped to the
|
|
55
|
+
impersonate target), proxy providers, a pluggable **captcha** registry, an error taxonomy with
|
|
56
|
+
retry+rotation, and the `BaseCrawler.flow()` / `BaseParser.parse()` hooks. Zero non-PyPI dependencies —
|
|
57
|
+
`parse()` returns **your own type**, not one the library dictates.
|
|
58
|
+
|
|
59
|
+
## Install
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install crawlerkit-core
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Use
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from crawlerkit.core import BaseCrawler, BaseParser, RawResponse, Transport, Profile
|
|
69
|
+
from crawlerkit.core.captcha import default_registry, McaptchaPowSolver, mcaptcha_hint
|
|
70
|
+
from crawlerkit.core.proxy import StaticProxyProvider, BrightDataProxyProvider
|
|
71
|
+
from crawlerkit.core.errors import BlockedError, TransientError, raise_for_block
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
**HTTP is curl_cffi only — `requests` is never used.** Deps: curl_cffi, browserforge, cryptography,
|
|
75
|
+
certifi, selectolax, lxml, beautifulsoup4, weasyprint, structlog, tenacity.
|
|
76
|
+
|
|
77
|
+
**Build a crawler:** [GETTING_STARTED.md](GETTING_STARTED.md). **Run the demos:**
|
|
78
|
+
[`examples/`](examples/) (`quotes.py` — a full crawl+parse; `fingerprint_demo.py` — identity proof).
|
|
79
|
+
Reference: [`docs/`](docs/) (identity, transport-tls, proxy, captcha, cracking-govbr-turnstile, errors,
|
|
80
|
+
api). License: MIT.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
crawlerkit/core/__init__.py,sha256=aU6V1P8kGOOVFP6nWwgUSBacrJoucW1m3BAlIp7Mfps,230
|
|
2
|
+
crawlerkit/core/base_crawler.py,sha256=xGcQL8kToY1ENl7Q9FFmzFRwxiVtWhuqiEycKcI3Yfc,4666
|
|
3
|
+
crawlerkit/core/base_parser.py,sha256=6Heku1NZ6eTaNeMaFN9hRgLikcETZxIEeVT2oNvKIcw,2900
|
|
4
|
+
crawlerkit/core/cookies.py,sha256=PndaPAjN29m5LpIq0jq2WSNuZugteu4vVmX5v8Inu3g,1449
|
|
5
|
+
crawlerkit/core/errors.py,sha256=oogfj8t6I7SMEHOYirE6r4FRIMI9_TtUlLrJBpziL0A,1744
|
|
6
|
+
crawlerkit/core/identity.py,sha256=2Ga0-GIf3mcnkwCZSPQuRXh1NtuvQhoQ3-DG91b08Ek,4108
|
|
7
|
+
crawlerkit/core/proxy.py,sha256=FPfg45I8PiWI8s-I398_d3xiMJED68w7EokAHhUcqOk,3694
|
|
8
|
+
crawlerkit/core/tls.py,sha256=0xXGsDbR25kdP2n3C_5Mw6r7yazorpgHHCkvy1QMgkM,4550
|
|
9
|
+
crawlerkit/core/transport.py,sha256=i8QJsbY7TunCjvNrzHF44c3HvXd5QZq78dR2lxJG0ao,2913
|
|
10
|
+
crawlerkit/core/captcha/__init__.py,sha256=h5WEzZQ7De_ESGQy-T_lABYyQyhtDx1pEjFJr3dpoHI,843
|
|
11
|
+
crawlerkit/core/captcha/base.py,sha256=ZNwqyOGx2ibG6XUxKpDdyhkhz47S_CUB01WooWRqS2M,2830
|
|
12
|
+
crawlerkit/core/captcha/govbr.py,sha256=4OjczX6K9lhzE_ojpkS0ngfZKqIejJ_j0SFfCMfu4j8,1905
|
|
13
|
+
crawlerkit/core/captcha/llm_image.py,sha256=E3LnA_VlKpPXei2T4HsmaQlbo9cdOqigC-fFv1bSU4A,2081
|
|
14
|
+
crawlerkit/core/captcha/mcaptcha.py,sha256=9fvRz1XgZnIArKPoZTgiVTqYazS0GitIFFxV8gyMyhI,5372
|
|
15
|
+
crawlerkit/core/captcha/token_adapters.py,sha256=-GzRC8vMqNWINzT3tcRQln2lIDAC9L429W_eMKTTbyE,2747
|
|
16
|
+
crawlerkit/core/captcha/turnstile.py,sha256=G4MeD01QlFv0faItUtbFnmMkrQcGplAiVyPJCQZmcPQ,1925
|
|
17
|
+
crawlerkit_core-0.1.0.dist-info/licenses/LICENSE,sha256=kAsK6_g7uDe4qOBRw-suoniIN_7YxJsmXCZs-XoqHfE,1071
|
|
18
|
+
crawlerkit_core-0.1.0.dist-info/METADATA,sha256=LvKXmYcJ-GhsNK1nRZ3bxY3l_i_RkcdTJWddYm_UuNs,3738
|
|
19
|
+
crawlerkit_core-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
20
|
+
crawlerkit_core-0.1.0.dist-info/top_level.txt,sha256=vy8AhdTkmxRHsuY8cQY-yWv1bWpk7JkaAMvF3JuqinU,11
|
|
21
|
+
crawlerkit_core-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Lucas Caovilla
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
crawlerkit
|