crawlerkit-core 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ from .base_crawler import BaseCrawler, RawResponse
2
+ from .base_parser import BaseParser
3
+ from .identity import Profile
4
+ from .transport import Transport
5
+
6
+ __all__ = ["BaseCrawler", "BaseParser", "RawResponse", "Transport", "Profile"]
@@ -0,0 +1,126 @@
1
+ """BaseCrawler — the crawl stage. A new target fills one hook: flow()."""
2
+
3
+ import random
4
+ import time
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass, field
7
+
8
+ import structlog
9
+ from bs4 import BeautifulSoup
10
+
11
+ from .captcha.base import CaptchaRegistry, Challenge, default_registry
12
+ from .errors import BlockedError, PermanentError, TransientError
13
+ from .identity import Profile, pick
14
+ from .proxy import NullProxyProvider, ProxyProvider
15
+ from .transport import Transport
16
+
17
+ log = structlog.get_logger(__name__)
18
+
19
+
20
@dataclass
class RawResponse:
    """Unparsed result of a crawl, passed from the crawl stage to the parse stage."""

    url: str  # URL this response is attributed to
    status: int  # status code reported for the response
    text: str  # response body as text
    headers: dict = field(default_factory=dict)  # fresh dict per instance when omitted
26
+
27
+
28
class BaseCrawler(ABC):
    """Crawl stage: owns transport + identity + proxy + captcha; subclasses implement flow().

    No business logic and no parsing live here — crawl and return the raw response.
    """

    # Known challenge (e.g. a sitekey) passed to the captcha registry as `hint` for pages
    # where the widget isn't inline in the fetched HTML.
    captcha_hint: Challenge | None = None

    def __init__(
        self,
        *,
        proxy_provider: ProxyProvider | None = None,
        registry: CaptchaRegistry | None = None,
        verify: bool = True,
        profile: Profile | None = None,
        client_cert: str | None = None,
        max_attempts: int = 3,
    ):
        """Store the configuration and build the first identity/proxy/transport triple.

        Args:
            proxy_provider: source of proxy leases; defaults to ``NullProxyProvider()``.
            registry: captcha registry; defaults to ``default_registry()``.
            verify: forwarded to ``Transport`` as ``verify``.
            profile: fixed identity; when omitted, ``pick()`` is called on every (re)build.
            client_cert: forwarded to ``Transport`` as ``client_cert``.
            max_attempts: number of times ``run()`` will call ``flow()`` before giving up.
        """
        self._proxy_provider = proxy_provider or NullProxyProvider()
        self._verify = verify
        self._client_cert = client_cert
        self._fixed_profile = profile
        self.max_attempts = max_attempts
        self.registry = registry or default_registry()
        self._build_transport()

    def _build_transport(self) -> None:
        """(Re)create identity + proxy lease + transport — on init and on each rotation."""
        self.profile = self._fixed_profile or pick()
        self.proxy = self._proxy_provider.lease()
        self.transport = Transport(
            self.profile, self.proxy, verify=self._verify, client_cert=self._client_cert
        )

    def _rotate(self) -> None:
        """Switch to a fresh identity + proxy, handing the outgoing lease back to its provider."""
        log.info("rotate_identity_proxy")
        # Providers expose release() as an optional hook; call it so the abandoned lease is
        # not held forever. getattr keeps duck-typed providers without the hook working.
        outgoing = getattr(self, "proxy", None)
        release = getattr(self._proxy_provider, "release", None)
        if outgoing is not None and callable(release):
            release(outgoing)
        self._build_transport()

    # --- helpers exposed to flow() ---
    def get(self, url: str, **kw):
        """GET `url` through the current transport; keyword arguments are passed through."""
        return self.transport.get(url, **kw)

    def post(self, url: str, **kw):
        """POST to `url` through the current transport; keyword arguments are passed through."""
        return self.transport.post(url, **kw)

    def solve_captcha(self, source) -> str | None:
        """detect+solve; returns a token, None (no challenge), or raises UnsupportedCaptcha."""
        solved = self.registry.solve(source, self.transport, hint=self.captcha_hint)
        return solved.token if solved else None

    def hidden_fields(self, html: str) -> dict:
        """All hidden inputs of the form (JSF ViewState / WebForms __VIEWSTATE postback state).

        Scans the first ``<form>`` (or the whole document when there is none) and returns
        ``{name: value}`` for inputs that are ``type="hidden"`` or whose name contains
        "viewstate" in any casing. Inputs without a ``value`` map to ``""``.
        """
        try:
            soup = BeautifulSoup(html, "lxml")
        except Exception:  # noqa: BLE001 — e.g. lxml not installed; use the stdlib parser
            soup = BeautifulSoup(html, "html.parser")
        form = soup.find("form") if soup else None
        scope = form or soup
        hidden: dict[str, str] = {}
        if scope:
            for inp in scope.find_all("input"):
                name = inp.get("name")
                # The upper-cased test also covers the mixed-case "ViewState" spelling.
                if name and (inp.get("type") == "hidden" or "VIEWSTATE" in name.upper()):
                    hidden[name] = inp.get("value", "")
        return hidden

    # --- the only required hook ---
    @abstractmethod
    def flow(self, params: dict) -> RawResponse:
        """Perform the target-specific request sequence and return the raw response."""
        ...

    def run(self, params: dict) -> RawResponse:
        """Run flow() with retry + rotation.

        TransientError -> back off, retry (same identity); BlockedError -> rotate
        identity+proxy, back off, then retry; PermanentError -> fail fast. Any other
        exception propagates unchanged. After ``max_attempts`` failures the last captured
        error is re-raised (RuntimeError if no attempt was made at all).
        """
        last: Exception | None = None
        for attempt in range(1, self.max_attempts + 1):
            try:
                log.info("crawl_start", crawler=type(self).__name__, attempt=attempt)
                raw = self.flow(params)
                log.info("crawl_done", status=raw.status, bytes=len(raw.text))
                return raw
            except PermanentError:
                raise
            except BlockedError as e:
                last = e
                log.warning("blocked", attempt=attempt, error=str(e))
                if attempt < self.max_attempts:  # no point rotating after the final try
                    self._rotate()
                    self._backoff(attempt)
            except TransientError as e:
                last = e
                log.warning("transient", attempt=attempt, error=str(e))
                if attempt < self.max_attempts:
                    self._backoff(attempt)
        raise last or RuntimeError("crawl failed with no captured error")

    @staticmethod
    def _backoff(attempt: int, cap: float = 30.0) -> None:
        """Sleep 2**attempt seconds plus up to 1s of jitter, never longer than `cap`."""
        time.sleep(min(2.0**attempt + random.uniform(0, 1), cap))
@@ -0,0 +1,77 @@
1
+ """BaseParser — the parse stage. A new target fills one hook: parse().
2
+
3
+ Pure + item-local: no network beyond fetching static assets for the optional PDF, no
4
+ cross-item state. Operates on the RawResponse the crawler returned (or a replayed one).
5
+ """
6
+
7
+ from abc import ABC, abstractmethod
8
+ from typing import Generic, TypeVar
9
+ from urllib.parse import urlparse
10
+
11
+ import structlog
12
+
13
+ from .base_crawler import RawResponse
14
+
15
+ log = structlog.get_logger(__name__)
16
+
17
+ #: What ``parse()`` yields — your own model, a dataclass, a ``dict``, anything.
18
+ #: crawlerkit-core stays dependency-free: it never dictates the output type.
19
+ T = TypeVar("T")
20
+
21
+ # Print fixups: hide leftover form inputs, landscape, fit wide tables.
22
+ _PDF_FIXUP_CSS = """
23
+ input { display: none !important; }
24
+ @page { size: A4 landscape; margin: 1.2cm; }
25
+ table { font-size: 9px; table-layout: fixed; width: 100%; }
26
+ td, th { overflow-wrap: anywhere; }
27
+ """
28
+
29
+
30
def render_pdf(html: str, base_url: str) -> bytes:
    """Render `html` to PDF bytes with WeasyPrint (no browser), applying _PDF_FIXUP_CSS.

    http(s) assets are fetched with curl_cffi using a CA bundle built by the package's
    ``tls`` module for the asset's host; every other URL scheme goes to WeasyPrint's
    default fetcher. No `requests`.
    """
    from curl_cffi import requests as cffi
    from weasyprint import CSS, HTML, default_url_fetcher

    from . import tls

    def _fetch_asset(url: str, **kw):
        # Non-http(s) URLs are left to WeasyPrint.
        if not url.startswith(("http://", "https://")):
            return default_url_fetcher(url, **kw)
        host = urlparse(url).hostname or ""
        try:
            resp = cffi.get(
                url, verify=tls.build_ca_bundle(host), timeout=30, impersonate="chrome131"
            )
            content_type = resp.headers.get("content-type", "")
            result = {"string": resp.content, "redirected_url": str(resp.url)}
            mime = content_type.split(";")[0].strip()
            if mime:
                result["mime_type"] = mime
            return result
        except Exception as e:  # noqa: BLE001 — a missing asset must not kill the PDF
            log.warning("pdf_asset_skipped", url=url, error=str(e))
            return {"string": b"", "mime_type": "text/plain"}

    document = HTML(string=html, base_url=base_url, url_fetcher=_fetch_asset)
    return document.write_pdf(stylesheets=[CSS(string=_PDF_FIXUP_CSS)])
57
+
58
+
59
class BaseParser(ABC, Generic[T]):
    """Parse stage. Subclass with your own item type: ``class MyParser(BaseParser[MyModel])``
    (or ``BaseParser[dict]``). ``parse()`` returns ``list[T]``; the type is yours, not the lib's."""

    # Set to False on a subclass/instance to make pdf() return None instead of rendering.
    render_pdf_enabled: bool = True

    @abstractmethod
    def parse(self, raw: RawResponse) -> list[T]:
        """Turn the raw response into a list of items (the only required hook)."""

    def pdf(self, raw: RawResponse) -> bytes | None:
        """Render the response body to PDF, or return None when rendering is disabled."""
        return render_pdf(raw.text, base_url=raw.url) if self.render_pdf_enabled else None

    def run(self, raw: RawResponse) -> tuple[list[T], bytes | None]:
        """Parse first, log the item count, then render; returns ``(items, pdf_or_None)``."""
        parsed = self.parse(raw)
        log.info("parse_done", count=len(parsed))
        rendered = self.pdf(raw)
        return parsed, rendered
@@ -0,0 +1,34 @@
1
+ from .base import (
2
+ CaptchaRegistry,
3
+ CaptchaServiceError,
4
+ Challenge,
5
+ Solved,
6
+ UnsupportedCaptcha,
7
+ default_registry,
8
+ )
9
+ from .govbr import GovBrSolver
10
+ from .llm_image import LlmImageSolver
11
+ from .mcaptcha import McaptchaPowSolver, mcaptcha_hint
12
+ from .token_adapters import HcaptchaSolver, RecaptchaV2Solver, RecaptchaV3Solver, TokenProvider
13
+ from .turnstile import TurnstileSolver
14
+
15
+ __all__ = [
16
+ "Challenge",
17
+ "Solved",
18
+ "UnsupportedCaptcha",
19
+ "CaptchaServiceError",
20
+ "CaptchaRegistry",
21
+ "default_registry",
22
+ # own solvers
23
+ "McaptchaPowSolver",
24
+ "mcaptcha_hint",
25
+ "LlmImageSolver",
26
+ # browserless stubs (TODO crack)
27
+ "TurnstileSolver",
28
+ "GovBrSolver",
29
+ # optional token-adapters (opt-in)
30
+ "TokenProvider",
31
+ "RecaptchaV2Solver",
32
+ "RecaptchaV3Solver",
33
+ "HcaptchaSolver",
34
+ ]
@@ -0,0 +1,90 @@
1
+ """Captcha detection + a registry of our own solvers.
2
+
3
+ Three outcomes when a source (HTML or response) is checked:
4
+ - no challenge -> registry.solve returns None
5
+ - challenge + solver -> Solved{token, expires_at}
6
+ - challenge, no solver -> raise UnsupportedCaptcha
7
+
8
+ A solver produces a token; the backend (compute / LLM-image / JS-runtime) is its own business.
9
+ Tokens are single-use and solved on submit (never pre-solved).
10
+ """
11
+
12
+ from dataclasses import dataclass, field
13
+ from typing import Optional, Protocol, runtime_checkable
14
+
15
+
16
@dataclass
class Challenge:
    """A detected (or hinted) captcha: its kind plus whatever the solver needs to solve it."""

    kind: str  # the registry looks solvers up by this value
    params: dict = field(default_factory=dict)  # solver-specific inputs; fresh dict when omitted
20
+
21
+
22
+ @dataclass
23
+ class Solved:
24
+ token: str
25
+ expires_at: float | None = None # absolute epoch seconds, from the challenge's own ttl
26
+
27
+
28
class UnsupportedCaptcha(Exception):
    """A challenge was identified but no solver is registered for its kind.

    The offending kind is kept on ``.kind`` for callers that want to branch on it.
    """

    def __init__(self, kind: str):
        message = f"no solver registered for captcha kind: {kind}"
        super().__init__(message)
        self.kind = kind
32
+
33
+
34
class CaptchaServiceError(Exception):
    """Raised when a captcha backend answers with an unexpected or error response.

    Often transient, so callers may choose to retry.
    """
36
+
37
+
38
@runtime_checkable
class Solver(Protocol):
    """Structural interface every captcha solver satisfies."""

    kind: str  # key under which the registry stores the solver

    @classmethod
    def detect(cls, text: str) -> "Optional[Challenge]":
        """Inspect page text; return a Challenge when this solver's captcha is present."""
        ...

    def solve(self, challenge: "Challenge", transport) -> "Solved":
        """Produce a Solved token for `challenge`, using `transport` for any requests."""
        ...
48
+
49
+
50
class CaptchaRegistry:
    """Holds one solver per captcha kind; detects a challenge in a source and solves it."""

    def __init__(self) -> None:
        self._solvers: dict[str, Solver] = {}

    def register(self, solver: "Solver") -> "CaptchaRegistry":
        """Store `solver` under ``solver.kind`` (replacing any previous one); returns self."""
        self._solvers[solver.kind] = solver
        return self

    def detect(self, source) -> "Optional[Challenge]":
        """Return the first challenge any registered solver detects, in registration order.

        `source` is either the page text itself or an object with a ``.text`` attribute.
        """
        text = source if isinstance(source, str) else getattr(source, "text", "") or ""
        for solver in self._solvers.values():
            ch = solver.detect(text)
            if ch is not None:
                return ch
        return None

    def solve(self, source, transport, *, hint: "Optional[Challenge]" = None) -> "Optional[Solved]":
        """Detect and solve the challenge in `source`.

        Returns None when nothing is detected and no hint is given. When detection fails,
        `hint` is used as the challenge. When detection succeeds and `hint` has the same
        kind, the hint's params fill in whatever detection left missing or None (detected
        values win) — this is how caller-supplied context such as the page URL or a known
        sitekey reaches the solver. A hint of a different kind is ignored.

        Raises:
            UnsupportedCaptcha: a challenge exists but no solver is registered for its kind.
        """
        detected = self.detect(source)
        if detected is None:
            challenge = hint
        elif hint is not None and hint.kind == detected.kind:
            from dataclasses import replace

            found = {k: v for k, v in detected.params.items() if v is not None}
            # Build a new Challenge rather than mutating either input.
            challenge = replace(detected, params={**hint.params, **found})
        else:
            challenge = detected
        if challenge is None:
            return None
        solver = self._solvers.get(challenge.kind)
        if solver is None:
            raise UnsupportedCaptcha(challenge.kind)
        return solver.solve(challenge, transport)
74
+
75
+
76
def default_registry() -> CaptchaRegistry:
    """Build a registry holding the built-in solvers, registered in this order:
    mCaptcha PoW, Turnstile, gov.br. (The Turnstile and gov.br solvers are browserless
    stubs: detect works, solve raises NotImplementedError until cracked.)
    Optional token-adapters (reCAPTCHA/hCaptcha) and the LLM image solver are opt-in —
    register them yourself when configured."""
    from .govbr import GovBrSolver
    from .mcaptcha import McaptchaPowSolver
    from .turnstile import TurnstileSolver

    registry = CaptchaRegistry()
    for solver_cls in (McaptchaPowSolver, TurnstileSolver, GovBrSolver):
        registry.register(solver_cls())
    return registry
@@ -0,0 +1,40 @@
1
+ """gov.br (sso.acesso.gov.br) — BROWSERLESS solver scaffold.
2
+
3
+ gov.br SSO is the fleet's most common gate (~79 repos) and is browser-only in atlas today.
4
+ `detect()` works now; `solve()` is a TODO for a manual, browserless crack — fails loudly.
5
+ """
6
+
7
+ import re
8
+
9
+ from .base import Challenge, Solved
10
+
11
+ _SIGNATURE = re.compile(r"sso\.acesso\.gov\.br|acesso\.gov\.br|\bgovbr\b", re.I)
12
+ _SITEKEY_RE = re.compile(r'data-sitekey=["\']([0-9A-Za-z_-]{8,})["\']')
13
+
14
+
15
class GovBrSolver:
    """gov.br SSO gate: detection is implemented, solving is a loud-failing scaffold."""

    kind = "govbr"

    @classmethod
    def detect(cls, text: str):
        """Return a govbr Challenge when the gov.br signature appears in `text`, else None.

        ``params["sitekey"]`` carries the first ``data-sitekey`` found, or None.
        """
        page = text or ""
        if _SIGNATURE.search(page) is None:
            return None
        sitekey_match = _SITEKEY_RE.search(page)  # gov.br embeds hCaptcha/reCAPTCHA
        sitekey = sitekey_match.group(1) if sitekey_match else None
        return Challenge(kind=cls.kind, params={"sitekey": sitekey})

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Not implemented yet — always raises NotImplementedError."""
        # TODO(crawlerkit): implement the BROWSERLESS gov.br SSO authentication.
        # gov.br (sso.acesso.gov.br) is JS-heavy and gated by a captcha (hCaptcha/reCAPTCHA) plus
        # fingerprint checks. Browserless approach to fill in here:
        #   1. Drive the SSO step sequence with the verified curl_cffi transport, carrying cookies
        #      across redirects (login -> authorize -> callback).
        #   2. Solve the embedded captcha via the registry (hCaptcha/reCAPTCHA token solver) OR a
        #      JS-runtime crack of the gov.br challenge script (QuickJS/Node + DOM shim seeded from
        #      the active Profile + proxy IP).
        #   3. Complete the OAuth/SSO redirect; return Solved(token=<session cookie / SSO assertion>).
        # Note: some gov.br services accept ICP-Brasil mutual-TLS client certs — see crawlerkit.core.tls.
        raise NotImplementedError(
            f"browserless gov.br solve is a TODO (params={challenge.params!r}) "
            "— implement the SSO/JS-runtime crack"
        )
@@ -0,0 +1,46 @@
1
+ """Own image-captcha solver: fetch the challenge image over the verified transport, classify with
2
+ a pluggable vision LLM, return the answer/token. Provider-agnostic — inject a `classify` callable
3
+ `(image_bytes, prompt) -> str`. Prompts ported from atlas's GPT solver.
4
+
5
+ Image captchas are target-specific, so the crawler builds the Challenge with the image location
6
+ (`params["image_url"]` or `params["image_bytes"]`); `detect()` returns None.
7
+ """
8
+
9
+ from .base import CaptchaServiceError, Challenge, Solved
10
+
11
+ OCR_PROMPT = (
12
+ "This image is a CAPTCHA. Read the characters exactly. Respond with ONLY the characters "
13
+ "(letters/digits), no spaces, no explanation."
14
+ )
15
+ # 3x3 / 4x4 grid-selection prompts (hCaptcha / reCAPTCHA) are available for grid challenges;
16
+ # port the full set from atlas chatgpt_captcha_solver.py when wiring a grid flow.
17
+ GRID_3X3_PROMPT = (
18
+ "A reference image sits above a 3x3 grid (tiles numbered 1-9, left-to-right, top-to-bottom). "
19
+ "Return the tile numbers that clearly and fully match the reference, separated by '/', e.g. '2/5/9'. "
20
+ "If none match, return 'none'. No other text."
21
+ )
22
+
23
+
24
class LlmImageSolver:
    """Image-captcha solver that delegates reading the image to an injected `classify` callable."""

    kind = "image"

    def __init__(self, classify, *, prompt: str = OCR_PROMPT):
        # classify: Callable[[bytes, str], str] — (image_bytes, prompt) -> answer text
        self._classify = classify
        self._prompt = prompt

    @classmethod
    def detect(cls, text: str):
        """Always None: the crawler constructs the image Challenge explicitly."""
        return None

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Classify the challenge image and return the stripped answer as the token.

        Uses ``params["image_bytes"]`` when present, otherwise downloads
        ``params["image_url"]`` via `transport`. ``params["prompt"]`` overrides the
        solver's default prompt.

        Raises:
            CaptchaServiceError: no image source was given, or the answer came back empty.
        """
        params = challenge.params
        image = params.get("image_bytes")
        if image is None:
            image_url = params.get("image_url")
            if not image_url:
                raise CaptchaServiceError("LlmImageSolver needs params['image_url'] or ['image_bytes']")
            image = transport.get(image_url, timeout=30).content
        prompt = params.get("prompt", self._prompt)
        raw_answer = self._classify(image, prompt)
        answer = (raw_answer or "").strip()
        if not answer:
            raise CaptchaServiceError("vision LLM returned an empty answer")
        return Solved(token=answer)
@@ -0,0 +1,142 @@
1
+ """mCaptcha proof-of-work solver (compute backend, no third-party service).
2
+
3
+ Ported from the Detran POC. Byte layout confirmed empirically against a captured
4
+ (salt, string, nonce) -> result oracle (guarded by self_test). No sitekey secret needed.
5
+ """
6
+
7
+ import hashlib
8
+ import re
9
+ import struct
10
+ import time
11
+
12
+ from .base import CaptchaServiceError, Challenge, Solved
13
+
14
+ U128_MAX = (1 << 128) - 1
15
+
16
+ # Captured oracle (a real, self-consistent challenge) — proves the byte layout.
17
+ _ORACLE_SALT = "8c7b6a5d4e3f2a1b0c9d8e7f6a5b4c3d2e1f0a9b8c7d6e5f4a3b2c1d0e9f8a7b"
18
+ _ORACLE_STRING = "7KoiRWIqZk3qFy7C8Jt96E9KUSGfdbVL"
19
+ _ORACLE_DIFFICULTY = 4_000_000
20
+ _ORACLE_NONCE = 3_539_967
21
+ _ORACLE_RESULT = 340282365527686933810834880601832247926
22
+
23
+ # matches the mCaptcha widget iframe URL embedded in a page
24
+ _WIDGET_RE = re.compile(r"https?://([\w.-]+)/widget\?sitekey=([\w-]+)")
25
+
26
+
27
def mcaptcha_hint(host: str, sitekey: str) -> Challenge:
    """Build an mCaptcha Challenge from a known host + sitekey.

    For pages where the widget isn't inline in the GET HTML. ``api_base`` is derived
    from the host as ``https://<host>/api/v1/pow``.
    """
    params = {"host": host, "sitekey": sitekey, "api_base": f"https://{host}/api/v1/pow"}
    return Challenge(kind="mcaptcha", params=params)
33
+
34
+
35
+ def _headers(challenge: Challenge) -> dict:
36
+ host = challenge.params["host"]
37
+ sitekey = challenge.params["sitekey"]
38
+ return {
39
+ "Content-Type": "application/json",
40
+ "Accept": "*/*",
41
+ "Origin": f"https://{host}",
42
+ "Referer": f"https://{host}/widget?sitekey={sitekey}",
43
+ }
44
+
45
+
46
class McaptchaPowSolver:
    """Solve mCaptcha by computing its proof-of-work locally, then exchanging it for a token."""

    kind = "mcaptcha"

    # ---- detection ----
    @classmethod
    def detect(cls, text: str):
        """Return an mCaptcha Challenge when a widget URL is embedded in `text`, else None."""
        m = _WIDGET_RE.search(text or "")
        if not m:
            return None
        return mcaptcha_hint(host=m.group(1), sitekey=m.group(2))

    # ---- proof-of-work math ----
    @staticmethod
    def _prefix(salt: str, string: str):
        """SHA-256 state fed with salt, then the LE-u64 byte length of `string`, then `string`.

        The nonce is appended by the caller, so one prefix can be copied per candidate.
        """
        h = hashlib.sha256()
        h.update(salt.encode())
        sb = string.encode()
        h.update(struct.pack("<Q", len(sb)))  # bincode fixint LE u64 length prefix
        h.update(sb)
        return h

    @classmethod
    def compute_result(cls, salt: str, string: str, nonce: int) -> int:
        """Hash prefix + decimal-ASCII nonce; return the first 16 digest bytes as a big-endian int."""
        h = cls._prefix(salt, string)
        h.update(str(nonce).encode())  # nonce as decimal ASCII
        return int.from_bytes(h.digest()[:16], "big")  # first 16 bytes, big-endian

    @staticmethod
    def threshold(difficulty: int) -> int:
        """Smallest result that satisfies `difficulty`: U128_MAX - U128_MAX // difficulty."""
        return U128_MAX - U128_MAX // difficulty

    @classmethod
    def solve_pow(cls, salt, string, difficulty, max_iters=2_000_000_000, max_seconds=120):
        """Search nonces 1, 2, 3, ... for one whose result meets the difficulty threshold.

        Returns:
            ``(nonce, result_as_decimal_str, elapsed_ms)``.

        Raises:
            TimeoutError: `max_seconds` elapsed (checked every 0x40000 nonces).
            RuntimeError: `max_iters` nonces were tried without success.
        """
        thr = cls.threshold(difficulty)
        base = cls._prefix(salt, string)
        start = time.perf_counter()
        nonce = 0
        while nonce < max_iters:
            nonce += 1
            h = base.copy()  # reuse the hashed prefix instead of rehashing salt+string
            h.update(str(nonce).encode())
            value = int.from_bytes(h.digest()[:16], "big")
            if value >= thr:
                # `value` is exactly compute_result(salt, string, nonce); no need to rehash.
                elapsed_ms = int((time.perf_counter() - start) * 1000)
                return nonce, str(value), elapsed_ms
            if (nonce & 0x3FFFF) == 0 and (time.perf_counter() - start) > max_seconds:
                raise TimeoutError(f"PoW unsolved in {max_seconds}s (difficulty {difficulty})")
        raise RuntimeError("PoW unsolved within iteration cap")

    @classmethod
    def self_test(cls) -> None:
        """Oracle gate: assert the layout reproduces the captured result. Instant."""
        got = cls.compute_result(_ORACLE_SALT, _ORACLE_STRING, _ORACLE_NONCE)
        if got != _ORACLE_RESULT:
            raise AssertionError(f"mCaptcha layout self-test FAILED: {got} != {_ORACLE_RESULT}")

    # ---- solve (config -> pow -> verify -> token) ----
    def solve(self, challenge: "Challenge", transport) -> "Solved":
        """Run config -> pow -> verify, retrying up to 3 times on CaptchaServiceError.

        Sleeps 1s then 2s between attempts; the last error is re-raised without a
        trailing sleep. Other exceptions (e.g. TimeoutError from the PoW) propagate.
        """
        # mCaptcha's actor backend is intermittently flaky ("Actor mailbox error") on both
        # /config and /verify — retry the whole config -> pow -> verify a few times.
        attempts = 3
        last: Exception | None = None
        for attempt in range(1, attempts + 1):
            try:
                return self._solve_once(challenge, transport)
            except CaptchaServiceError as e:
                last = e
                if attempt < attempts:  # don't delay the final failure
                    time.sleep(1.0 * attempt)
        raise last  # type: ignore[misc]

    def _solve_once(self, challenge: "Challenge", transport) -> "Solved":
        """One config -> pow -> verify round trip.

        Raises:
            CaptchaServiceError: /config lacks salt/string/difficulty_factor or has a
                non-positive difficulty, or /verify returns no token.
        """
        api = challenge.params["api_base"]
        key = challenge.params["sitekey"]
        headers = _headers(challenge)

        cfg = transport.post(f"{api}/config", json={"key": key}, headers=headers).json()
        required = ("salt", "string", "difficulty_factor")
        # Check every field we read, so a partial reply is retried instead of raising KeyError.
        if not isinstance(cfg, dict) or any(field_name not in cfg for field_name in required):
            raise CaptchaServiceError(f"mcaptcha /config returned no challenge: {cfg!r}")
        difficulty = cfg["difficulty_factor"]
        # A zero/negative/non-numeric difficulty would divide by zero in threshold().
        if not isinstance(difficulty, (int, float)) or difficulty < 1:
            raise CaptchaServiceError(f"mcaptcha /config returned no challenge: {cfg!r}")

        nonce, result, elapsed_ms = self.solve_pow(cfg["salt"], cfg["string"], difficulty)
        verify = transport.post(
            f"{api}/verify",
            json={
                "key": key,
                "nonce": nonce,
                "result": result,
                "string": cfg["string"],
                "time": elapsed_ms,
                "worker_type": "wasm",
            },
            headers=headers,
        ).json()
        token = verify.get("token") if isinstance(verify, dict) else None
        if not token:
            raise CaptchaServiceError(f"mcaptcha /verify returned no token: {verify!r}")
        return Solved(token=token, expires_at=None)
@@ -0,0 +1,77 @@
1
+ """OPTIONAL token-captcha adapters: reCAPTCHA v2/v3 + hCaptcha via a third-party token provider.
2
+
3
+ Browserless (POST site_key + url -> token; no DOM). These reintroduce a paid third party, so they
4
+ are OPT-IN — NOT registered in `default_registry()`. Wire them only when configured:
5
+
6
+ reg.register(RecaptchaV2Solver(provider)).register(HcaptchaSolver(provider))
7
+
8
+ `provider` is any object implementing `TokenProvider` (e.g. an adapter around atlas's
9
+ AntiCaptcha/2Captcha solvers). `detect()` finds the sitekey; the crawler supplies params["url"]
10
+ (the page URL the provider needs) via a hint.
11
+ """
12
+
13
+ import re
14
+ from typing import Protocol, runtime_checkable
15
+
16
+ from .base import CaptchaServiceError, Challenge, Solved
17
+
18
+
19
@runtime_checkable
class TokenProvider(Protocol):
    """Structural interface of a third-party token service: one method per captcha family,
    each taking the site key and page URL and returning the token string."""

    def solve_recaptcha_v2(self, site_key: str, url: str, **kw) -> str:
        """Return a reCAPTCHA v2 token."""
        ...

    def solve_recaptcha_v3(self, site_key: str, url: str, **kw) -> str:
        """Return a reCAPTCHA v3 token."""
        ...

    def solve_hcaptcha(self, site_key: str, url: str, **kw) -> str:
        """Return an hCaptcha token."""
        ...
24
+
25
+
26
+ def _sitekey(text: str, marker: str) -> str | None:
27
+ if marker not in (text or "").lower():
28
+ return None
29
+ m = re.search(r'(?:data-sitekey|sitekey|render)["\']?\s*[:=]\s*["\']?([0-9A-Za-z_-]{20,})', text or "")
30
+ return m.group(1) if m else None
31
+
32
+
33
class _BaseTokenSolver:
    """Shared detection + parameter validation for the provider-backed token solvers.

    Subclasses set `kind` and `_marker` and implement ``solve()``.
    """

    kind = ""
    _marker = ""

    def __init__(self, provider: TokenProvider):
        self._p = provider

    @classmethod
    def detect(cls, text: str):
        """Return a Challenge carrying the site key found in `text`, or None if there is none."""
        site_key = _sitekey(text, cls._marker)
        if not site_key:
            return None
        return Challenge(kind=cls.kind, params={"sitekey": site_key})

    def _require(self, challenge: Challenge) -> tuple[str, str]:
        """Return ``(sitekey, url)`` from the challenge params.

        Raises:
            CaptchaServiceError: either value is missing or empty.
        """
        params = challenge.params
        site_key = params.get("sitekey")
        page_url = params.get("url")  # supplied by the crawler (provider needs the page URL)
        if site_key and page_url:
            return site_key, page_url
        raise CaptchaServiceError(f"{self.kind} needs params['sitekey'] and ['url']")
51
+
52
+
53
class RecaptchaV2Solver(_BaseTokenSolver):
    """reCAPTCHA v2 via the injected token provider."""

    kind = "recaptcha_v2"
    _marker = "recaptcha"

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Ask the provider for a v2 token; `transport` is not used."""
        site_key, page_url = self._require(challenge)
        token = self._p.solve_recaptcha_v2(site_key, page_url)
        return Solved(token=token)
60
+
61
+
62
class RecaptchaV3Solver(_BaseTokenSolver):
    """reCAPTCHA v3 via the injected token provider."""

    kind = "recaptcha_v3"
    _marker = "recaptcha"

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Ask the provider for a v3 token, forwarding ``params["action"]`` (None if absent);
        `transport` is not used."""
        site_key, page_url = self._require(challenge)
        action = challenge.params.get("action")
        token = self._p.solve_recaptcha_v3(site_key, page_url, action=action)
        return Solved(token=token)
69
+
70
+
71
class HcaptchaSolver(_BaseTokenSolver):
    """hCaptcha via the injected token provider."""

    kind = "hcaptcha"
    _marker = "hcaptcha"

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Ask the provider for an hCaptcha token; `transport` is not used."""
        site_key, page_url = self._require(challenge)
        token = self._p.solve_hcaptcha(site_key, page_url)
        return Solved(token=token)
@@ -0,0 +1,40 @@
1
+ """Cloudflare Turnstile — BROWSERLESS solver scaffold.
2
+
3
+ `detect()` works today (finds the widget + sitekey). `solve()` is a TODO for a manual,
4
+ browserless crack — it fails loudly (NotImplementedError), never silently.
5
+ """
6
+
7
+ import re
8
+
9
+ from .base import Challenge, Solved
10
+
11
+ _SIGNATURE = re.compile(r"challenges\.cloudflare\.com/turnstile|cf-turnstile|turnstile\.render", re.I)
12
+ _SITEKEY_RE = re.compile(r'(?:data-sitekey|sitekey)["\']?\s*[:=]\s*["\']([0-9A-Za-z_-]{8,})["\']')
13
+
14
+
15
class TurnstileSolver:
    """Cloudflare Turnstile: detection is implemented, solving is a loud-failing scaffold."""

    kind = "turnstile"

    @classmethod
    def detect(cls, text: str):
        """Return a turnstile Challenge when the Turnstile signature appears in `text`, else None.

        ``params["sitekey"]`` carries the first quoted sitekey found, or None.
        """
        page = text or ""
        if _SIGNATURE.search(page) is None:
            return None
        sitekey_match = _SITEKEY_RE.search(page)
        sitekey = sitekey_match.group(1) if sitekey_match else None
        return Challenge(kind=cls.kind, params={"sitekey": sitekey})

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Not implemented yet — always raises NotImplementedError."""
        # TODO(crawlerkit): implement the BROWSERLESS Cloudflare Turnstile solve.
        # Turnstile mints a `cf-turnstile-response` token by running obfuscated,
        # fingerprint-bearing widget JS. Browserless approach to fill in here:
        #   1. First try the PASSIVE path — a clean impersonated identity (the active Profile +
        #      proxy IP) often receives a token with no interactive challenge. Attempt that first.
        #   2. Otherwise fetch the widget bundle (turnstile/v0/api.js + the challenge for
        #      params["sitekey"]) and execute its JS in a JS runtime (QuickJS via py-mini-racer,
        #      or a Node subprocess) with a minimal DOM/navigator shim seeded from the active
        #      Profile (UA, sec-ch-ua, screen, languages) and the leased proxy IP.
        #   3. return Solved(token=<cf-turnstile-response>, expires_at=now+~300).
        raise NotImplementedError(
            f"browserless Turnstile solve is a TODO (params={challenge.params!r}) "
            "— implement the passive/JS-runtime crack"
        )
@@ -0,0 +1,40 @@
1
+ """Optional cookie-jar persistence across crawls.
2
+
3
+ curl_cffi keeps cookies within a Session automatically (a GET that sets JSESSIONID is reused by the
4
+ following POST — no action needed). These helpers add OPTIONAL cross-run warming: dump the jar to
5
+ disk after a crawl and reload it before the next. Best-effort; never raises on a malformed file.
6
+ """
7
+
8
+ import json
9
+
10
+
11
def save_cookies(transport, path: str) -> int:
    """Write the transport's current cookies to `path` as a JSON list; return how many.

    Each entry holds name, value, domain and path. If the cookie jar cannot be read,
    nothing is written and 0 is returned.
    """
    try:
        records = [
            {"name": ck.name, "value": ck.value, "domain": ck.domain, "path": ck.path}
            for ck in transport._session.cookies.jar
        ]
    except Exception:  # noqa: BLE001 — cookie internals vary; best-effort
        return 0
    with open(path, "w", encoding="utf-8") as out:
        json.dump(records, out)
    return len(records)
22
+
23
+
24
def load_cookies(transport, path: str) -> int:
    """Load cookies from `path` into the transport's session; return the count loaded.

    Best-effort: returns 0 when the file is absent, unreadable, not valid JSON, or not a
    JSON list. Individual entries that are malformed or rejected by the cookie jar are
    skipped. A missing "path" defaults to "/".
    """
    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):  # absent/unreadable file, bad JSON or bad encoding
        return 0
    if not isinstance(data, list):
        # Valid JSON of the wrong shape (number, null, object, ...) is still a malformed file.
        return 0
    n = 0
    for c in data:
        try:
            transport._session.cookies.set(
                c["name"], c["value"], domain=c.get("domain"), path=c.get("path", "/")
            )
        except Exception:  # noqa: BLE001 — bad entry or jar-specific rejection; skip it
            continue
        n += 1
    return n
@@ -0,0 +1,48 @@
1
+ """Crawl error taxonomy + a response block-detector.
2
+
3
+ `BaseCrawler.run()` reacts by class:
4
+ - `TransientError` -> back off, retry with the SAME identity (network blip / timeout / 5xx).
5
+ - `BlockedError` -> rotate identity + proxy, then retry (anti-bot block: 403/429/challenge page).
6
+ - `PermanentError` -> fail fast, no retry (bad input / unrecoverable).
7
+ Anything else propagates unchanged.
8
+ """
9
+
10
+ import re
11
+
12
+
13
class CrawlerError(Exception):
    """Root of the crawl-stage error hierarchy."""
15
+
16
+
17
class TransientError(CrawlerError):
    """Temporary failure (network blip, timeout, 5xx) — retry without changing anything."""
19
+
20
+
21
class PermanentError(CrawlerError):
    """Unrecoverable failure (bad input, hard 4xx) — retrying will not help."""
23
+
24
+
25
class BlockedError(CrawlerError):
    """Anti-bot block (403/429 or a challenge page) — rotate identity+proxy, then retry."""
27
+
28
+
29
+ # Common interstitial/anti-bot markers (Cloudflare, Akamai, Incapsula, generic).
30
+ _BLOCK_MARKERS = re.compile(
31
+ r"just a moment|attention required|access denied|/cdn-cgi/challenge|"
32
+ r"cf-error-details|akamai|incapsula|request unsuccessful",
33
+ re.I,
34
+ )
35
+
36
+
37
def raise_for_block(response) -> None:
    """Opt-in guard: raise if `response` looks blocked or failed server-side.

    Call from `flow()` after a request whose 200 you don't fully trust. Reads
    ``response.status_code`` and ``response.text`` (missing/None count as 0 and "").

    Raises:
        BlockedError: status is 403 or 429, or a block marker appears in the first
            4000 characters of the body.
        TransientError: status is 500 or above (checked after the block test).
    """
    status = getattr(response, "status_code", 0) or 0
    body = getattr(response, "text", "") or ""
    looks_blocked = status in (403, 429) or _BLOCK_MARKERS.search(body[:4000]) is not None
    if looks_blocked:
        raise BlockedError(f"anti-bot block detected (status={status})")
    if status >= 500:
        raise TransientError(f"server error (status={status})")
@@ -0,0 +1,108 @@
1
+ """Coherent browser identity via browserforge + curl_cffi impersonate.
2
+
3
+ curl_cffi's `impersonate` target owns the TLS/JA3 + HTTP2 fingerprint and MUST stay coherent with
4
+ the User-Agent — a UA that disagrees with the JA3 is worse than no spoofing. browserforge supplies a
5
+ realistic header SET + ORDER + locale; we pick the nearest supported impersonate target and SNAP the
6
+ UA/sec-ch-ua Chrome version to it, so UA <-> JA3 never drift. Each `generate()` randomizes (rotation),
7
+ and profiles are rotated together with the proxy.
8
+ """
9
+
10
+ import re
11
+ from dataclasses import dataclass, field
12
+
13
+ # curl_cffi impersonate targets we support, ascending by Chrome major.
14
+ _IMPERSONATE_BY_MAJOR: list[tuple[int, str]] = [
15
+ (120, "chrome120"),
16
+ (124, "chrome124"),
17
+ (131, "chrome131"),
18
+ (133, "chrome133a"),
19
+ ]
20
+ DEFAULT_IMPERSONATE = "chrome131"
21
+
22
+
23
def _impersonate_for_major(major: int) -> tuple[str, int]:
    """Pick the impersonate target for a Chrome major version.

    Returns ``(target, target_major)`` for the last table entry whose major is <= `major`;
    when none qualifies, the first table entry is returned.
    """
    best = _IMPERSONATE_BY_MAJOR[0]
    for candidate in _IMPERSONATE_BY_MAJOR:
        if candidate[0] <= major:
            best = candidate
    target_major, target = best
    return target, target_major
30
+
31
+
32
+ def _snap_version(headers: dict, gen_major: int, target_major: int) -> None:
33
+ """Rewrite the UA + sec-ch-ua Chrome version from gen_major to target_major (in place)."""
34
+ if gen_major == target_major:
35
+ return
36
+ if ua := headers.get("User-Agent"):
37
+ headers["User-Agent"] = re.sub(r"Chrome/\d+", f"Chrome/{target_major}", ua)
38
+ if sch := headers.get("sec-ch-ua"): # only the Chrome/Chromium brands carry the major
39
+ headers["sec-ch-ua"] = sch.replace(f'v="{gen_major}"', f'v="{target_major}"')
40
+
41
+
42
@dataclass(frozen=True)
class Profile:
    """An immutable browser identity: an impersonate target plus its header set."""

    impersonate: str  # curl_cffi impersonate target name
    _headers: dict = field(default_factory=dict)  # header name -> value

    @property
    def user_agent(self) -> str:
        """The ``User-Agent`` header, or "" when the profile has none."""
        return self._headers.get("User-Agent", "")

    def headers(self) -> dict:
        """A shallow copy of the headers, so callers can edit it without touching the profile."""
        return {**self._headers}
53
+
54
+
55
def _fallback_profile() -> Profile:
    """Fixed Chrome 131 / Linux profile paired with DEFAULT_IMPERSONATE.

    Used when browserforge is unavailable or fails to generate headers.
    """
    user_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    )
    accept = (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,*/*;q=0.8"
    )
    static_headers = {
        "User-Agent": user_agent,
        "Accept": accept,
        "Accept-Language": "pt-BR,pt;q=0.9,en;q=0.8",
        "Accept-Encoding": "gzip, deflate, br, zstd",  # curl_cffi decodes br/zstd natively
    }
    return Profile(impersonate=DEFAULT_IMPERSONATE, _headers=static_headers)
72
+
73
+
74
class ProfileGenerator:
    """Produce coherent Profiles from browserforge headers, snapped to a supported
    curl_cffi impersonate target; degrades to the static fallback profile."""

    def __init__(self, *, browser="chrome", os=("windows", "linux"), device="desktop", locale="pt-BR"):
        """Try to build a browserforge HeaderGenerator; on any failure keep None."""
        try:
            from browserforge.headers import HeaderGenerator

            generator = HeaderGenerator(browser=browser, os=os, device=device, locale=locale)
        except Exception:  # noqa: BLE001 — browserforge optional; fall back to a static profile
            generator = None
        self._hg = generator

    def generate(self) -> Profile:
        """Return a new Profile.

        Falls back to the static profile when no generator is available or generation
        raises. Otherwise the Chrome major is read from the generated User-Agent (131 if
        it can't be found), mapped to an impersonate target, and the headers are rewritten
        to that target's major.
        """
        if self._hg is None:
            return _fallback_profile()
        try:
            generated = dict(self._hg.generate())
        except Exception:  # noqa: BLE001
            return _fallback_profile()
        ua_match = re.search(r"Chrome/(\d+)", generated.get("User-Agent", ""))
        gen_major = 131 if ua_match is None else int(ua_match.group(1))
        target, target_major = _impersonate_for_major(gen_major)
        _snap_version(generated, gen_major, target_major)
        return Profile(impersonate=target, _headers=generated)
98
+
99
+
100
+ _DEFAULT_GEN: ProfileGenerator | None = None
101
+
102
+
103
def pick(pool=None, index: int = 0) -> Profile:
    """Return a newly generated Profile from the shared module-level generator.

    The generator is created on first use and reused afterwards. `pool` and `index`
    are accepted but not read by this implementation.
    """
    global _DEFAULT_GEN
    generator = _DEFAULT_GEN
    if generator is None:
        generator = _DEFAULT_GEN = ProfileGenerator()
    return generator.generate()
@@ -0,0 +1,96 @@
1
+ """Proxy leasing.
2
+
3
+ The leased egress is what the transport binds AND what any captcha solver uses — so a
4
+ risk-scored token is minted from the same IP that will submit it. Ships Null, Static,
5
+ BrightData and DataImpulse (sticky-session) providers; VPN-as-proxy comes later.
6
+ """
7
+
8
+ import os
9
+ from dataclasses import dataclass
10
+
11
+
12
+ @dataclass(frozen=True)
13
+ class ProxyLease:
14
+ url: str | None # e.g. "http://user:pass@host:port", or None for direct
15
+
16
+
17
class ProxyProvider:
    """Base interface for proxy providers: lease() hands out an egress, release() gives it back."""

    def lease(self, key: str | None = None) -> ProxyLease:
        """Return a lease for ``key``; the base class raises NotImplementedError."""
        raise NotImplementedError

    def release(self, lease: ProxyLease) -> None:  # noqa: B027 — optional hook
        """Optional hook for giving a lease back; the base implementation does nothing."""
        return None
23
+
24
+
25
class NullProxyProvider(ProxyProvider):
    """Provider that never proxies: every lease means direct egress."""

    def lease(self, key: str | None = None) -> ProxyLease:
        """Return a lease whose url is None, regardless of ``key``."""
        direct = ProxyLease(url=None)
        return direct
30
+
31
+
32
class StaticProxyProvider(ProxyProvider):
    """Cycle through a fixed proxy list in order.

    The list comes from the ``proxies`` argument or, when that is None or empty, from the
    comma-separated CRAWLERKIT_PROXIES environment variable.
    """

    def __init__(self, proxies: list[str] | None = None):
        if proxies:
            self._proxies = proxies
        else:
            raw = os.environ.get("CRAWLERKIT_PROXIES", "")
            # Strip each entry and drop the blank ones.
            self._proxies = [entry.strip() for entry in raw.split(",") if entry.strip()]
        self._i = 0  # rotation counter; its value modulo the list length picks the next proxy

    def lease(self, key: str | None = None) -> ProxyLease:
        """Return the next proxy in rotation, or a direct lease when the list is empty."""
        pool = self._proxies
        if not pool:
            return ProxyLease(url=None)
        position = self._i % len(pool)
        self._i += 1
        return ProxyLease(url=pool[position])
47
+
48
+
49
class BrightDataProxyProvider(ProxyProvider):
    """BrightData (Luminati) sticky-session proxy.

    The session id (seeded from ``key``, e.g. the crawl item) is appended to the username so
    that repeated leases with the same key produce the same proxy URL. Credentials come from
    the arguments or from the env vars BRIGHT_DATA_USER/PASS/HOST/PORT.
    NOTE: the exact username param syntax varies by zone/plan.
    """

    def __init__(self, user=None, password=None, host=None, port=None, sticky=True):
        self.user = user or os.environ.get("BRIGHT_DATA_USER", "")
        self.password = password or os.environ.get("BRIGHT_DATA_PASS", "")
        self.host = host or os.environ.get("BRIGHT_DATA_HOST", "")
        self.port = port or os.environ.get("BRIGHT_DATA_PORT", "22225")
        self.sticky = sticky
        self._n = 0  # counter used to name sessions when no key is supplied

    def lease(self, key: str | None = None) -> ProxyLease:
        """Return a lease for the BrightData gateway, or a direct lease if user/host are unset.

        The username (including the session suffix) and the password are percent-encoded so
        that characters such as ``@``, ``:``, ``/``, ``#`` or spaces in a password or ``key``
        cannot corrupt the URL structure.
        """
        from urllib.parse import quote

        if not (self.user and self.host):
            return ProxyLease(url=None)
        user = self.user
        if self.sticky:
            sid = key or f"s{self._n}"
            self._n += 1
            user = f"{self.user}-session-{sid}"
        # Sub-delimiters are legal in URL userinfo and "%" is kept so values that were already
        # percent-encoded by the caller are not double-encoded; everything else that could be
        # mistaken for URL syntax is escaped.
        safe = "%!$&'()*+,;="
        userinfo = f"{quote(str(user), safe=safe)}:{quote(str(self.password), safe=safe)}"
        return ProxyLease(url=f"http://{userinfo}@{self.host}:{self.port}")
71
+
72
+
73
class DataImpulseProxyProvider(ProxyProvider):
    """DataImpulse rotating/sticky proxy.

    Credentials come from the arguments or from the env vars DATA_IMPULSE_USER/PASS/HOST/PORT.
    Optional country targeting and a sticky session id (seeded from ``key``) are appended to
    the username.
    """

    def __init__(self, user=None, password=None, host=None, port=None, country=None, sticky=True):
        self.user = user or os.environ.get("DATA_IMPULSE_USER", "")
        self.password = password or os.environ.get("DATA_IMPULSE_PASS", "")
        self.host = host or os.environ.get("DATA_IMPULSE_HOST", "gw.dataimpulse.com")
        self.port = port or os.environ.get("DATA_IMPULSE_PORT", "823")
        self.country = country
        self.sticky = sticky
        self._n = 0  # counter used to name sessions when no key is supplied

    def lease(self, key: str | None = None) -> ProxyLease:
        """Return a lease for the DataImpulse gateway, or a direct lease if user/host are unset.

        The username (with its country/session suffixes) and the password are percent-encoded
        so that characters such as ``@``, ``:``, ``/``, ``#`` or spaces in a password or
        ``key`` cannot corrupt the URL structure. The ``;`` separator stays literal.
        """
        from urllib.parse import quote

        if not (self.user and self.host):
            return ProxyLease(url=None)
        user = self.user
        if self.country:
            user += f"__cr.{self.country}"
        if self.sticky:
            sid = key or f"s{self._n}"
            self._n += 1
            user += f";sess-{sid}"
        # Sub-delimiters (including ";") are legal in URL userinfo and "%" is kept so values
        # already percent-encoded by the caller are not double-encoded.
        safe = "%!$&'()*+,;="
        userinfo = f"{quote(str(user), safe=safe)}:{quote(str(self.password), safe=safe)}"
        return ProxyLease(url=f"http://{userinfo}@{self.host}:{self.port}")
crawlerkit/core/tls.py ADDED
@@ -0,0 +1,113 @@
1
+ """Per-host CA bundle builder with AIA repair.
2
+
3
+ Some hosts (e.g. Detran) serve only their leaf certificate and omit the intermediate, so
4
+ Python TLS verification fails with "unable to get local issuer certificate". A browser papers
5
+ over this by fetching the missing intermediate from the leaf's AIA "CA Issuers" URL; we do the
6
+ same here, generically: read the leaf's AIA, fetch + follow intermediates up to a trusted root,
7
+ and concatenate with certifi's roots. Verification stays ON. Cached per host.
8
+ """
9
+
10
+ import os
11
+ import socket
12
+ import ssl
13
+ import tempfile
14
+ import urllib.request
15
+
16
+ import certifi
17
+ from cryptography import x509
18
+ from cryptography.hazmat.primitives import serialization
19
+ from cryptography.x509.oid import AuthorityInformationAccessOID, ExtensionOID
20
+
21
+ _CACHE_DIR = os.environ.get(
22
+ "CRAWLERKIT_CA_DIR", os.path.join(tempfile.gettempdir(), "crawlerkit-ca")
23
+ )
24
+
25
+
26
def _leaf_cert(host: str, port: int = 443) -> x509.Certificate:
    """Connect to ``host:port`` over TLS without verification and return the peer certificate.

    Verification is off on purpose: the connection is only used to read the certificate the
    server presents, which is then parsed from its DER encoding.
    """
    unverified = ssl._create_unverified_context()
    with (
        socket.create_connection((host, port), timeout=30) as raw_sock,
        unverified.wrap_socket(raw_sock, server_hostname=host) as tls_sock,
    ):
        der_bytes = tls_sock.getpeercert(binary_form=True)
    return x509.load_der_x509_certificate(der_bytes)
32
+
33
+
34
def _ca_issuer_urls(cert: x509.Certificate) -> list[str]:
    """List the "CA Issuers" locations from the certificate's AIA extension.

    Returns an empty list when the certificate has no Authority Information Access extension.
    """
    try:
        aia_entries = cert.extensions.get_extension_for_oid(
            ExtensionOID.AUTHORITY_INFORMATION_ACCESS
        ).value
    except x509.ExtensionNotFound:
        return []

    issuer_urls = []
    for description in aia_entries:
        # AIA also carries other access methods; only CA Issuers entries are wanted.
        if description.access_method == AuthorityInformationAccessOID.CA_ISSUERS:
            issuer_urls.append(description.access_location.value)
    return issuer_urls
46
+
47
+
48
def _fetch_cert(url: str) -> x509.Certificate:
    """Download a certificate from ``url`` and parse it as PEM or DER.

    The payload is treated as PEM when, ignoring leading whitespace, it starts with
    ``-----BEGIN``; otherwise it is parsed as DER.
    """
    with urllib.request.urlopen(url, timeout=30) as response:  # noqa: S310 (public CA cert, http ok)
        payload = response.read()
    is_pem = payload.lstrip().startswith(b"-----BEGIN")
    loader = x509.load_pem_x509_certificate if is_pem else x509.load_der_x509_certificate
    return loader(payload)
54
+
55
+
56
def build_ca_bundle(host: str, port: int = 443, *, force: bool = False, max_depth: int = 4) -> str:
    """Return a path to a CA bundle = trusted roots + any intermediates `host` omits.

    The bundle is cached as ``<host>_<port>.pem`` in the cache directory and reused on later
    calls unless ``force`` is true. Starting from the certificate the host presents, the first
    http(s) "CA Issuers" URL of each certificate is followed for at most ``max_depth`` hops,
    stopping at a self-signed certificate, a missing URL, or a URL already visited.

    Best-effort: if AIA repair fails, falls back to certifi roots only (plus whatever
    certificates were fetched before the failure).

    NOTE(review): a roots-only result produced by a transient failure is cached like any
    other result and is only rebuilt with ``force=True`` — confirm this is intended.
    NOTE(review): fetched certificates are added to the bundle without checking that they
    chain to a certifi root; depending on how the TLS backend treats non-root entries in a
    CA file this may widen trust — confirm.
    """
    os.makedirs(_CACHE_DIR, exist_ok=True)
    path = os.path.join(_CACHE_DIR, f"{host}_{port}.pem")
    if os.path.exists(path) and not force:
        return path

    # Context manager so the certifi file handle is closed deterministically.
    with open(certifi.where(), encoding="utf-8") as roots_file:
        roots = roots_file.read()

    extra: list[str] = []
    try:
        cert = _leaf_cert(host, port)
        seen: set[str] = set()
        for _ in range(max_depth):
            urls = [u for u in _ca_issuer_urls(cert) if u.startswith(("http://", "https://"))]
            if not urls or urls[0] in seen:
                break
            seen.add(urls[0])
            cert = _fetch_cert(urls[0])
            extra.append(cert.public_bytes(serialization.Encoding.PEM).decode())
            if cert.issuer == cert.subject:  # reached a self-signed root
                break
    except Exception:  # noqa: BLE001 — never let CA discovery crash a crawl; roots-only fallback
        pass

    # Write to a private temp name and rename into place, so a concurrent caller that passes
    # the exists() check above never reads a partially written bundle.
    tmp_path = f"{path}.{os.getpid()}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            for pem in (roots, *extra):
                f.write(pem if pem.endswith("\n") else pem + "\n")
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a stray temp file behind if writing or renaming failed.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
    return path
88
+
89
+
90
def client_cert_from_pfx(pfx_path: str, password: str | bytes | None, out_path: str | None = None) -> str:
    """Load an ICP-Brasil / PKCS#12 `.pfx` and write a combined PEM for curl_cffi's `cert=`.

    The PEM contains, in order, the unencrypted private key, the certificate and any extra
    CA certificates stored in the PKCS#12 file (each part only if present). Port of
    alexandria/pfx_to_pem via `cryptography` (no pyOpenSSL).

    Args:
        pfx_path: path of the PKCS#12 file to read.
        password: PKCS#12 password; a ``str`` is encoded to bytes, ``None`` means no password.
        out_path: where to write the PEM; defaults to ``<basename of pfx_path>.pem`` inside
            the cache directory.

    Returns:
        The path of the written PEM. The file is created with mode 600 because it holds the
        private key.
    """
    # Import the submodule explicitly so that access to it does not depend on some other
    # module having imported it first.
    from cryptography.hazmat.primitives.serialization import pkcs12

    if isinstance(password, str):
        password = password.encode()
    with open(pfx_path, "rb") as f:
        data = f.read()
    key, cert, extra = pkcs12.load_key_and_certificates(data, password)
    os.makedirs(_CACHE_DIR, exist_ok=True)
    out_path = out_path or os.path.join(_CACHE_DIR, os.path.basename(pfx_path) + ".pem")

    # Create the file already restricted to the owner, so the key is never on disk with
    # wider permissions (chmod after writing would leave a readable window).
    fd = os.open(out_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "wb") as f:
        # The mode passed to os.open only applies to newly created files; tighten a
        # pre-existing file before any key material is written to it.
        os.chmod(out_path, 0o600)
        if key is not None:
            f.write(key.private_bytes(
                serialization.Encoding.PEM,
                serialization.PrivateFormat.TraditionalOpenSSL,
                serialization.NoEncryption(),
            ))
        if cert is not None:
            f.write(cert.public_bytes(serialization.Encoding.PEM))
        for ca in (extra or []):
            f.write(ca.public_bytes(serialization.Encoding.PEM))
    return out_path
@@ -0,0 +1,76 @@
1
+ """Fingerprinted HTTP transport — the only HTTP path.
2
+
3
+ A `curl_cffi` Session bound to one Profile (TLS/JA3 + UA + header order) + a proxy lease +
4
+ per-host verified CA bundle (with AIA repair). TLS/JA3 fingerprint is a property of THIS client,
5
+ so it is the foundation, not a plugin. `requests` is intentionally not used (giveaway fingerprint).
6
+ """
7
+
8
+ import os
9
+ import random
10
+ import time
11
+ from urllib.parse import urlparse
12
+
13
+ import structlog
14
+ from curl_cffi import requests as cffi
15
+ from curl_cffi.requests import exceptions as _cffi_exc
16
+
17
+ from . import tls
18
+ from .errors import TransientError
19
+ from .identity import Profile
20
+ from .proxy import ProxyLease
21
+
22
+ log = structlog.get_logger(__name__)
23
+
24
+
25
class Transport:
    """A curl_cffi session bound to one Profile, one proxy lease and per-host CA bundles.

    Args:
        profile: supplies the impersonate target and the default headers for the session.
        proxy: lease whose ``url`` (if any) is used for both http and https traffic.
        verify: ``False`` disables TLS verification; otherwise a per-host CA bundle is built
            and used for verification.
        client_cert: PEM (cert+key) for ICP-Brasil mutual TLS, or None.
        min_interval: minimum seconds between requests; when None it is read from the
            CRAWLERKIT_MIN_INTERVAL env var (default 0 = no throttling).

    The instance can be used as a context manager; leaving the block closes the session.
    """

    def __init__(self, profile: Profile, proxy: ProxyLease, *, verify: bool = True,
                 client_cert: str | None = None, min_interval: float | None = None):
        self.profile = profile
        self.proxy = proxy
        self.verify = verify
        self.client_cert = client_cert  # PEM (cert+key) for ICP-Brasil mutual TLS, or None
        # politeness: minimum seconds between requests (+ up to 25% jitter). 0/None = off.
        self.min_interval = float(
            min_interval if min_interval is not None else os.environ.get("CRAWLERKIT_MIN_INTERVAL", 0)
        )
        self._last = 0.0  # time.monotonic() value recorded by the last throttled request
        self._ca: dict[str, str] = {}  # hostname -> CA bundle path, built on first use
        self._session = cffi.Session(impersonate=profile.impersonate)
        self._session.headers.update(profile.headers())
        if proxy.url:
            self._session.proxies = {"http": proxy.url, "https": proxy.url}

    def _verify_for(self, url: str):
        """Return the ``verify`` value for ``url``: False, or the host's CA bundle path."""
        if self.verify is False:
            return False
        host = urlparse(url).hostname or ""
        if host not in self._ca:
            self._ca[host] = tls.build_ca_bundle(host)
        return self._ca[host]

    def _throttle(self) -> None:
        """Sleep as needed so requests are at least ``min_interval`` seconds apart."""
        if self.min_interval <= 0:
            return
        wait = self.min_interval - (time.monotonic() - self._last)
        if wait > 0:
            # Jitter (up to 25% of the interval) is only added when a wait is needed at all.
            time.sleep(wait + random.uniform(0, self.min_interval * 0.25))
        self._last = time.monotonic()

    def request(self, method: str, url: str, **kw):
        """Send a request through the session, filling in defaults the caller did not pass.

        Defaults: ``verify`` (per-host bundle), ``impersonate`` (from the profile),
        ``timeout`` (30) and, when a client certificate is configured, ``cert``.

        Raises:
            TransientError: when curl_cffi raises ``RequestsError`` (network/curl failure).
        """
        # Resolve the CA bundle only when the caller did not choose `verify` themselves;
        # building a bundle can involve network I/O, so it must not run needlessly.
        if "verify" not in kw:
            kw["verify"] = self._verify_for(url)
        kw.setdefault("impersonate", self.profile.impersonate)
        kw.setdefault("timeout", 30)
        if self.client_cert:
            kw.setdefault("cert", self.client_cert)
        self._throttle()
        log.debug("http", method=method, url=url, proxy=bool(self.proxy.url))
        try:
            return self._session.request(method, url, **kw)
        except _cffi_exc.RequestsError as e:  # network/curl failure -> transient (retryable)
            raise TransientError(f"{method} {url}: {e}") from e

    def get(self, url: str, **kw):
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def post(self, url: str, **kw):
        """Shorthand for ``request("POST", url, **kw)``."""
        return self.request("POST", url, **kw)

    def close(self) -> None:
        """Close the underlying curl_cffi session and release its connections."""
        self._session.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()
@@ -0,0 +1,80 @@
1
+ Metadata-Version: 2.4
2
+ Name: crawlerkit-core
3
+ Version: 0.1.0
4
+ Summary: Browserless crawler base: curl_cffi transport, TLS/AIA, identity, proxy, captcha, BaseCrawler/BaseParser.
5
+ Author-email: Lucas Caovilla <lucasgrisac@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/lucascaovilla/crawlerkit
8
+ Project-URL: Repository, https://github.com/lucascaovilla/crawlerkit
9
+ Project-URL: Documentation, https://github.com/lucascaovilla/crawlerkit#readme
10
+ Project-URL: Issues, https://github.com/lucascaovilla/crawlerkit/issues
11
+ Keywords: crawler,scraping,curl_cffi,tls,fingerprint,captcha,browserless
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Internet :: WWW/HTTP
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Typing :: Typed
22
+ Requires-Python: >=3.11
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: curl_cffi>=0.7
26
+ Requires-Dist: browserforge>=1.2
27
+ Requires-Dist: cryptography>=42
28
+ Requires-Dist: certifi>=2024.0
29
+ Requires-Dist: selectolax>=0.3
30
+ Requires-Dist: lxml>=5.0
31
+ Requires-Dist: beautifulsoup4>=4.12
32
+ Requires-Dist: structlog>=24.1
33
+ Requires-Dist: tenacity>=8.2
34
+ Requires-Dist: weasyprint>=60
35
+ Provides-Extra: dev
36
+ Requires-Dist: pytest>=8.0; extra == "dev"
37
+ Requires-Dist: ruff>=0.5; extra == "dev"
38
+ Requires-Dist: build>=1.2; extra == "dev"
39
+ Requires-Dist: twine>=5.0; extra == "dev"
40
+ Requires-Dist: commitizen>=3.27; extra == "dev"
41
+ Provides-Extra: docs
42
+ Requires-Dist: mkdocs-material>=9.5; extra == "docs"
43
+ Requires-Dist: mkdocstrings[python]>=0.25; extra == "docs"
44
+ Dynamic: license-file
45
+
46
+ # crawlerkit-core
47
+
48
+ [![PyPI version](https://img.shields.io/pypi/v/crawlerkit-core.svg)](https://pypi.org/project/crawlerkit-core/)
49
+ [![Python versions](https://img.shields.io/pypi/pyversions/crawlerkit-core.svg)](https://pypi.org/project/crawlerkit-core/)
50
+ [![CI](https://github.com/lucascaovilla/crawlerkit/actions/workflows/ci.yml/badge.svg)](https://github.com/lucascaovilla/crawlerkit/actions/workflows/ci.yml)
51
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
52
+
53
+ A **standalone, browserless** crawler base (`crawlerkit.core`): fingerprinted **curl_cffi** transport,
54
+ per-host TLS with **AIA repair** + `.pfx` client certs, **browserforge** identity (UA snapped to the
55
+ impersonate target), proxy providers, a pluggable **captcha** registry, an error taxonomy with
56
+ retry+rotation, and the `BaseCrawler.flow()` / `BaseParser.parse()` hooks. Zero non-PyPI dependencies —
57
+ `parse()` returns **your own type**, not one the library dictates.
58
+
59
+ ## Install
60
+
61
+ ```bash
62
+ pip install crawlerkit-core
63
+ ```
64
+
65
+ ## Use
66
+
67
+ ```python
68
+ from crawlerkit.core import BaseCrawler, BaseParser, RawResponse, Transport, Profile
69
+ from crawlerkit.core.captcha import default_registry, McaptchaPowSolver, mcaptcha_hint
70
+ from crawlerkit.core.proxy import StaticProxyProvider, BrightDataProxyProvider
71
+ from crawlerkit.core.errors import BlockedError, TransientError, raise_for_block
72
+ ```
73
+
74
+ **HTTP is curl_cffi only — `requests` is never used.** Deps: curl_cffi, browserforge, cryptography,
75
+ certifi, selectolax, lxml, beautifulsoup4, weasyprint, structlog, tenacity.
76
+
77
+ **Build a crawler:** [GETTING_STARTED.md](GETTING_STARTED.md). **Run the demos:**
78
+ [`examples/`](examples/) (`quotes.py` — a full crawl+parse; `fingerprint_demo.py` — identity proof).
79
+ Reference: [`docs/`](docs/) (identity, transport-tls, proxy, captcha, cracking-govbr-turnstile, errors,
80
+ api). License: MIT.
@@ -0,0 +1,21 @@
1
+ crawlerkit/core/__init__.py,sha256=aU6V1P8kGOOVFP6nWwgUSBacrJoucW1m3BAlIp7Mfps,230
2
+ crawlerkit/core/base_crawler.py,sha256=xGcQL8kToY1ENl7Q9FFmzFRwxiVtWhuqiEycKcI3Yfc,4666
3
+ crawlerkit/core/base_parser.py,sha256=6Heku1NZ6eTaNeMaFN9hRgLikcETZxIEeVT2oNvKIcw,2900
4
+ crawlerkit/core/cookies.py,sha256=PndaPAjN29m5LpIq0jq2WSNuZugteu4vVmX5v8Inu3g,1449
5
+ crawlerkit/core/errors.py,sha256=oogfj8t6I7SMEHOYirE6r4FRIMI9_TtUlLrJBpziL0A,1744
6
+ crawlerkit/core/identity.py,sha256=2Ga0-GIf3mcnkwCZSPQuRXh1NtuvQhoQ3-DG91b08Ek,4108
7
+ crawlerkit/core/proxy.py,sha256=FPfg45I8PiWI8s-I398_d3xiMJED68w7EokAHhUcqOk,3694
8
+ crawlerkit/core/tls.py,sha256=0xXGsDbR25kdP2n3C_5Mw6r7yazorpgHHCkvy1QMgkM,4550
9
+ crawlerkit/core/transport.py,sha256=i8QJsbY7TunCjvNrzHF44c3HvXd5QZq78dR2lxJG0ao,2913
10
+ crawlerkit/core/captcha/__init__.py,sha256=h5WEzZQ7De_ESGQy-T_lABYyQyhtDx1pEjFJr3dpoHI,843
11
+ crawlerkit/core/captcha/base.py,sha256=ZNwqyOGx2ibG6XUxKpDdyhkhz47S_CUB01WooWRqS2M,2830
12
+ crawlerkit/core/captcha/govbr.py,sha256=4OjczX6K9lhzE_ojpkS0ngfZKqIejJ_j0SFfCMfu4j8,1905
13
+ crawlerkit/core/captcha/llm_image.py,sha256=E3LnA_VlKpPXei2T4HsmaQlbo9cdOqigC-fFv1bSU4A,2081
14
+ crawlerkit/core/captcha/mcaptcha.py,sha256=9fvRz1XgZnIArKPoZTgiVTqYazS0GitIFFxV8gyMyhI,5372
15
+ crawlerkit/core/captcha/token_adapters.py,sha256=-GzRC8vMqNWINzT3tcRQln2lIDAC9L429W_eMKTTbyE,2747
16
+ crawlerkit/core/captcha/turnstile.py,sha256=G4MeD01QlFv0faItUtbFnmMkrQcGplAiVyPJCQZmcPQ,1925
17
+ crawlerkit_core-0.1.0.dist-info/licenses/LICENSE,sha256=kAsK6_g7uDe4qOBRw-suoniIN_7YxJsmXCZs-XoqHfE,1071
18
+ crawlerkit_core-0.1.0.dist-info/METADATA,sha256=LvKXmYcJ-GhsNK1nRZ3bxY3l_i_RkcdTJWddYm_UuNs,3738
19
+ crawlerkit_core-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
20
+ crawlerkit_core-0.1.0.dist-info/top_level.txt,sha256=vy8AhdTkmxRHsuY8cQY-yWv1bWpk7JkaAMvF3JuqinU,11
21
+ crawlerkit_core-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Lucas Caovilla
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ crawlerkit