crawlerkit-core 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. crawlerkit_core-0.1.0/LICENSE +21 -0
  2. crawlerkit_core-0.1.0/PKG-INFO +80 -0
  3. crawlerkit_core-0.1.0/README.md +35 -0
  4. crawlerkit_core-0.1.0/crawlerkit/core/__init__.py +6 -0
  5. crawlerkit_core-0.1.0/crawlerkit/core/base_crawler.py +126 -0
  6. crawlerkit_core-0.1.0/crawlerkit/core/base_parser.py +77 -0
  7. crawlerkit_core-0.1.0/crawlerkit/core/captcha/__init__.py +34 -0
  8. crawlerkit_core-0.1.0/crawlerkit/core/captcha/base.py +90 -0
  9. crawlerkit_core-0.1.0/crawlerkit/core/captcha/govbr.py +40 -0
  10. crawlerkit_core-0.1.0/crawlerkit/core/captcha/llm_image.py +46 -0
  11. crawlerkit_core-0.1.0/crawlerkit/core/captcha/mcaptcha.py +142 -0
  12. crawlerkit_core-0.1.0/crawlerkit/core/captcha/token_adapters.py +77 -0
  13. crawlerkit_core-0.1.0/crawlerkit/core/captcha/turnstile.py +40 -0
  14. crawlerkit_core-0.1.0/crawlerkit/core/cookies.py +40 -0
  15. crawlerkit_core-0.1.0/crawlerkit/core/errors.py +48 -0
  16. crawlerkit_core-0.1.0/crawlerkit/core/identity.py +108 -0
  17. crawlerkit_core-0.1.0/crawlerkit/core/proxy.py +96 -0
  18. crawlerkit_core-0.1.0/crawlerkit/core/tls.py +113 -0
  19. crawlerkit_core-0.1.0/crawlerkit/core/transport.py +76 -0
  20. crawlerkit_core-0.1.0/crawlerkit_core.egg-info/PKG-INFO +80 -0
  21. crawlerkit_core-0.1.0/crawlerkit_core.egg-info/SOURCES.txt +27 -0
  22. crawlerkit_core-0.1.0/crawlerkit_core.egg-info/dependency_links.txt +1 -0
  23. crawlerkit_core-0.1.0/crawlerkit_core.egg-info/requires.txt +21 -0
  24. crawlerkit_core-0.1.0/crawlerkit_core.egg-info/top_level.txt +1 -0
  25. crawlerkit_core-0.1.0/pyproject.toml +90 -0
  26. crawlerkit_core-0.1.0/setup.cfg +4 -0
  27. crawlerkit_core-0.1.0/tests/test_base_parser.py +37 -0
  28. crawlerkit_core-0.1.0/tests/test_captcha_registry.py +20 -0
  29. crawlerkit_core-0.1.0/tests/test_errors.py +32 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Lucas Caovilla
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,80 @@
1
+ Metadata-Version: 2.4
2
+ Name: crawlerkit-core
3
+ Version: 0.1.0
4
+ Summary: Browserless crawler base: curl_cffi transport, TLS/AIA, identity, proxy, captcha, BaseCrawler/BaseParser.
5
+ Author-email: Lucas Caovilla <lucasgrisac@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/lucascaovilla/crawlerkit
8
+ Project-URL: Repository, https://github.com/lucascaovilla/crawlerkit
9
+ Project-URL: Documentation, https://github.com/lucascaovilla/crawlerkit#readme
10
+ Project-URL: Issues, https://github.com/lucascaovilla/crawlerkit/issues
11
+ Keywords: crawler,scraping,curl_cffi,tls,fingerprint,captcha,browserless
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Internet :: WWW/HTTP
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Typing :: Typed
22
+ Requires-Python: >=3.11
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: curl_cffi>=0.7
26
+ Requires-Dist: browserforge>=1.2
27
+ Requires-Dist: cryptography>=42
28
+ Requires-Dist: certifi>=2024.0
29
+ Requires-Dist: selectolax>=0.3
30
+ Requires-Dist: lxml>=5.0
31
+ Requires-Dist: beautifulsoup4>=4.12
32
+ Requires-Dist: structlog>=24.1
33
+ Requires-Dist: tenacity>=8.2
34
+ Requires-Dist: weasyprint>=60
35
+ Provides-Extra: dev
36
+ Requires-Dist: pytest>=8.0; extra == "dev"
37
+ Requires-Dist: ruff>=0.5; extra == "dev"
38
+ Requires-Dist: build>=1.2; extra == "dev"
39
+ Requires-Dist: twine>=5.0; extra == "dev"
40
+ Requires-Dist: commitizen>=3.27; extra == "dev"
41
+ Provides-Extra: docs
42
+ Requires-Dist: mkdocs-material>=9.5; extra == "docs"
43
+ Requires-Dist: mkdocstrings[python]>=0.25; extra == "docs"
44
+ Dynamic: license-file
45
+
46
+ # crawlerkit-core
47
+
48
+ [![PyPI version](https://img.shields.io/pypi/v/crawlerkit-core.svg)](https://pypi.org/project/crawlerkit-core/)
49
+ [![Python versions](https://img.shields.io/pypi/pyversions/crawlerkit-core.svg)](https://pypi.org/project/crawlerkit-core/)
50
+ [![CI](https://github.com/lucascaovilla/crawlerkit/actions/workflows/ci.yml/badge.svg)](https://github.com/lucascaovilla/crawlerkit/actions/workflows/ci.yml)
51
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
52
+
53
+ A **standalone, browserless** crawler base (`crawlerkit.core`): fingerprinted **curl_cffi** transport,
54
+ per-host TLS with **AIA repair** + `.pfx` client certs, **browserforge** identity (UA snapped to the
55
+ impersonate target), proxy providers, a pluggable **captcha** registry, an error taxonomy with
56
+ retry+rotation, and the `BaseCrawler.flow()` / `BaseParser.parse()` hooks. Zero non-PyPI dependencies —
57
+ `parse()` returns **your own type**, not one the library dictates.
58
+
59
+ ## Install
60
+
61
+ ```bash
62
+ pip install crawlerkit-core
63
+ ```
64
+
65
+ ## Use
66
+
67
+ ```python
68
+ from crawlerkit.core import BaseCrawler, BaseParser, RawResponse, Transport, Profile
69
+ from crawlerkit.core.captcha import default_registry, McaptchaPowSolver, mcaptcha_hint
70
+ from crawlerkit.core.proxy import StaticProxyProvider, BrightDataProxyProvider
71
+ from crawlerkit.core.errors import BlockedError, TransientError, raise_for_block
72
+ ```
73
+
74
+ **HTTP is curl_cffi only — `requests` is never used.** Deps: curl_cffi, browserforge, cryptography,
75
+ certifi, selectolax, lxml, beautifulsoup4, weasyprint, structlog, tenacity.
76
+
77
+ **Build a crawler:** [GETTING_STARTED.md](GETTING_STARTED.md). **Run the demos:**
78
+ [`examples/`](examples/) (`quotes.py` — a full crawl+parse; `fingerprint_demo.py` — identity proof).
79
+ Reference: [`docs/`](docs/) (identity, transport-tls, proxy, captcha, cracking-govbr-turnstile, errors,
80
+ api). License: MIT.
@@ -0,0 +1,35 @@
1
+ # crawlerkit-core
2
+
3
+ [![PyPI version](https://img.shields.io/pypi/v/crawlerkit-core.svg)](https://pypi.org/project/crawlerkit-core/)
4
+ [![Python versions](https://img.shields.io/pypi/pyversions/crawlerkit-core.svg)](https://pypi.org/project/crawlerkit-core/)
5
+ [![CI](https://github.com/lucascaovilla/crawlerkit/actions/workflows/ci.yml/badge.svg)](https://github.com/lucascaovilla/crawlerkit/actions/workflows/ci.yml)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
7
+
8
+ A **standalone, browserless** crawler base (`crawlerkit.core`): fingerprinted **curl_cffi** transport,
9
+ per-host TLS with **AIA repair** + `.pfx` client certs, **browserforge** identity (UA snapped to the
10
+ impersonate target), proxy providers, a pluggable **captcha** registry, an error taxonomy with
11
+ retry+rotation, and the `BaseCrawler.flow()` / `BaseParser.parse()` hooks. Zero non-PyPI dependencies —
12
+ `parse()` returns **your own type**, not one the library dictates.
13
+
14
+ ## Install
15
+
16
+ ```bash
17
+ pip install crawlerkit-core
18
+ ```
19
+
20
+ ## Use
21
+
22
+ ```python
23
+ from crawlerkit.core import BaseCrawler, BaseParser, RawResponse, Transport, Profile
24
+ from crawlerkit.core.captcha import default_registry, McaptchaPowSolver, mcaptcha_hint
25
+ from crawlerkit.core.proxy import StaticProxyProvider, BrightDataProxyProvider
26
+ from crawlerkit.core.errors import BlockedError, TransientError, raise_for_block
27
+ ```
28
+
29
+ **HTTP is curl_cffi only — `requests` is never used.** Deps: curl_cffi, browserforge, cryptography,
30
+ certifi, selectolax, lxml, beautifulsoup4, weasyprint, structlog, tenacity.
31
+
32
+ **Build a crawler:** [GETTING_STARTED.md](GETTING_STARTED.md). **Run the demos:**
33
+ [`examples/`](examples/) (`quotes.py` — a full crawl+parse; `fingerprint_demo.py` — identity proof).
34
+ Reference: [`docs/`](docs/) (identity, transport-tls, proxy, captcha, cracking-govbr-turnstile, errors,
35
+ api). License: MIT.
@@ -0,0 +1,6 @@
1
+ from .base_crawler import BaseCrawler, RawResponse
2
+ from .base_parser import BaseParser
3
+ from .identity import Profile
4
+ from .transport import Transport
5
+
6
+ __all__ = ["BaseCrawler", "BaseParser", "RawResponse", "Transport", "Profile"]
@@ -0,0 +1,126 @@
1
+ """BaseCrawler — the crawl stage. A new target fills one hook: flow()."""
2
+
3
+ import random
4
+ import time
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass, field
7
+
8
+ import structlog
9
+ from bs4 import BeautifulSoup
10
+
11
+ from .captcha.base import CaptchaRegistry, Challenge, default_registry
12
+ from .errors import BlockedError, PermanentError, TransientError
13
+ from .identity import Profile, pick
14
+ from .proxy import NullProxyProvider, ProxyProvider
15
+ from .transport import Transport
16
+
17
+ log = structlog.get_logger(__name__)
18
+
19
+
20
@dataclass
class RawResponse:
    """Plain container for what a crawl returns: URL, status, body text and headers.

    ``headers`` is optional; each instance gets its own empty dict when none is given.
    """

    url: str
    status: int
    text: str
    headers: dict = field(default_factory=dict)  # never shared between instances
26
+
27
+
28
class BaseCrawler(ABC):
    """Owns transport+identity+proxy+captcha; subclass implements only flow().

    No business logic, no parsing here — crawl and return the raw response.
    """

    captcha_hint: Challenge | None = None  # known sitekey when the widget isn't inline

    def __init__(
        self,
        *,
        proxy_provider: ProxyProvider | None = None,
        registry: CaptchaRegistry | None = None,
        verify: bool = True,
        profile: Profile | None = None,
        client_cert: str | None = None,
        max_attempts: int = 3,
    ):
        """Store the configuration and build the first identity/proxy/transport.

        Args:
            proxy_provider: Source of proxy leases; ``NullProxyProvider()`` when omitted.
            registry: Captcha registry; ``default_registry()`` when omitted.
            verify: Forwarded to ``Transport`` as ``verify``.
            profile: Fixed identity. When omitted, a new one is chosen with ``pick()``
                every time the transport is (re)built.
            client_cert: Forwarded to ``Transport`` as ``client_cert``.
            max_attempts: Upper bound on the number of ``flow()`` calls ``run()`` makes.
        """
        self._proxy_provider = proxy_provider or NullProxyProvider()
        self._verify = verify
        self._client_cert = client_cert
        self._fixed_profile = profile
        self.max_attempts = max_attempts
        self.registry = registry or default_registry()
        self._build_transport()

    def _build_transport(self) -> None:
        """(Re)create identity + proxy lease + transport — on init and on each rotation."""
        self.profile = self._fixed_profile or pick()
        self.proxy = self._proxy_provider.lease()
        self.transport = Transport(
            self.profile, self.proxy, verify=self._verify, client_cert=self._client_cert
        )

    def _rotate(self) -> None:
        """Log the rotation and rebuild identity, proxy lease and transport."""
        log.info("rotate_identity_proxy")
        self._build_transport()

    # --- helpers exposed to flow() ---
    def get(self, url: str, **kw):
        """Delegate to ``self.transport.get`` with the same arguments."""
        return self.transport.get(url, **kw)

    def post(self, url: str, **kw):
        """Delegate to ``self.transport.post`` with the same arguments."""
        return self.transport.post(url, **kw)

    def solve_captcha(self, source) -> str | None:
        """detect+solve; returns a token, None (no challenge), or raises UnsupportedCaptcha."""
        solved = self.registry.solve(source, self.transport, hint=self.captcha_hint)
        return solved.token if solved else None

    def hidden_fields(self, html: str) -> dict:
        """All hidden inputs of the form (JSF ViewState / WebForms __VIEWSTATE postback state).

        Only the first ``<form>`` is inspected; when the document has no form, the
        whole document is scanned instead. Inputs without a ``name`` are skipped and
        a missing ``value`` becomes ``""``.

        Returns:
            Mapping of input name to value for every input that is ``type=hidden``
            (compared case-insensitively) or whose name contains "viewstate" in any
            letter case.
        """
        try:
            soup = BeautifulSoup(html, "lxml")
        except Exception:  # noqa: BLE001
            # Best-effort fallback to the parser bundled with the standard library.
            soup = BeautifulSoup(html, "html.parser")
        form = soup.find("form") if soup else None
        scope = form or soup
        hidden: dict[str, str] = {}
        if scope:
            for inp in scope.find_all("input"):
                name = inp.get("name")
                if not name:
                    continue
                # HTML matches `type` keywords ASCII case-insensitively, so
                # type="HIDDEN" must be treated the same as type="hidden".
                is_hidden = (inp.get("type") or "").lower() == "hidden"
                if is_hidden or "VIEWSTATE" in name.upper():
                    hidden[name] = inp.get("value", "")
        return hidden

    # --- the only required hook ---
    @abstractmethod
    def flow(self, params: dict) -> RawResponse:
        """Perform the target-specific crawl and return its raw response."""
        ...

    def run(self, params: dict) -> RawResponse:
        """Run flow() with retry + rotation. TransientError -> back off, retry (same identity);
        BlockedError -> rotate identity+proxy, then retry; PermanentError -> fail fast.

        Raises:
            PermanentError: Re-raised immediately, without retrying.
            BlockedError / TransientError: The last one seen, once ``max_attempts``
                attempts are used up.
            RuntimeError: When no attempt was made at all (``max_attempts`` below 1).
        """
        last: Exception | None = None
        for attempt in range(1, self.max_attempts + 1):
            try:
                log.info("crawl_start", crawler=type(self).__name__, attempt=attempt)
                raw = self.flow(params)
                log.info("crawl_done", status=raw.status, bytes=len(raw.text))
                return raw
            except PermanentError:
                raise
            except BlockedError as e:
                last = e
                log.warning("blocked", attempt=attempt, error=str(e))
                # No rotation or sleep after the final attempt: nothing would use them.
                if attempt < self.max_attempts:
                    self._rotate()
                    self._backoff(attempt)
            except TransientError as e:
                last = e
                log.warning("transient", attempt=attempt, error=str(e))
                if attempt < self.max_attempts:
                    self._backoff(attempt)
        raise last or RuntimeError("crawl failed with no captured error")

    @staticmethod
    def _backoff(attempt: int, cap: float = 30.0) -> None:
        """Sleep ``2**attempt`` seconds plus up to one second of jitter, at most ``cap``."""
        time.sleep(min(2.0**attempt + random.uniform(0, 1), cap))
@@ -0,0 +1,77 @@
1
+ """BaseParser — the parse stage. A new target fills one hook: parse().
2
+
3
+ Pure + item-local: no network beyond fetching static assets for the optional PDF, no
4
+ cross-item state. Operates on the RawResponse the crawler returned (or a replayed one).
5
+ """
6
+
7
+ from abc import ABC, abstractmethod
8
+ from typing import Generic, TypeVar
9
+ from urllib.parse import urlparse
10
+
11
+ import structlog
12
+
13
+ from .base_crawler import RawResponse
14
+
15
+ log = structlog.get_logger(__name__)
16
+
17
#: What ``parse()`` yields — your own model, a dataclass, a ``dict``, anything.
#: crawlerkit-core stays model-agnostic: it never dictates the output type.
T = TypeVar("T")
20
+
21
# Print fixups: hide leftover form inputs, landscape, fit wide tables.
# Passed to WeasyPrint as an extra stylesheet by render_pdf().
_PDF_FIXUP_CSS = """
input { display: none !important; }
@page { size: A4 landscape; margin: 1.2cm; }
table { font-size: 9px; table-layout: fixed; width: 100%; }
td, th { overflow-wrap: anywhere; }
"""
28
+
29
+
30
def render_pdf(html: str, base_url: str) -> bytes:
    """Render ``html`` to PDF bytes with WeasyPrint (no browser, no `requests`).

    http(s) assets are downloaded with curl_cffi and verified against the per-host CA
    bundle from ``tls.build_ca_bundle``; any other URL goes to WeasyPrint's
    ``default_url_fetcher``. An asset that fails to download is logged and replaced by
    an empty ``text/plain`` body so the PDF is still produced. ``_PDF_FIXUP_CSS`` is
    applied as an additional stylesheet.
    """
    from curl_cffi import requests as cffi
    from weasyprint import CSS, HTML, default_url_fetcher

    from . import tls

    def fetcher(url: str, **kw):
        is_remote = url.startswith(("http://", "https://"))
        if not is_remote:
            return default_url_fetcher(url, **kw)

        host = urlparse(url).hostname or ""
        try:
            resp = cffi.get(
                url, verify=tls.build_ca_bundle(host), timeout=30, impersonate="chrome131"
            )
            content_type = resp.headers.get("content-type", "")
            asset = {"string": resp.content, "redirected_url": str(resp.url)}
            # Keep only the media type, dropping parameters such as "; charset=...".
            mime = content_type.split(";")[0].strip()
            if mime:
                asset["mime_type"] = mime
            return asset
        except Exception as e:  # noqa: BLE001 — a missing asset must not kill the PDF
            log.warning("pdf_asset_skipped", url=url, error=str(e))
            return {"string": b"", "mime_type": "text/plain"}

    document = HTML(string=html, base_url=base_url, url_fetcher=fetcher)
    return document.write_pdf(stylesheets=[CSS(string=_PDF_FIXUP_CSS)])
57
+
58
+
59
class BaseParser(ABC, Generic[T]):
    """Parse stage; the item type is chosen by the subclass, not by the library.

    Declare it when subclassing — ``class MyParser(BaseParser[MyModel])`` or
    ``BaseParser[dict]`` — and implement ``parse()`` to return ``list[T]``.
    """

    # Set to False on a subclass (or instance) to make pdf() return None.
    render_pdf_enabled: bool = True

    @abstractmethod
    def parse(self, raw: RawResponse) -> list[T]:
        """Turn the raw response into a list of items."""
        ...

    def pdf(self, raw: RawResponse) -> bytes | None:
        """Render ``raw.text`` to PDF with ``raw.url`` as base URL, or None when disabled."""
        if self.render_pdf_enabled:
            return render_pdf(raw.text, base_url=raw.url)
        return None

    def run(self, raw: RawResponse) -> tuple[list[T], bytes | None]:
        """Parse first, log the item count, then render the PDF; return both results."""
        items = self.parse(raw)
        log.info("parse_done", count=len(items))
        pdf_bytes = self.pdf(raw)
        return items, pdf_bytes
@@ -0,0 +1,34 @@
1
+ from .base import (
2
+ CaptchaRegistry,
3
+ CaptchaServiceError,
4
+ Challenge,
5
+ Solved,
6
+ UnsupportedCaptcha,
7
+ default_registry,
8
+ )
9
+ from .govbr import GovBrSolver
10
+ from .llm_image import LlmImageSolver
11
+ from .mcaptcha import McaptchaPowSolver, mcaptcha_hint
12
+ from .token_adapters import HcaptchaSolver, RecaptchaV2Solver, RecaptchaV3Solver, TokenProvider
13
+ from .turnstile import TurnstileSolver
14
+
15
+ __all__ = [
16
+ "Challenge",
17
+ "Solved",
18
+ "UnsupportedCaptcha",
19
+ "CaptchaServiceError",
20
+ "CaptchaRegistry",
21
+ "default_registry",
22
+ # own solvers
23
+ "McaptchaPowSolver",
24
+ "mcaptcha_hint",
25
+ "LlmImageSolver",
26
+ # browserless stubs (TODO crack)
27
+ "TurnstileSolver",
28
+ "GovBrSolver",
29
+ # optional token-adapters (opt-in)
30
+ "TokenProvider",
31
+ "RecaptchaV2Solver",
32
+ "RecaptchaV3Solver",
33
+ "HcaptchaSolver",
34
+ ]
@@ -0,0 +1,90 @@
1
+ """Captcha detection + a registry of our own solvers.
2
+
3
+ Three outcomes when a source (HTML or response) is checked:
4
+ - no challenge -> registry.solve returns None
5
+ - challenge + solver -> Solved{token, expires_at}
6
+ - challenge, no solver -> raise UnsupportedCaptcha
7
+
8
+ A solver produces a token; the backend (compute / LLM-image / JS-runtime) is its own business.
9
+ Tokens are single-use and solved on submit (never pre-solved).
10
+ """
11
+
12
+ from dataclasses import dataclass, field
13
+ from typing import Optional, Protocol, runtime_checkable
14
+
15
+
16
@dataclass
class Challenge:
    """A captcha to solve: its ``kind`` plus whatever ``params`` the solver needs."""

    kind: str  # used by CaptchaRegistry.solve to look up the solver
    params: dict = field(default_factory=dict)  # fresh dict per instance
20
+
21
+
22
+ @dataclass
23
+ class Solved:
24
+ token: str
25
+ expires_at: float | None = None # absolute epoch seconds, from the challenge's own ttl
26
+
27
+
28
class UnsupportedCaptcha(Exception):
    """Raised when a challenge was identified but no solver is registered for its kind."""

    def __init__(self, kind: str):
        message = f"no solver registered for captcha kind: {kind}"
        super().__init__(message)
        self.kind = kind  # kept so callers can branch on the unsupported kind
32
+
33
+
34
class CaptchaServiceError(Exception):
    """Raised when a captcha backend answers with an unexpected or error response.

    Such failures are often transient.
    """
36
+
37
+
38
@runtime_checkable
class Solver(Protocol):
    """Structural interface a captcha solver must satisfy to be registered."""

    kind: str  # key under which CaptchaRegistry stores the solver

    @classmethod
    def detect(cls, text: str) -> Optional[Challenge]:
        """Return a Challenge when ``text`` contains this solver's captcha, else None."""
        ...

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Produce a token for ``challenge``, using ``transport`` for any requests."""
        ...
48
+
49
+
50
class CaptchaRegistry:
    """Solvers keyed by ``kind``; detects a challenge in a page and dispatches the solve."""

    def __init__(self) -> None:
        self._solvers: dict[str, Solver] = {}

    def register(self, solver: Solver) -> "CaptchaRegistry":
        """Store ``solver`` under its ``kind`` (replacing any previous one); returns self."""
        self._solvers[solver.kind] = solver
        return self

    def detect(self, source) -> Optional[Challenge]:
        """Return the first challenge any solver detects, in registration order.

        ``source`` is either the text itself or an object with a ``text`` attribute;
        a missing or empty attribute is treated as ``""``.
        """
        if isinstance(source, str):
            text = source
        else:
            text = getattr(source, "text", "") or ""
        # Lazy: solvers after the first hit are never consulted.
        candidates = (solver.detect(text) for solver in self._solvers.values())
        return next((found for found in candidates if found is not None), None)

    def solve(self, source, transport, *, hint: Optional[Challenge] = None) -> Optional[Solved]:
        """Detect and solve in one step.

        A detected challenge takes precedence; ``hint`` is used only when nothing is
        detected. Returns None when there is no challenge at all.

        Raises:
            UnsupportedCaptcha: No solver is registered for the challenge's kind.
        """
        challenge = self.detect(source) or hint
        if challenge is None:
            return None
        solver = self._solvers.get(challenge.kind)
        if solver is not None:
            return solver.solve(challenge, transport)
        raise UnsupportedCaptcha(challenge.kind)
74
+
75
+
76
def default_registry() -> CaptchaRegistry:
    """Build a registry holding the built-in solvers.

    Registered, in order: mCaptcha PoW (working), then the Turnstile and gov.br
    browserless stubs (detect works, solve raises NotImplementedError until cracked).
    The optional token-adapters (reCAPTCHA/hCaptcha) and the LLM image solver are
    opt-in — register them yourself when configured.
    """
    from .govbr import GovBrSolver
    from .mcaptcha import McaptchaPowSolver
    from .turnstile import TurnstileSolver

    registry = CaptchaRegistry()
    for solver_cls in (McaptchaPowSolver, TurnstileSolver, GovBrSolver):
        registry.register(solver_cls())
    return registry
@@ -0,0 +1,40 @@
1
+ """gov.br (sso.acesso.gov.br) — BROWSERLESS solver scaffold.
2
+
3
+ gov.br SSO is the fleet's most common gate (~79 repos) and is browser-only in atlas today.
4
+ `detect()` works now; `solve()` is a TODO for a manual, browserless crack — fails loudly.
5
+ """
6
+
7
+ import re
8
+
9
+ from .base import Challenge, Solved
10
+
11
# Markers that identify a gov.br gate in page text (matched case-insensitively).
# NOTE(review): the first alternative is redundant — any text containing
# "sso.acesso.gov.br" also matches the second alternative "acesso.gov.br".
_SIGNATURE = re.compile(r"sso\.acesso\.gov\.br|acesso\.gov\.br|\bgovbr\b", re.I)
# Captures the value of a quoted data-sitekey attribute (8+ chars of [0-9A-Za-z_-]).
_SITEKEY_RE = re.compile(r'data-sitekey=["\']([0-9A-Za-z_-]{8,})["\']')
13
+
14
+
15
class GovBrSolver:
    """gov.br gate: detection is implemented, solving is a stub that fails loudly."""

    kind = "govbr"

    @classmethod
    def detect(cls, text: str):
        """Return a ``govbr`` Challenge when ``text`` carries a gov.br marker, else None.

        ``params["sitekey"]`` holds the first ``data-sitekey`` value found, or None.
        """
        haystack = text or ""
        if _SIGNATURE.search(haystack) is None:
            return None
        sitekey_match = _SITEKEY_RE.search(haystack)  # gov.br embeds hCaptcha/reCAPTCHA
        sitekey = sitekey_match.group(1) if sitekey_match else None
        return Challenge(kind=cls.kind, params={"sitekey": sitekey})

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Not implemented yet — always raises NotImplementedError."""
        # TODO(crawlerkit): implement the BROWSERLESS gov.br SSO authentication.
        # gov.br (sso.acesso.gov.br) is JS-heavy and gated by a captcha (hCaptcha/reCAPTCHA) plus
        # fingerprint checks. Browserless approach to fill in here:
        #   1. Drive the SSO step sequence with the verified curl_cffi transport, carrying cookies
        #      across redirects (login -> authorize -> callback).
        #   2. Solve the embedded captcha via the registry (hCaptcha/reCAPTCHA token solver) OR a
        #      JS-runtime crack of the gov.br challenge script (QuickJS/Node + DOM shim seeded from
        #      the active Profile + proxy IP).
        #   3. Complete the OAuth/SSO redirect; return Solved(token=<session cookie / SSO assertion>).
        # Note: some gov.br services accept ICP-Brasil mutual-TLS client certs — see crawlerkit.core.tls.
        raise NotImplementedError(
            f"browserless gov.br solve is a TODO (params={challenge.params!r}) "
            "— implement the SSO/JS-runtime crack"
        )
@@ -0,0 +1,46 @@
1
+ """Own image-captcha solver: fetch the challenge image over the verified transport, classify with
2
+ a pluggable vision LLM, return the answer/token. Provider-agnostic — inject a `classify` callable
3
+ `(image_bytes, prompt) -> str`. Prompts ported from atlas's GPT solver.
4
+
5
+ Image captchas are target-specific, so the crawler builds the Challenge with the image location
6
+ (`params["image_url"]` or `params["image_bytes"]`); `detect()` returns None.
7
+ """
8
+
9
+ from .base import CaptchaServiceError, Challenge, Solved
10
+
11
# Default prompt of LlmImageSolver: asks for the captcha characters and nothing else.
OCR_PROMPT = (
    "This image is a CAPTCHA. Read the characters exactly. Respond with ONLY the characters "
    "(letters/digits), no spaces, no explanation."
)
# 3x3 / 4x4 grid-selection prompts (hCaptcha / reCAPTCHA) are available for grid challenges;
# port the full set from atlas chatgpt_captcha_solver.py when wiring a grid flow.
# The expected answer format is tile numbers joined by '/', or the word 'none'.
GRID_3X3_PROMPT = (
    "A reference image sits above a 3x3 grid (tiles numbered 1-9, left-to-right, top-to-bottom). "
    "Return the tile numbers that clearly and fully match the reference, separated by '/', e.g. '2/5/9'. "
    "If none match, return 'none'. No other text."
)
22
+
23
+
24
class LlmImageSolver:
    """Image-captcha solver that delegates recognition to an injected ``classify`` callable."""

    kind = "image"

    def __init__(self, classify, *, prompt: str = OCR_PROMPT):
        """Keep the classifier and the default prompt.

        Args:
            classify: Callable ``(image_bytes, prompt) -> str``.
            prompt: Prompt used when the challenge does not carry its own.
        """
        self._classify = classify
        self._prompt = prompt

    @classmethod
    def detect(cls, text: str):
        """Always None: the crawler constructs the image Challenge explicitly."""
        return None

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Classify the challenge image and return the stripped answer as the token.

        The image comes from ``params["image_bytes"]`` when present, otherwise it is
        downloaded from ``params["image_url"]`` through ``transport``. A
        ``params["prompt"]`` entry overrides the solver's default prompt.

        Raises:
            CaptchaServiceError: Neither image source is given, or the classifier's
                answer is empty after stripping.
        """
        params = challenge.params
        image = params.get("image_bytes")
        if image is None:
            image_url = params.get("image_url")
            if not image_url:
                raise CaptchaServiceError("LlmImageSolver needs params['image_url'] or ['image_bytes']")
            image = transport.get(image_url, timeout=30).content

        prompt = params.get("prompt", self._prompt)
        raw_answer = self._classify(image, prompt)
        answer = (raw_answer or "").strip()
        if not answer:
            raise CaptchaServiceError("vision LLM returned an empty answer")
        return Solved(token=answer)