crawlerkit-core 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlerkit_core-0.1.0/LICENSE +21 -0
- crawlerkit_core-0.1.0/PKG-INFO +80 -0
- crawlerkit_core-0.1.0/README.md +35 -0
- crawlerkit_core-0.1.0/crawlerkit/core/__init__.py +6 -0
- crawlerkit_core-0.1.0/crawlerkit/core/base_crawler.py +126 -0
- crawlerkit_core-0.1.0/crawlerkit/core/base_parser.py +77 -0
- crawlerkit_core-0.1.0/crawlerkit/core/captcha/__init__.py +34 -0
- crawlerkit_core-0.1.0/crawlerkit/core/captcha/base.py +90 -0
- crawlerkit_core-0.1.0/crawlerkit/core/captcha/govbr.py +40 -0
- crawlerkit_core-0.1.0/crawlerkit/core/captcha/llm_image.py +46 -0
- crawlerkit_core-0.1.0/crawlerkit/core/captcha/mcaptcha.py +142 -0
- crawlerkit_core-0.1.0/crawlerkit/core/captcha/token_adapters.py +77 -0
- crawlerkit_core-0.1.0/crawlerkit/core/captcha/turnstile.py +40 -0
- crawlerkit_core-0.1.0/crawlerkit/core/cookies.py +40 -0
- crawlerkit_core-0.1.0/crawlerkit/core/errors.py +48 -0
- crawlerkit_core-0.1.0/crawlerkit/core/identity.py +108 -0
- crawlerkit_core-0.1.0/crawlerkit/core/proxy.py +96 -0
- crawlerkit_core-0.1.0/crawlerkit/core/tls.py +113 -0
- crawlerkit_core-0.1.0/crawlerkit/core/transport.py +76 -0
- crawlerkit_core-0.1.0/crawlerkit_core.egg-info/PKG-INFO +80 -0
- crawlerkit_core-0.1.0/crawlerkit_core.egg-info/SOURCES.txt +27 -0
- crawlerkit_core-0.1.0/crawlerkit_core.egg-info/dependency_links.txt +1 -0
- crawlerkit_core-0.1.0/crawlerkit_core.egg-info/requires.txt +21 -0
- crawlerkit_core-0.1.0/crawlerkit_core.egg-info/top_level.txt +1 -0
- crawlerkit_core-0.1.0/pyproject.toml +90 -0
- crawlerkit_core-0.1.0/setup.cfg +4 -0
- crawlerkit_core-0.1.0/tests/test_base_parser.py +37 -0
- crawlerkit_core-0.1.0/tests/test_captcha_registry.py +20 -0
- crawlerkit_core-0.1.0/tests/test_errors.py +32 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Lucas Caovilla
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: crawlerkit-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Browserless crawler base: curl_cffi transport, TLS/AIA, identity, proxy, captcha, BaseCrawler/BaseParser.
|
|
5
|
+
Author-email: Lucas Caovilla <lucasgrisac@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/lucascaovilla/crawlerkit
|
|
8
|
+
Project-URL: Repository, https://github.com/lucascaovilla/crawlerkit
|
|
9
|
+
Project-URL: Documentation, https://github.com/lucascaovilla/crawlerkit#readme
|
|
10
|
+
Project-URL: Issues, https://github.com/lucascaovilla/crawlerkit/issues
|
|
11
|
+
Keywords: crawler,scraping,curl_cffi,tls,fingerprint,captcha,browserless
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Python: >=3.11
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: curl_cffi>=0.7
|
|
26
|
+
Requires-Dist: browserforge>=1.2
|
|
27
|
+
Requires-Dist: cryptography>=42
|
|
28
|
+
Requires-Dist: certifi>=2024.0
|
|
29
|
+
Requires-Dist: selectolax>=0.3
|
|
30
|
+
Requires-Dist: lxml>=5.0
|
|
31
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
32
|
+
Requires-Dist: structlog>=24.1
|
|
33
|
+
Requires-Dist: tenacity>=8.2
|
|
34
|
+
Requires-Dist: weasyprint>=60
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
37
|
+
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
38
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
39
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
40
|
+
Requires-Dist: commitizen>=3.27; extra == "dev"
|
|
41
|
+
Provides-Extra: docs
|
|
42
|
+
Requires-Dist: mkdocs-material>=9.5; extra == "docs"
|
|
43
|
+
Requires-Dist: mkdocstrings[python]>=0.25; extra == "docs"
|
|
44
|
+
Dynamic: license-file
|
|
45
|
+
|
|
46
|
+
# crawlerkit-core
|
|
47
|
+
|
|
48
|
+
[PyPI](https://pypi.org/project/crawlerkit-core/)
|
|
49
|
+
[Python versions](https://pypi.org/project/crawlerkit-core/)
|
|
50
|
+
[CI](https://github.com/lucascaovilla/crawlerkit/actions/workflows/ci.yml)
|
|
51
|
+
[License: MIT](LICENSE)
|
|
52
|
+
|
|
53
|
+
A **standalone, browserless** crawler base (`crawlerkit.core`): fingerprinted **curl_cffi** transport,
|
|
54
|
+
per-host TLS with **AIA repair** + `.pfx` client certs, **browserforge** identity (UA snapped to the
|
|
55
|
+
impersonate target), proxy providers, a pluggable **captcha** registry, an error taxonomy with
|
|
56
|
+
retry+rotation, and the `BaseCrawler.flow()` / `BaseParser.parse()` hooks. Zero non-PyPI dependencies —
|
|
57
|
+
`parse()` returns **your own type**, not one the library dictates.
|
|
58
|
+
|
|
59
|
+
## Install
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install crawlerkit-core
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Use
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from crawlerkit.core import BaseCrawler, BaseParser, RawResponse, Transport, Profile
|
|
69
|
+
from crawlerkit.core.captcha import default_registry, McaptchaPowSolver, mcaptcha_hint
|
|
70
|
+
from crawlerkit.core.proxy import StaticProxyProvider, BrightDataProxyProvider
|
|
71
|
+
from crawlerkit.core.errors import BlockedError, TransientError, raise_for_block
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
**HTTP is curl_cffi only — `requests` is never used.** Deps: curl_cffi, browserforge, cryptography,
|
|
75
|
+
certifi, selectolax, lxml, beautifulsoup4, weasyprint, structlog, tenacity.
|
|
76
|
+
|
|
77
|
+
**Build a crawler:** [GETTING_STARTED.md](GETTING_STARTED.md). **Run the demos:**
|
|
78
|
+
[`examples/`](examples/) (`quotes.py` — a full crawl+parse; `fingerprint_demo.py` — identity proof).
|
|
79
|
+
Reference: [`docs/`](docs/) (identity, transport-tls, proxy, captcha, cracking-govbr-turnstile, errors,
|
|
80
|
+
api). License: MIT.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# crawlerkit-core
|
|
2
|
+
|
|
3
|
+
[PyPI](https://pypi.org/project/crawlerkit-core/)
|
|
4
|
+
[Python versions](https://pypi.org/project/crawlerkit-core/)
|
|
5
|
+
[CI](https://github.com/lucascaovilla/crawlerkit/actions/workflows/ci.yml)
|
|
6
|
+
[License: MIT](LICENSE)
|
|
7
|
+
|
|
8
|
+
A **standalone, browserless** crawler base (`crawlerkit.core`): fingerprinted **curl_cffi** transport,
|
|
9
|
+
per-host TLS with **AIA repair** + `.pfx` client certs, **browserforge** identity (UA snapped to the
|
|
10
|
+
impersonate target), proxy providers, a pluggable **captcha** registry, an error taxonomy with
|
|
11
|
+
retry+rotation, and the `BaseCrawler.flow()` / `BaseParser.parse()` hooks. Zero non-PyPI dependencies —
|
|
12
|
+
`parse()` returns **your own type**, not one the library dictates.
|
|
13
|
+
|
|
14
|
+
## Install
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install crawlerkit-core
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Use
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from crawlerkit.core import BaseCrawler, BaseParser, RawResponse, Transport, Profile
|
|
24
|
+
from crawlerkit.core.captcha import default_registry, McaptchaPowSolver, mcaptcha_hint
|
|
25
|
+
from crawlerkit.core.proxy import StaticProxyProvider, BrightDataProxyProvider
|
|
26
|
+
from crawlerkit.core.errors import BlockedError, TransientError, raise_for_block
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
**HTTP is curl_cffi only — `requests` is never used.** Deps: curl_cffi, browserforge, cryptography,
|
|
30
|
+
certifi, selectolax, lxml, beautifulsoup4, weasyprint, structlog, tenacity.
|
|
31
|
+
|
|
32
|
+
**Build a crawler:** [GETTING_STARTED.md](GETTING_STARTED.md). **Run the demos:**
|
|
33
|
+
[`examples/`](examples/) (`quotes.py` — a full crawl+parse; `fingerprint_demo.py` — identity proof).
|
|
34
|
+
Reference: [`docs/`](docs/) (identity, transport-tls, proxy, captcha, cracking-govbr-turnstile, errors,
|
|
35
|
+
api). License: MIT.
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""BaseCrawler — the crawl stage. A new target fills one hook: flow()."""
|
|
2
|
+
|
|
3
|
+
import random
|
|
4
|
+
import time
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
|
|
8
|
+
import structlog
|
|
9
|
+
from bs4 import BeautifulSoup
|
|
10
|
+
|
|
11
|
+
from .captcha.base import CaptchaRegistry, Challenge, default_registry
|
|
12
|
+
from .errors import BlockedError, PermanentError, TransientError
|
|
13
|
+
from .identity import Profile, pick
|
|
14
|
+
from .proxy import NullProxyProvider, ProxyProvider
|
|
15
|
+
from .transport import Transport
|
|
16
|
+
|
|
17
|
+
log = structlog.get_logger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class RawResponse:
    """What a crawler's ``flow()`` hands back: the un-parsed response.

    ``headers`` defaults to a fresh empty dict per instance.
    """

    # Positional order is part of the constructor signature: url, status, text, headers.
    url: str
    status: int
    text: str
    headers: dict = field(default_factory=dict)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class BaseCrawler(ABC):
    """Owns transport+identity+proxy+captcha; subclass implements only flow().

    No business logic, no parsing here — crawl and return the raw response.
    ``run()`` wraps ``flow()`` with retry, backoff and identity/proxy rotation.
    """

    captcha_hint: Challenge | None = None  # known sitekey when the widget isn't inline

    def __init__(
        self,
        *,
        proxy_provider: ProxyProvider | None = None,
        registry: CaptchaRegistry | None = None,
        verify: bool = True,
        profile: Profile | None = None,
        client_cert: str | None = None,
        max_attempts: int = 3,
    ):
        """Store the configuration and build the first transport.

        Args:
            proxy_provider: source of proxy leases; defaults to ``NullProxyProvider()``.
            registry: captcha registry; defaults to ``default_registry()``.
            verify: forwarded to ``Transport`` as ``verify``.
            profile: fixed identity; when omitted, ``pick()`` chooses one on every
                (re)build of the transport.
            client_cert: forwarded to ``Transport`` as ``client_cert``.
            max_attempts: upper bound on ``flow()`` calls made by one ``run()``.
        """
        self._proxy_provider = proxy_provider or NullProxyProvider()
        self._verify = verify
        self._client_cert = client_cert
        self._fixed_profile = profile
        self.max_attempts = max_attempts
        self.registry = registry or default_registry()
        self._build_transport()

    def _build_transport(self) -> None:
        """(Re)create identity + proxy lease + transport — on init and on each rotation."""
        self.profile = self._fixed_profile or pick()
        self.proxy = self._proxy_provider.lease()
        self.transport = Transport(
            self.profile, self.proxy, verify=self._verify, client_cert=self._client_cert
        )

    def _rotate(self) -> None:
        """Log the rotation and rebuild profile, proxy lease and transport."""
        log.info("rotate_identity_proxy")
        self._build_transport()

    # --- helpers exposed to flow() ---
    def get(self, url: str, **kw):
        """Delegate to ``self.transport.get``."""
        return self.transport.get(url, **kw)

    def post(self, url: str, **kw):
        """Delegate to ``self.transport.post``."""
        return self.transport.post(url, **kw)

    def solve_captcha(self, source) -> str | None:
        """detect+solve; returns a token, None (no challenge), or raises UnsupportedCaptcha."""
        solved = self.registry.solve(source, self.transport, hint=self.captcha_hint)
        return solved.token if solved else None

    def hidden_fields(self, html: str) -> dict:
        """All hidden inputs of the form (JSF ViewState / WebForms __VIEWSTATE postback state).

        Looks inside the first ``<form>`` when there is one, otherwise in the whole
        document. An ``<input>`` with a ``name`` is collected when its ``type`` is
        ``hidden`` (compared case-insensitively) or when its name contains
        ``viewstate`` in any letter case. A missing ``value`` becomes ``""``.

        Returns:
            Mapping of input name to value; a later duplicate name overwrites an
            earlier one.
        """
        try:
            soup = BeautifulSoup(html, "lxml")
        except Exception:  # noqa: BLE001
            # Fall back to the stdlib-backed parser when the lxml one cannot be used.
            soup = BeautifulSoup(html, "html.parser")
        form = soup.find("form") if soup else None
        scope = form or soup
        hidden: dict[str, str] = {}
        if scope:
            for inp in scope.find_all("input"):
                name = inp.get("name")
                if not name:
                    continue
                # HTML treats the `type` value case-insensitively, so normalise
                # before comparing; `type="HIDDEN"` must count as hidden too.
                is_hidden = (inp.get("type") or "").lower() == "hidden"
                if is_hidden or "VIEWSTATE" in name.upper():
                    hidden[name] = inp.get("value", "")
        return hidden

    # --- the only required hook ---
    @abstractmethod
    def flow(self, params: dict) -> RawResponse:
        ...

    def run(self, params: dict) -> RawResponse:
        """Run flow() with retry + rotation. TransientError -> back off, retry (same identity);
        BlockedError -> rotate identity+proxy, then retry; PermanentError -> fail fast.

        Raises:
            PermanentError: re-raised immediately, without further attempts.
            BlockedError / TransientError: the last one seen, once ``max_attempts``
                is exhausted.
            RuntimeError: when no attempt ran at all (``max_attempts`` below 1).
        """
        last: Exception | None = None
        for attempt in range(1, self.max_attempts + 1):
            try:
                log.info("crawl_start", crawler=type(self).__name__, attempt=attempt)
                raw = self.flow(params)
                log.info("crawl_done", status=raw.status, bytes=len(raw.text))
                return raw
            except PermanentError:
                raise
            except BlockedError as e:
                last = e
                log.warning("blocked", attempt=attempt, error=str(e))
                # No rotation or sleep after the final attempt — nothing left to retry.
                if attempt < self.max_attempts:
                    self._rotate()
                    self._backoff(attempt)
            except TransientError as e:
                last = e
                log.warning("transient", attempt=attempt, error=str(e))
                if attempt < self.max_attempts:
                    self._backoff(attempt)
        raise last or RuntimeError("crawl failed with no captured error")

    @staticmethod
    def _backoff(attempt: int, cap: float = 30.0) -> None:
        """Sleep ``2**attempt`` seconds plus up to one second of jitter, at most ``cap``."""
        time.sleep(min(2.0**attempt + random.uniform(0, 1), cap))
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""BaseParser — the parse stage. A new target fills one hook: parse().
|
|
2
|
+
|
|
3
|
+
Pure + item-local: no network beyond fetching static assets for the optional PDF, no
|
|
4
|
+
cross-item state. Operates on the RawResponse the crawler returned (or a replayed one).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Generic, TypeVar
|
|
9
|
+
from urllib.parse import urlparse
|
|
10
|
+
|
|
11
|
+
import structlog
|
|
12
|
+
|
|
13
|
+
from .base_crawler import RawResponse
|
|
14
|
+
|
|
15
|
+
log = structlog.get_logger(__name__)
|
|
16
|
+
|
|
17
|
+
#: What ``parse()`` yields — your own model, a dataclass, a ``dict``, anything.
|
|
18
|
+
#: crawlerkit-core stays dependency-free: it never dictates the output type.
|
|
19
|
+
T = TypeVar("T")
|
|
20
|
+
|
|
21
|
+
# Print fixups: hide leftover form inputs, landscape, fit wide tables.
|
|
22
|
+
_PDF_FIXUP_CSS = """
|
|
23
|
+
input { display: none !important; }
|
|
24
|
+
@page { size: A4 landscape; margin: 1.2cm; }
|
|
25
|
+
table { font-size: 9px; table-layout: fixed; width: 100%; }
|
|
26
|
+
td, th { overflow-wrap: anywhere; }
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def render_pdf(html: str, base_url: str) -> bytes:
    """HTML -> PDF (WeasyPrint, no browser). Fetches remote CSS over a verified, AIA-repaired
    TLS connection (curl_cffi + crawlerkit.core.tls). No `requests`.

    Args:
        html: markup to render.
        base_url: passed to WeasyPrint as the document's ``base_url``.

    Returns:
        The value returned by ``HTML.write_pdf`` with the module's print-fixup
        stylesheet applied.

    ``http(s)`` assets are fetched by the nested ``fetcher``; every other URL goes to
    WeasyPrint's ``default_url_fetcher``. An asset whose fetch fails — including an
    HTTP error status — is logged as ``pdf_asset_skipped`` and replaced by an empty
    ``text/plain`` payload instead of aborting the render.
    """
    from curl_cffi import requests as cffi
    from weasyprint import CSS, HTML, default_url_fetcher

    from . import tls

    def fetcher(url: str, **kw):
        if not url.startswith(("http://", "https://")):
            return default_url_fetcher(url, **kw)
        host = urlparse(url).hostname or ""
        try:
            r = cffi.get(url, verify=tls.build_ca_bundle(host), timeout=30, impersonate="chrome131")
            # An error page is not the asset: treat 4xx/5xx like any other failed fetch
            # so the error body is never handed to WeasyPrint as CSS/image data.
            r.raise_for_status()
            ct = r.headers.get("content-type", "")
            out = {"string": r.content, "redirected_url": str(r.url)}
            mime = ct.split(";")[0].strip()
            if mime:
                out["mime_type"] = mime
            return out
        except Exception as e:  # noqa: BLE001 — a missing asset must not kill the PDF
            log.warning("pdf_asset_skipped", url=url, error=str(e))
            return {"string": b"", "mime_type": "text/plain"}

    return HTML(string=html, base_url=base_url, url_fetcher=fetcher).write_pdf(
        stylesheets=[CSS(string=_PDF_FIXUP_CSS)]
    )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class BaseParser(ABC, Generic[T]):
    """Parse stage. Subclass with your own item type: ``class MyParser(BaseParser[MyModel])``
    (or ``BaseParser[dict]``). ``parse()`` returns ``list[T]``; the type is yours, not the lib's."""

    # Set to False on a subclass/instance to make pdf() return None.
    render_pdf_enabled: bool = True

    @abstractmethod
    def parse(self, raw: RawResponse) -> list[T]:
        """Turn the raw response into a list of items (the one required hook)."""

    def pdf(self, raw: RawResponse) -> bytes | None:
        """Render ``raw.text`` to PDF with ``raw.url`` as base URL, or None when disabled."""
        if self.render_pdf_enabled:
            return render_pdf(raw.text, base_url=raw.url)
        return None

    def run(self, raw: RawResponse) -> tuple[list[T], bytes | None]:
        """Parse first, log the item count, then render; returns ``(items, pdf_or_None)``."""
        parsed = self.parse(raw)
        log.info("parse_done", count=len(parsed))
        rendered = self.pdf(raw)
        return parsed, rendered
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from .base import (
|
|
2
|
+
CaptchaRegistry,
|
|
3
|
+
CaptchaServiceError,
|
|
4
|
+
Challenge,
|
|
5
|
+
Solved,
|
|
6
|
+
UnsupportedCaptcha,
|
|
7
|
+
default_registry,
|
|
8
|
+
)
|
|
9
|
+
from .govbr import GovBrSolver
|
|
10
|
+
from .llm_image import LlmImageSolver
|
|
11
|
+
from .mcaptcha import McaptchaPowSolver, mcaptcha_hint
|
|
12
|
+
from .token_adapters import HcaptchaSolver, RecaptchaV2Solver, RecaptchaV3Solver, TokenProvider
|
|
13
|
+
from .turnstile import TurnstileSolver
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"Challenge",
|
|
17
|
+
"Solved",
|
|
18
|
+
"UnsupportedCaptcha",
|
|
19
|
+
"CaptchaServiceError",
|
|
20
|
+
"CaptchaRegistry",
|
|
21
|
+
"default_registry",
|
|
22
|
+
# own solvers
|
|
23
|
+
"McaptchaPowSolver",
|
|
24
|
+
"mcaptcha_hint",
|
|
25
|
+
"LlmImageSolver",
|
|
26
|
+
# browserless stubs (TODO crack)
|
|
27
|
+
"TurnstileSolver",
|
|
28
|
+
"GovBrSolver",
|
|
29
|
+
# optional token-adapters (opt-in)
|
|
30
|
+
"TokenProvider",
|
|
31
|
+
"RecaptchaV2Solver",
|
|
32
|
+
"RecaptchaV3Solver",
|
|
33
|
+
"HcaptchaSolver",
|
|
34
|
+
]
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Captcha detection + a registry of our own solvers.
|
|
2
|
+
|
|
3
|
+
Three outcomes when a source (HTML or response) is checked:
|
|
4
|
+
- no challenge -> registry.solve returns None
|
|
5
|
+
- challenge + solver -> Solved{token, expires_at}
|
|
6
|
+
- challenge, no solver -> raise UnsupportedCaptcha
|
|
7
|
+
|
|
8
|
+
A solver produces a token; the backend (compute / LLM-image / JS-runtime) is its own business.
|
|
9
|
+
Tokens are single-use and solved on submit (never pre-solved).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Optional, Protocol, runtime_checkable
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class Challenge:
    """A detected (or hinted) captcha: its ``kind`` plus solver-specific ``params``.

    ``kind`` is the key the registry uses to look up a solver; ``params`` defaults
    to a fresh empty dict per instance.
    """

    kind: str
    params: dict = field(default_factory=dict)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
|
|
23
|
+
class Solved:
|
|
24
|
+
token: str
|
|
25
|
+
expires_at: float | None = None # absolute epoch seconds, from the challenge's own ttl
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class UnsupportedCaptcha(Exception):
    """Raised when a challenge was found but no solver is registered for its kind.

    The offending kind is kept on the ``kind`` attribute.
    """

    def __init__(self, kind: str):
        message = f"no solver registered for captcha kind: {kind}"
        super().__init__(message)
        self.kind = kind
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class CaptchaServiceError(Exception):
    """A captcha backend answered with something unexpected or an error.

    Often transient.
    """
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@runtime_checkable
class Solver(Protocol):
    """Structural interface every captcha solver satisfies.

    ``kind`` is the registry key; ``detect`` inspects page text; ``solve`` turns a
    challenge into a ``Solved`` token.
    """

    kind: str

    @classmethod
    def detect(cls, text: str) -> Optional[Challenge]:
        """Return a Challenge when ``text`` shows this solver's captcha, else None."""

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Solve ``challenge``, using ``transport`` for any requests, and return the result."""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class CaptchaRegistry:
    """Maps captcha kinds to solver instances and runs detect -> solve over them."""

    def __init__(self) -> None:
        # kind -> solver, iterated in registration order by detect()
        self._solvers: dict[str, Solver] = {}

    def register(self, solver: Solver) -> "CaptchaRegistry":
        """Store ``solver`` under ``solver.kind`` (replacing any previous one); returns self."""
        self._solvers[solver.kind] = solver
        return self

    def detect(self, source) -> Optional[Challenge]:
        """Return the first challenge any registered solver detects, or None.

        ``source`` is either the text itself or an object whose ``text`` attribute
        is used (a missing/empty ``text`` is treated as ``""``).
        """
        if isinstance(source, str):
            text = source
        else:
            text = getattr(source, "text", "") or ""
        # Lazy on purpose: stop asking solvers as soon as one reports a challenge.
        candidates = (solver.detect(text) for solver in self._solvers.values())
        return next((found for found in candidates if found is not None), None)

    def solve(self, source, transport, *, hint: Optional[Challenge] = None) -> Optional[Solved]:
        """Detect a challenge in ``source`` (falling back to ``hint``) and solve it.

        Returns None when there is neither a detected challenge nor a hint.

        Raises:
            UnsupportedCaptcha: no solver is registered for the challenge's kind.
        """
        challenge = self.detect(source) or hint
        if challenge is None:
            return None
        solver = self._solvers.get(challenge.kind)
        if solver is not None:
            return solver.solve(challenge, transport)
        raise UnsupportedCaptcha(challenge.kind)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def default_registry() -> CaptchaRegistry:
    """Registry with the built-in own-solvers: mCaptcha PoW (working) + gov.br/Turnstile
    browserless stubs (detect works, solve raises NotImplementedError until cracked).
    Optional token-adapters (reCAPTCHA/hCaptcha) and the LLM image solver are opt-in —
    register them yourself when configured."""
    from .govbr import GovBrSolver
    from .mcaptcha import McaptchaPowSolver
    from .turnstile import TurnstileSolver

    registry = CaptchaRegistry()
    # Registration order is detection order: mCaptcha, then Turnstile, then gov.br.
    for solver_cls in (McaptchaPowSolver, TurnstileSolver, GovBrSolver):
        registry.register(solver_cls())
    return registry
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""gov.br (sso.acesso.gov.br) — BROWSERLESS solver scaffold.
|
|
2
|
+
|
|
3
|
+
gov.br SSO is the fleet's most common gate (~79 repos) and is browser-only in atlas today.
|
|
4
|
+
`detect()` works now; `solve()` is a TODO for a manual, browserless crack — fails loudly.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
from .base import Challenge, Solved
|
|
10
|
+
|
|
11
|
+
_SIGNATURE = re.compile(r"sso\.acesso\.gov\.br|acesso\.gov\.br|\bgovbr\b", re.I)
|
|
12
|
+
_SITEKEY_RE = re.compile(r'data-sitekey=["\']([0-9A-Za-z_-]{8,})["\']')
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class GovBrSolver:
    """gov.br SSO gate: detection works, solving is not implemented yet."""

    kind = "govbr"

    @classmethod
    def detect(cls, text: str):
        """Return a ``govbr`` Challenge when ``text`` matches the gov.br signature, else None.

        ``params["sitekey"]`` carries the first ``data-sitekey`` value found, or None.
        """
        haystack = text or ""
        if _SIGNATURE.search(haystack) is None:
            return None
        sitekey_match = _SITEKEY_RE.search(haystack)  # gov.br embeds hCaptcha/reCAPTCHA
        sitekey = sitekey_match.group(1) if sitekey_match else None
        return Challenge(kind=cls.kind, params={"sitekey": sitekey})

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Not implemented: always raises NotImplementedError naming the challenge params."""
        # TODO(crawlerkit): implement the BROWSERLESS gov.br SSO authentication.
        # gov.br (sso.acesso.gov.br) is JS-heavy and gated by a captcha (hCaptcha/reCAPTCHA) plus
        # fingerprint checks. Browserless approach to fill in here:
        #   1. Drive the SSO step sequence with the verified curl_cffi transport, carrying cookies
        #      across redirects (login -> authorize -> callback).
        #   2. Solve the embedded captcha via the registry (hCaptcha/reCAPTCHA token solver) OR a
        #      JS-runtime crack of the gov.br challenge script (QuickJS/Node + DOM shim seeded from
        #      the active Profile + proxy IP).
        #   3. Complete the OAuth/SSO redirect; return Solved(token=<session cookie / SSO assertion>).
        # Note: some gov.br services accept ICP-Brasil mutual-TLS client certs — see crawlerkit.core.tls.
        raise NotImplementedError(
            f"browserless gov.br solve is a TODO (params={challenge.params!r}) "
            "— implement the SSO/JS-runtime crack"
        )
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Own image-captcha solver: fetch the challenge image over the verified transport, classify with
|
|
2
|
+
a pluggable vision LLM, return the answer/token. Provider-agnostic — inject a `classify` callable
|
|
3
|
+
`(image_bytes, prompt) -> str`. Prompts ported from atlas's GPT solver.
|
|
4
|
+
|
|
5
|
+
Image captchas are target-specific, so the crawler builds the Challenge with the image location
|
|
6
|
+
(`params["image_url"]` or `params["image_bytes"]`); `detect()` returns None.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .base import CaptchaServiceError, Challenge, Solved
|
|
10
|
+
|
|
11
|
+
OCR_PROMPT = (
|
|
12
|
+
"This image is a CAPTCHA. Read the characters exactly. Respond with ONLY the characters "
|
|
13
|
+
"(letters/digits), no spaces, no explanation."
|
|
14
|
+
)
|
|
15
|
+
# 3x3 / 4x4 grid-selection prompts (hCaptcha / reCAPTCHA) are available for grid challenges;
|
|
16
|
+
# port the full set from atlas chatgpt_captcha_solver.py when wiring a grid flow.
|
|
17
|
+
GRID_3X3_PROMPT = (
|
|
18
|
+
"A reference image sits above a 3x3 grid (tiles numbered 1-9, left-to-right, top-to-bottom). "
|
|
19
|
+
"Return the tile numbers that clearly and fully match the reference, separated by '/', e.g. '2/5/9'. "
|
|
20
|
+
"If none match, return 'none'. No other text."
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class LlmImageSolver:
    """Image-captcha solver backed by an injected ``classify(image_bytes, prompt) -> str``."""

    kind = "image"

    def __init__(self, classify, *, prompt: str = OCR_PROMPT):
        """Keep the classifier callable and the default prompt.

        Args:
            classify: Callable[[bytes, str], str]
            prompt: used when the challenge carries no ``params["prompt"]``.
        """
        self._classify = classify
        self._prompt = prompt

    @classmethod
    def detect(cls, text: str):
        """Always None — the crawler constructs the image Challenge explicitly."""
        return None

    def solve(self, challenge: Challenge, transport) -> Solved:
        """Classify the challenge image and return the stripped answer as the token.

        The image comes from ``params["image_bytes"]`` or, when that is absent, is
        downloaded from ``params["image_url"]`` through ``transport``.

        Raises:
            CaptchaServiceError: neither image source is given, or the classifier's
                answer is empty after stripping.
        """
        params = challenge.params
        image = params.get("image_bytes")
        if image is None:
            image_url = params.get("image_url")
            if not image_url:
                raise CaptchaServiceError("LlmImageSolver needs params['image_url'] or ['image_bytes']")
            image = transport.get(image_url, timeout=30).content
        prompt = params.get("prompt", self._prompt)
        raw_answer = self._classify(image, prompt)
        answer = (raw_answer or "").strip()
        if answer:
            return Solved(token=answer)
        raise CaptchaServiceError("vision LLM returned an empty answer")
|