crawlsmith 0.1.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
crawlsmith/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ """Top-level package for crawlsmith."""
2
+
3
+ from crawlsmith.crawlsmith import CurlCffiScraper, FetchResult
4
+
5
# Package author metadata exposed as module attributes.
__author__ = "Juan Manuel Cristóbal Moreno"
# Full contact address, matching the Author-email field of the package metadata
# (the previous value was only the local part and not a usable address).
__email__ = "juanmcristobal@gmail.com"
__version__ = "0.1.0"
8
+
9
+ __all__ = ["CurlCffiScraper", "FetchResult"]
crawlsmith/cli.py ADDED
@@ -0,0 +1,69 @@
1
+ """Console script for crawlsmith."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import sys
8
+
9
+ import click
10
+
11
+ from crawlsmith.crawlsmith import (DEFAULT_READ_TIMEOUT_SECONDS,
12
+ MIN_CONTENT_LENGTH, CurlCffiScraper)
13
+
14
+
15
@click.command()
@click.argument("url", required=False)
@click.option("--proxy", multiple=True, help="Proxy URL. Can be passed multiple times.")
@click.option("--impersonate", help="curl_cffi impersonation, e.g. chrome120")
@click.option(
    "--timeout",
    default=DEFAULT_READ_TIMEOUT_SECONDS,
    type=int,
    show_default=True,
    help="Read timeout in seconds",
)
@click.option(
    "--min-content-length",
    default=MIN_CONTENT_LENGTH,
    type=int,
    show_default=True,
    help="Minimum content length for proxy success",
)
@click.option("--insecure", is_flag=True, help="Disable TLS verification")
@click.option("--print-content", is_flag=True, help="Print the response body")
def main(
    url: str | None,
    proxy: tuple[str, ...],
    impersonate: str | None,
    timeout: int,
    min_content_length: int,
    insecure: bool,
    print_content: bool,
) -> int:
    """Fetch a URL using the library scraper.

    URL to fetch.
    """
    # NOTE: the docstring above doubles as the click help text, so it is kept
    # short; implementation notes live in comments instead.
    if not url:
        # Without a URL there is nothing to fetch: print the usage text and
        # finish successfully.
        usage = click.get_current_context().get_help()
        click.echo(usage)
        return 0

    client = CurlCffiScraper(
        proxies=[*proxy],
        impersonate=impersonate,
        verify=not insecure,
        min_content_length=min_content_length,
        read_timeout=timeout,
    )
    outcome = asyncio.run(client.fetch(url))

    # The JSON summary of the whole result is always printed first; the raw
    # body follows only on request and only when there is one.
    summary = json.dumps(outcome.to_dict(), ensure_ascii=True)
    click.echo(summary)
    if print_content and outcome.content:
        click.echo(outcome.content)

    # The process exit status mirrors the fetch outcome: 0 on success, 1 otherwise.
    exit_code = 0 if outcome.ok else 1
    raise SystemExit(exit_code)


if __name__ == "__main__":
    sys.exit(main())  # pragma: no cover
@@ -0,0 +1,667 @@
1
+ """Public scraping library built on top of curl_cffi."""
2
+
3
+ # flake8: noqa: E501
4
+
5
+ from __future__ import annotations
6
+
7
+ import asyncio
8
+ import gzip
9
+ import random
10
+ import re
11
+ import ssl
12
+ from dataclasses import asdict, dataclass
13
+ from html import unescape
14
+ from html.parser import HTMLParser
15
+ from typing import Any, Optional, cast
16
+
17
+ try:
18
+ from curl_cffi import requests as curl_requests
19
+ from curl_cffi.requests import errors as curl_errors
20
+ except ImportError: # pragma: no cover - exercised only without dependency installed
21
+ curl_requests = None
22
+ curl_errors = None
23
+
24
+ try:
25
+ from markdownify import markdownify as html_to_markdown
26
+ except ImportError: # pragma: no cover - exercised only without dependency installed
27
+ html_to_markdown = None
28
+
29
+
30
# --- Error categories reported in ``FetchResult.error_type`` -----------------
# Transport-level failures (no usable HTTP response).
ERROR_TYPE_TIMEOUT = "TIMEOUT"
ERROR_TYPE_CONNECTION = "CONNECTION"
ERROR_TYPE_SSL = "SSL"
ERROR_TYPE_INVALID_URL = "INVALID_URL"
# The response was recognised as an anti-bot block page.
ERROR_TYPE_BLOCKED = "BLOCKED"
# HTTP status based categories.
ERROR_TYPE_HTTP_403 = "HTTP_403"
ERROR_TYPE_HTTP_429 = "HTTP_429"
ERROR_TYPE_HTTP_4XX = "HTTP_4XX"
ERROR_TYPE_HTTP_5XX = "HTTP_5XX"
# Anything that fits none of the categories above.
ERROR_TYPE_UNKNOWN = "UNKNOWN"

# Statuses for which trying another proxy is pointless: the proxy loop returns
# the result immediately when it sees one of these.
NON_RETRYABLE_HTTP_STATUSES = {404, 410}

# --- Default request tuning --------------------------------------------------
# Connect / read timeouts, in seconds.
DEFAULT_CONNECT_TIMEOUT_SECONDS = 5
DEFAULT_READ_TIMEOUT_SECONDS = 15
# Smallest body size (UTF-8 bytes of the decoded text) at which a proxied
# response is accepted without trying further proxies.
MIN_CONTENT_LENGTH = 5000
45
+
46
# Pool of User-Agent strings; one is picked at random for every fingerprint.
# Entries must be unique: a repeated entry would be selected more often than
# the others by ``random.choice``.
DEFAULT_USER_AGENTS = [
    # Desktop browsers (Windows, macOS, Linux)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.2520.81",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 OPR/109.0.0.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.5; rv:125.0) Gecko/20100101 Firefox/125.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5.1 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 OPR/109.0.0.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux i686; rv:125.0) Gecko/20100101 Firefox/125.0",
    # Android User Agents
    "Mozilla/5.0 (Linux; Android 14; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36,gzip(gfe)",
    "Mozilla/5.0 (Linux; Android 14; SM-S901B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-S901U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-S908B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-S908U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-G991U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-G998U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-A536B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-A536U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-A515F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-A515U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-G973U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Pixel 6a) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Pixel 6 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Pixel 7 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; moto g pure) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; moto g stylus 5G) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; moto g stylus 5G (2022)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; moto g 5G (2022)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; moto g power (2022)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Redmi Note 9 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Redmi Note 8 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; VOG-L29) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; MAR-LX1A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; M2101K6G) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; M2102J20SG) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; 2201116SG) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; DE2118) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    # iPhone User Agents
    "Mozilla/5.0 (iPhone16,6; U; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/19E241 Safari/602.1",
    "Mozilla/5.0 (iPhone16,3; U; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/19A346 Safari/602.1",
    "Mozilla/5.0 (iPhone15,2; U; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/15E148 Safari/602.1",
    "Mozilla/5.0 (iPhone14,1; U; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/15E148 Safari/602.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/69.0.3497.105 Mobile/15E148 Safari/605.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/13.2b11866 Mobile/16A366 Safari/605.1.15",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A5370a Safari/604.1",
    "Mozilla/5.0 (iPhone9,3; U; CPU iPhone OS 12_0_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14A403 Safari/602.1",
    "Mozilla/5.0 (iPhone9,4; U; CPU iPhone OS 12_0_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14A403 Safari/602.1",
    "Mozilla/5.0 (Apple-iPhone7C2/1202.466; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1A543 Safari/419.3",
    # Windows Phone User Agents
    "Mozilla/5.0 (Windows Phone 10.0; Android 6.0.1; Microsoft; RM-1152) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Mobile Safari/537.36 Edge/15.15254",
    "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; RM-1127_16056) AppleWebKit/537.36(KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10536",
    "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/13.1058",
    # Tablet User Agents
    "Mozilla/5.0 (Linux; Android 14; SM-X906C Build/QP1A.190711.020; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/80.0.3987.119 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 13; Lenovo YT-J706X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 9; Pixel C Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.98 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 8.1.0; SGP771 Build/32.2.A.0.253; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.98 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 8.1.0; SHIELD Tablet K1 Build/MRA58K; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 9; SM-T827R4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.116 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 7.0; SAMSUNG SM-T550 Build/LRX22G) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/3.3 Chrome/38.0.2125.102 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 4.4.3; KFTHWI Build/KTU84M) AppleWebKit/537.36 (KHTML, like Gecko) Silk/47.1.79 like Chrome/47.0.2526.80 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 7.0; LG-V410/V41020c Build/LRX22G) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/34.0.1847.118 Safari/537.36",
    # Older desktop User Agents
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
    "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
]
130
+
131
# curl_cffi exception classes are looked up by name so that a name missing from
# the installed version -- or a missing curl_cffi altogether, in which case
# ``curl_errors`` is None -- resolves to None instead of failing at import time.

# Timeout-like exception classes that could be resolved (possibly empty).
_CURL_TIMEOUT_TYPES = tuple(
    filter(
        None,
        (
            getattr(curl_errors, class_name, None)
            for class_name in (
                "Timeout",
                "TimeoutError",
                "ConnectTimeout",
                "ReadTimeout",
                "RequestsTimeout",
            )
        ),
    )
)

# Single classes; each one is None when it could not be resolved.
_CURL_INVALID_URL = getattr(curl_errors, "InvalidURL", None)
_CURL_SSL_ERROR = getattr(curl_errors, "SSLError", None)
_CURL_PROXY_ERROR = getattr(curl_errors, "ProxyError", None)
_CURL_CONNECTION_ERROR = getattr(curl_errors, "ConnectionError", None)

# General request-error classes that could be resolved (possibly empty).
_CURL_REQUESTS_ERROR = tuple(
    filter(
        None,
        (
            getattr(curl_errors, class_name, None)
            for class_name in ("RequestsError", "RequestError")
        ),
    )
)

# What an ``except`` clause should catch: the resolved request errors, or every
# ``Exception`` when none could be resolved.
_CURL_REQUESTS_ERROR_TYPES = _CURL_REQUESTS_ERROR or (Exception,)
# Same, extended with the standard-library timeout and TLS errors.
_CURL_REQUESTS_AND_TIMEOUT_TYPES = (
    *_CURL_REQUESTS_ERROR_TYPES,
    asyncio.TimeoutError,
    ssl.SSLError,
)
159
+
160
+
161
+ @dataclass(frozen=True)
162
+ class FetchResult:
163
+ ok: bool
164
+ url: str
165
+ status: int | None = None
166
+ content: str | None = None
167
+ markdown: str | None = None
168
+ metadata: dict[str, Any] | None = None
169
+ error_type: str | None = None
170
+ error: str | None = None
171
+ via_proxy: bool = False
172
+ proxy_url: str | None = None
173
+ content_length: int = 0
174
+ is_blocked: bool = False
175
+
176
+ def to_dict(self) -> dict[str, Any]:
177
+ return asdict(self)
178
+
179
+
180
+ class BlockDetector:
181
+ STRONG_PATTERNS = [
182
+ re.compile(r"Attention Required!\s*\|\s*Cloudflare", re.I),
183
+ re.compile(r"Checking your browser", re.I),
184
+ re.compile(r"Just a moment\.\.\.", re.I),
185
+ re.compile(r"cf-chl-", re.I),
186
+ re.compile(r"access\.denied\.\.DDoS\.Guard", re.I),
187
+ re.compile(r"px-captcha", re.I),
188
+ re.compile(r"distil_r_captcha", re.I),
189
+ re.compile(r"distilnetworks", re.I),
190
+ re.compile(r"whoa there, pardner!", re.I),
191
+ re.compile(r"blocked due to a network policy", re.I),
192
+ ]
193
+ SOFT_PATTERNS = [
194
+ re.compile(r"cf_challenge", re.I),
195
+ re.compile(r"cf-turnstile", re.I),
196
+ re.compile(r"g-recaptcha", re.I),
197
+ re.compile(r"www\.google\.com/recaptcha", re.I),
198
+ re.compile(r"recaptcha/api\.js", re.I),
199
+ re.compile(r"hcaptcha\.com/1/api\.js", re.I),
200
+ re.compile(r"hcaptcha", re.I),
201
+ ]
202
+ JS_CHALLENGE_PATTERNS = [
203
+ re.compile(r"window\.location\.href\s*=.*challenge", re.I),
204
+ re.compile(r"document\.cookie\s*=.*challenge", re.I),
205
+ re.compile(r"challenge-platform", re.I),
206
+ re.compile(r"challenge-form", re.I),
207
+ re.compile(r"jschal-answer", re.I),
208
+ re.compile(r"captcha-solution", re.I),
209
+ re.compile(r"verification-token", re.I),
210
+ ]
211
+ SUSPICIOUS_PATTERNS = [
212
+ re.compile(r"<title>Access Denied", re.I),
213
+ re.compile(r"<title>403 Forbidden", re.I),
214
+ re.compile(r"<title>429 Too Many Requests", re.I),
215
+ re.compile(r"<title>You don't have permission", re.I),
216
+ re.compile(r"please verify you are human", re.I),
217
+ re.compile(r"please complete the security check", re.I),
218
+ ]
219
+
220
+ @classmethod
221
+ def is_blocked(cls, response_text: str, status_code: Optional[int] = None) -> bool:
222
+ if not response_text:
223
+ return status_code in (403, 429) if status_code else False
224
+
225
+ if status_code in (403, 429):
226
+ return True
227
+
228
+ if status_code in (520, 521, 522, 523, 524, 525, 526, 527):
229
+ return False
230
+
231
+ head = response_text.lstrip()[:500].lower()
232
+ looks_like_xml = (
233
+ head.startswith("<?xml")
234
+ or head.startswith("<rss")
235
+ or head.startswith("<feed")
236
+ or head.startswith("<rdf:rdf")
237
+ or head.startswith("<sitemapindex")
238
+ or head.startswith("<urlset")
239
+ )
240
+ if looks_like_xml:
241
+ return False
242
+
243
+ text_only = re.sub(r"<[^>]+>", " ", response_text)
244
+ text_only = re.sub(r"\s+", " ", text_only).strip()
245
+ text_len = len(text_only)
246
+ text_ratio = text_len / max(len(response_text), 1)
247
+ looks_like_real_content = (text_len >= 1500) or (
248
+ (text_len >= 800) and (text_ratio >= 0.01)
249
+ )
250
+ looks_like_interstitial = (text_len < 600) or (text_ratio < 0.008)
251
+
252
+ for pattern in cls.STRONG_PATTERNS:
253
+ if pattern.search(response_text):
254
+ return True
255
+
256
+ for pattern in (
257
+ cls.SOFT_PATTERNS + cls.JS_CHALLENGE_PATTERNS + cls.SUSPICIOUS_PATTERNS
258
+ ):
259
+ if pattern.search(response_text):
260
+ return looks_like_interstitial and not looks_like_real_content
261
+
262
+ return False
263
+
264
+ @classmethod
265
+ def get_block_reason(cls, response_text: str) -> str | None:
266
+ if not response_text:
267
+ return None
268
+ for pattern in (
269
+ cls.STRONG_PATTERNS + cls.JS_CHALLENGE_PATTERNS + cls.SUSPICIOUS_PATTERNS
270
+ ):
271
+ if pattern.search(response_text):
272
+ return pattern.pattern
273
+ return None
274
+
275
+
276
def _classify_status(status: int) -> str:
    """Map an HTTP status code to one of the ``ERROR_TYPE_HTTP_*`` categories.

    403 and 429 have dedicated categories; any other 4xx or 5xx status falls
    into the generic class for its range, and everything else is reported as
    ``ERROR_TYPE_UNKNOWN``.
    """
    dedicated = {403: ERROR_TYPE_HTTP_403, 429: ERROR_TYPE_HTTP_429}.get(status)
    if dedicated is not None:
        return dedicated
    if 400 <= status <= 499:
        return ERROR_TYPE_HTTP_4XX
    return ERROR_TYPE_HTTP_5XX if 500 <= status <= 599 else ERROR_TYPE_UNKNOWN
286
+
287
+
288
+ def _header_map(response: Any) -> dict[str, str]:
289
+ headers = getattr(response, "headers", {}) or {}
290
+ return {str(k).lower(): str(v) for k, v in dict(headers).items()}
291
+
292
+
293
+ def _is_gzip_payload(url: str, headers: dict[str, str], body: bytes) -> bool:
294
+ if body.startswith(b"\x1f\x8b"):
295
+ return True
296
+ if "gzip" in headers.get("content-type", "").lower():
297
+ return True
298
+ if ".gz" in headers.get("content-disposition", "").lower():
299
+ return True
300
+ if url.lower().endswith(".gz"):
301
+ return True
302
+ return False
303
+
304
+
305
+ def _extract_response_text(response: Any, url: str) -> str:
306
+ body = getattr(response, "content", None)
307
+ if isinstance(body, bytearray):
308
+ body = bytes(body)
309
+
310
+ if isinstance(body, bytes) and body:
311
+ headers = _header_map(response)
312
+ if _is_gzip_payload(url, headers, body):
313
+ try:
314
+ return gzip.decompress(body).decode("utf-8", errors="replace")
315
+ except OSError:
316
+ pass
317
+
318
+ text = getattr(response, "text", None)
319
+ if isinstance(text, str):
320
+ return text
321
+
322
+ if isinstance(body, bytes):
323
+ return body.decode("utf-8", errors="replace")
324
+
325
+ return ""
326
+
327
+
328
+ def _looks_like_xml_document(content: str) -> bool:
329
+ head = content.lstrip()[:500].lower()
330
+ return (
331
+ head.startswith("<?xml")
332
+ or head.startswith("<rss")
333
+ or head.startswith("<feed")
334
+ or head.startswith("<rdf:rdf")
335
+ or head.startswith("<sitemapindex")
336
+ or head.startswith("<urlset")
337
+ )
338
+
339
+
340
def _convert_html_to_markdown(content: str) -> str:
    """Render ``content`` as Markdown.

    markdownify does the conversion when it is installed and the content does
    not look like an XML document. Otherwise a small regex-based fallback
    rewrites a handful of tags (h1, h2, strong/b, em/i, p, br), drops every
    remaining tag, decodes HTML entities and tidies up blank lines.
    """
    can_use_markdownify = html_to_markdown is not None and not _looks_like_xml_document(
        content
    )
    if can_use_markdownify:
        return html_to_markdown(content, heading_style="ATX").strip()

    # (tag regex, Markdown replacement) pairs, applied in this order.
    tag_rules = (
        (r"<h1[^>]*>(.*?)</h1>", r"# \1\n\n"),
        (r"<h2[^>]*>(.*?)</h2>", r"## \1\n\n"),
        (r"<strong[^>]*>(.*?)</strong>", r"**\1**"),
        (r"<b[^>]*>(.*?)</b>", r"**\1**"),
        (r"<em[^>]*>(.*?)</em>", r"*\1*"),
        (r"<i[^>]*>(.*?)</i>", r"*\1*"),
        (r"<p[^>]*>(.*?)</p>", r"\1\n\n"),
        (r"<br\s*/?>", "\n"),
    )
    tag_flags = re.I | re.S
    rendered = content
    for tag_regex, markdown_form in tag_rules:
        rendered = re.sub(tag_regex, markdown_form, rendered, flags=tag_flags)

    # Drop whatever markup is left, then decode entities.
    rendered = unescape(re.sub(r"<[^>]+>", "", rendered))
    # Collapse runs of three or more newlines into one blank line, and remove
    # spaces/tabs that sit right before a newline.
    rendered = re.sub(r"\n{3,}", "\n\n", rendered)
    rendered = re.sub(r"[ \t]+\n", "\n", rendered)
    return rendered.strip()
362
+
363
+
364
+ class _MetadataHTMLParser(HTMLParser):
365
+ def __init__(self) -> None:
366
+ super().__init__()
367
+ self.document: dict[str, Any] = {}
368
+ self.open_graph: dict[str, Any] = {}
369
+ self.twitter: dict[str, Any] = {}
370
+ self._in_title = False
371
+ self._title_parts: list[str] = []
372
+
373
+ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
374
+ attr_map = {key.lower(): (value or "") for key, value in attrs}
375
+
376
+ if tag == "html" and attr_map.get("lang"):
377
+ self.document["lang"] = attr_map["lang"]
378
+
379
+ if tag == "title":
380
+ self._in_title = True
381
+
382
+ if tag == "meta":
383
+ name = attr_map.get("name", "").strip().lower()
384
+ prop = attr_map.get("property", "").strip().lower()
385
+ content = attr_map.get("content", "").strip()
386
+ if not content:
387
+ return
388
+ if name in {
389
+ "description",
390
+ "author",
391
+ "keywords",
392
+ "published_time",
393
+ "modified_time",
394
+ }:
395
+ self.document[name] = content
396
+ elif name.startswith("twitter:"):
397
+ self.twitter[name.removeprefix("twitter:")] = content
398
+ elif prop.startswith("og:"):
399
+ self.open_graph[prop.removeprefix("og:")] = content
400
+
401
+ if tag == "link":
402
+ rel = attr_map.get("rel", "").strip().lower()
403
+ href = attr_map.get("href", "").strip()
404
+ if rel == "canonical" and href:
405
+ self.document["canonical_url"] = href
406
+
407
+ def handle_endtag(self, tag: str) -> None:
408
+ if tag == "title":
409
+ self._in_title = False
410
+ title = "".join(self._title_parts).strip()
411
+ if title:
412
+ self.document["title"] = unescape(title)
413
+ self._title_parts.clear()
414
+
415
+ def handle_data(self, data: str) -> None:
416
+ if self._in_title:
417
+ self._title_parts.append(data)
418
+
419
+
420
def _extract_metadata(
    response: Any,
    html: str,
    *,
    requested_url: str,
    final_url: str | None,
    status: int | None,
) -> dict[str, Any]:
    """Assemble document-level and HTTP-level metadata for a response.

    Args:
        response: Response object; only its headers are read here.
        html: Decoded body to scan for title, meta and link tags.
        requested_url: URL that was originally requested.
        final_url: URL reported by the response, if any; ``requested_url`` is
            used in its place when it is missing or empty.
        status: HTTP status code, if known.

    Returns:
        A dict with the keys ``document``, ``open_graph``, ``twitter`` and
        ``http``.
    """
    collector = _MetadataHTMLParser()
    try:
        collector.feed(html)
    except Exception:
        # Best effort: a parser failure must not fail the fetch, so whatever
        # was collected before the error is kept.
        pass

    response_headers = _header_map(response)
    http_section = {
        "status": status,
        "requested_url": requested_url,
        "final_url": final_url or requested_url,
        "content_type": response_headers.get("content-type"),
        "content_encoding": response_headers.get("content-encoding"),
        "headers": response_headers,
    }
    return {
        "document": collector.document,
        "open_graph": collector.open_graph,
        "twitter": collector.twitter,
        "http": http_section,
    }
448
+
449
+
450
def _classify_exception(exc: BaseException) -> tuple[str, str]:
    """Translate an exception into an ``(error_type, message)`` pair.

    The rules are tried in order and the first match wins, so the more
    specific categories (timeout, invalid URL, TLS, proxy, connection) take
    precedence over the general request error. Exception classes that could
    not be resolved at import time are None and are simply skipped.
    """
    rules = (
        (
            (asyncio.TimeoutError, *_CURL_TIMEOUT_TYPES),
            ERROR_TYPE_TIMEOUT,
            "Request timed out",
        ),
        ((_CURL_INVALID_URL,), ERROR_TYPE_INVALID_URL, "Invalid URL"),
        ((ssl.SSLError, _CURL_SSL_ERROR), ERROR_TYPE_SSL, "TLS/SSL error"),
        ((_CURL_PROXY_ERROR,), ERROR_TYPE_CONNECTION, "Proxy connection failed"),
        ((_CURL_CONNECTION_ERROR,), ERROR_TYPE_CONNECTION, "Connection failed"),
        (_CURL_REQUESTS_ERROR, ERROR_TYPE_CONNECTION, "HTTP client error"),
    )
    for candidates, error_type, message in rules:
        # Drop unresolved (None) classes; isinstance() against an empty tuple
        # is simply False.
        resolved = tuple(candidate for candidate in candidates if candidate)
        if isinstance(exc, resolved):
            return error_type, message
    return ERROR_TYPE_UNKNOWN, "Unknown error"
468
+
469
+
470
def generate_fingerprint() -> dict[str, Any]:
    """Build a randomised set of browser-like request headers.

    Returns:
        A dict with ``headers`` (header name -> value) and ``http2`` (always
        True). The User-Agent, Accept-Language, Chromium version, platform and
        Cache-Control max-age are drawn at random on every call.
    """
    # The random draws are made in a fixed order so that a seeded ``random``
    # module yields a reproducible fingerprint.
    user_agent = random.choice(DEFAULT_USER_AGENTS)
    language = random.choice(['en', 'es', 'fr'])
    region = random.choice(['US', 'ES', 'CA'])
    quality_digit = random.randint(5, 9)
    chromium_major = random.randint(120, 124)
    platform = random.choice(['"Windows"', '"macOS"', '"Linux"'])
    max_age = random.randint(0, 3600)

    request_headers = {
        "User-Agent": user_agent,
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": f"{language}-{region};q=0.{quality_digit}",
        "Sec-Ch-Ua": f'"Chromium";v="{chromium_major}", "Not.A/Brand";v="24"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": platform,
        "DNT": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": f"max-age={max_age}",
    }
    return {"headers": request_headers, "http2": True}
492
+
493
+
494
class StealthRequest:
    """Fetch a URL through curl_cffi with browser impersonation and proxies.

    One set of randomised headers is generated per ``fetch()`` call. When
    proxies are configured, up to three distinct ones are tried in random
    order; otherwise a single direct request is made. Failures are reported
    through the returned ``FetchResult`` rather than raised.
    """

    BROWSER_IMPERSONATIONS = [
        "chrome120",
        "chrome119",
        "chrome116",
        "safari15_5",
        "safari15_3",
    ]

    # Upper bound on how many distinct proxies a single fetch() will try.
    _MAX_PROXY_ATTEMPTS = 3

    def __init__(
        self,
        proxies: Optional[list[str]] = None,
        verify: bool = True,
        impersonate: Optional[str] = None,
        min_content_length: int = MIN_CONTENT_LENGTH,
        connect_timeout: int = DEFAULT_CONNECT_TIMEOUT_SECONDS,
        read_timeout: int = DEFAULT_READ_TIMEOUT_SECONDS,
    ) -> None:
        """Store the request configuration.

        Args:
            proxies: Proxy URLs to rotate through; entries without a scheme
                are treated as HTTP proxies.
            verify: Whether TLS certificates are verified.
            impersonate: curl_cffi impersonation profile; a random entry of
                ``BROWSER_IMPERSONATIONS`` is chosen when omitted.
            min_content_length: Minimum body size (UTF-8 bytes) for a proxied
                response to be accepted without trying another proxy.
            connect_timeout: Connect timeout in seconds.
            read_timeout: Read timeout in seconds.
        """
        self.proxies = list(proxies or [])
        self.verify = verify
        self.impersonate = impersonate or random.choice(self.BROWSER_IMPERSONATIONS)
        self.min_content_length = min_content_length
        self.connect_timeout = connect_timeout
        self.read_timeout = read_timeout

    @staticmethod
    def _normalize_proxy(proxy_url: str | None) -> str | None:
        """Return ``proxy_url`` with a scheme, or None when no proxy is given.

        A URL that already carries a scheme (``http://``, ``https://``,
        ``socks5://`` ...) is returned untouched; a bare ``host:port`` gets
        ``http://`` prepended. Testing for "://" instead of an "http" prefix
        keeps non-HTTP schemes such as socks5 from being mangled into
        ``http://socks5://...``.
        """
        if not proxy_url:
            return None
        if "://" in proxy_url:
            return proxy_url
        return f"http://{proxy_url}"

    async def _attempt(
        self, url: str, proxy_url: str | None, headers: dict[str, str]
    ) -> FetchResult:
        """Perform one request, optionally through ``proxy_url``.

        Returns a ``FetchResult`` for every outcome handled here: blocked
        page, HTTP 200, other HTTP status, or a caught client/timeout/TLS
        exception.
        """
        proxy = self._normalize_proxy(proxy_url)
        via_proxy = proxy is not None

        try:
            proxy_dict = cast(Any, {"http": proxy, "https": proxy} if proxy else None)
            # The blocking curl_cffi call runs in a worker thread so the event
            # loop stays responsive.
            response = await asyncio.to_thread(
                curl_requests.get,
                url,
                headers=headers,
                proxies=proxy_dict,
                timeout=(self.connect_timeout, self.read_timeout),
                verify=self.verify,
                allow_redirects=True,
                impersonate=cast(Any, self.impersonate),
            )
            status = response.status_code
            text = _extract_response_text(response, url)
            final_url = cast(str | None, getattr(response, "url", None))
            content_length = len(text.encode("utf-8"))
            is_blocked = BlockDetector.is_blocked(text, status)
            metadata = _extract_metadata(
                response,
                text,
                requested_url=url,
                final_url=final_url,
                status=status,
            )

            # Fields shared by every result built from a received response.
            common: dict[str, Any] = {
                "url": url,
                "status": status,
                "metadata": metadata,
                "via_proxy": via_proxy,
                "proxy_url": proxy_url,
                "content_length": content_length,
            }

            if is_blocked:
                return FetchResult(
                    ok=False,
                    error_type=ERROR_TYPE_BLOCKED,
                    error=BlockDetector.get_block_reason(text)
                    or "Anti-bot challenge detected",
                    is_blocked=True,
                    **common,
                )

            if status == 200:
                return FetchResult(
                    ok=True,
                    content=text,
                    markdown=_convert_html_to_markdown(text),
                    is_blocked=False,
                    **common,
                )

            return FetchResult(
                ok=False,
                error_type=_classify_status(status),
                error=f"HTTP {status}",
                is_blocked=False,
                **common,
            )
        except _CURL_REQUESTS_AND_TIMEOUT_TYPES as exc:
            error_type, message = _classify_exception(exc)
            return FetchResult(
                ok=False,
                url=url,
                status=None,
                error_type=error_type,
                error=message,
                via_proxy=via_proxy,
                proxy_url=proxy_url,
                is_blocked=False,
            )

    async def fetch(self, url: str) -> FetchResult:
        """Fetch ``url`` and return the outcome as a ``FetchResult``.

        Without proxies a single direct attempt is made. With proxies,
        duplicates are removed, the order is shuffled and at most three are
        tried. The loop stops early on a non-retryable status (404/410) or on
        a successful, unblocked response of at least ``min_content_length``
        bytes; otherwise the result of the last attempt is returned.
        """
        if curl_requests is None:
            return FetchResult(
                ok=False,
                url=url,
                error_type=ERROR_TYPE_CONNECTION,
                error="curl_cffi is not installed",
            )

        headers = generate_fingerprint()["headers"]

        if not self.proxies:
            return await self._attempt(url, None, headers)

        to_try = list(dict.fromkeys(self.proxies))  # de-duplicate, keep order
        random.shuffle(to_try)
        last_result: FetchResult | None = None
        for proxy_url in to_try[: self._MAX_PROXY_ATTEMPTS]:
            result = await self._attempt(url, proxy_url, headers)
            last_result = result
            if result.status in NON_RETRYABLE_HTTP_STATUSES:
                # Another proxy will not change a 404/410.
                return result
            if (
                result.ok
                and not result.is_blocked
                and result.content_length >= self.min_content_length
            ):
                return result

        return last_result or FetchResult(
            ok=False,
            url=url,
            error_type=ERROR_TYPE_CONNECTION,
            error="All proxy attempts failed",
            via_proxy=True,
            proxy_url=to_try[0] if to_try else None,
        )
645
+
646
+
647
class CurlCffiScraper:
    """Public scraper facade; every fetch is delegated to ``StealthRequest``."""

    def __init__(
        self,
        proxies: Optional[list[str]] = None,
        impersonate: Optional[str] = None,
        verify: bool = True,
        min_content_length: int = MIN_CONTENT_LENGTH,
        connect_timeout: int = DEFAULT_CONNECT_TIMEOUT_SECONDS,
        read_timeout: int = DEFAULT_READ_TIMEOUT_SECONDS,
    ) -> None:
        """Create the underlying ``StealthRequest`` with the given settings.

        All arguments are forwarded unchanged, by keyword.
        """
        request_options = {
            "proxies": proxies,
            "verify": verify,
            "impersonate": impersonate,
            "min_content_length": min_content_length,
            "connect_timeout": connect_timeout,
            "read_timeout": read_timeout,
        }
        self.stealth_request = StealthRequest(**request_options)

    async def fetch(self, url: str) -> FetchResult:
        """Fetch ``url`` and return the resulting ``FetchResult``."""
        outcome = await self.stealth_request.fetch(url)
        return outcome
@@ -0,0 +1,396 @@
1
+ Metadata-Version: 2.4
2
+ Name: crawlsmith
3
+ Version: 0.1.0
4
+ Summary: Crawlsmith helps you craft reliable web crawlers in Python, combining page fetching, HTML parsing, link discovery, and content extraction into a simple and extensible toolkit.
5
+ Home-page: https://github.com/juanmcristobal/crawlsmith
6
+ Author: Juan Manuel Cristóbal Moreno
7
+ Author-email: juanmcristobal@gmail.com
8
+ Keywords: crawlsmith
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Natural Language :: English
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ License-File: AUTHORS.md
19
+ Requires-Dist: Click==8.1.7
20
+ Requires-Dist: curl_cffi>=0.7.0
21
+ Requires-Dist: markdownify>=0.13.1
22
+ Dynamic: author
23
+ Dynamic: author-email
24
+ Dynamic: classifier
25
+ Dynamic: description
26
+ Dynamic: description-content-type
27
+ Dynamic: home-page
28
+ Dynamic: keywords
29
+ Dynamic: license-file
30
+ Dynamic: requires-dist
31
+ Dynamic: requires-python
32
+ Dynamic: summary
33
+
34
+ ![Crawlsmith banner](banner.png)
35
+
36
+ # Crawlsmith
37
+
38
+ Crawlsmith is a Python scraping toolkit for fetching web pages with
39
+ `curl_cffi`, extracting readable content, detecting common anti-bot
40
+ interstitials, and returning structured metadata in a single result object.
41
+
42
+ It is designed for Python developers who want a small, pragmatic interface for:
43
+
44
+ - fetching HTML or XML content
45
+ - converting HTML to Markdown
46
+ - rotating browser impersonation profiles
47
+ - trying multiple proxies
48
+ - classifying HTTP and network failures
49
+ - extracting document, Open Graph, Twitter, and HTTP metadata
50
+
51
+ ## Features
52
+
53
+ - Async-first Python API built around `CurlCffiScraper`
54
+ - Structured `FetchResult` object with success state, content, Markdown, and metadata
55
+ - Automatic browser fingerprint headers and `curl_cffi` impersonation support
56
+ - Proxy rotation that stops at the first successful response and tries at most three proxies per fetch
57
+ - Detection of common anti-bot challenge pages such as Cloudflare-style interstitials
58
+ - Gzip payload handling for compressed responses and feeds
59
+ - Built-in CLI for quick fetch, inspection, and debugging
60
+
61
+ ## Installation
62
+
63
+ Install from PyPI:
64
+
65
+ ```bash
66
+ pip install crawlsmith
67
+ ```
68
+
69
+ Requirements:
70
+
71
+ - Python 3.10+
72
+
73
+ ## Quick Start
74
+
75
+ ```python
76
+ import asyncio
77
+
78
+ from crawlsmith import CurlCffiScraper
79
+
80
+
81
+ async def main() -> None:
82
+ scraper = CurlCffiScraper()
83
+ result = await scraper.fetch("https://example.com")
84
+
85
+ if result.ok:
86
+ print(result.status)
87
+ print(result.content[:200])
88
+ print(result.markdown[:200])
89
+ else:
90
+ print(result.error_type, result.error)
91
+
92
+
93
+ asyncio.run(main())
94
+ ```
95
+
96
+ ## Python Usage
97
+
98
+ ### Basic Fetch
99
+
100
+ ```python
101
+ import asyncio
102
+
103
+ from crawlsmith import CurlCffiScraper
104
+
105
+
106
+ async def main() -> None:
107
+ scraper = CurlCffiScraper()
108
+ result = await scraper.fetch("https://example.com")
109
+
110
+ if not result.ok:
111
+ raise RuntimeError(f"{result.error_type}: {result.error}")
112
+
113
+ print("Status:", result.status)
114
+ print("URL:", result.url)
115
+ print("Content length:", result.content_length)
116
+
117
+
118
+ asyncio.run(main())
119
+ ```
120
+
121
+ ### Read HTML and Markdown
122
+
123
+ When a request succeeds with HTTP `200`, Crawlsmith returns both the raw response
124
+ body and a Markdown representation.
125
+
126
+ ```python
127
+ import asyncio
128
+
129
+ from crawlsmith import CurlCffiScraper
130
+
131
+
132
+ async def main() -> None:
133
+ scraper = CurlCffiScraper()
134
+ result = await scraper.fetch("https://example.com")
135
+
136
+ if result.ok:
137
+ html = result.content
138
+ markdown = result.markdown
139
+ print(html[:300])
140
+ print(markdown[:300])
141
+
142
+
143
+ asyncio.run(main())
144
+ ```
145
+
146
+ ### Access Structured Metadata
147
+
148
+ Each result includes metadata extracted from the response body and headers.
149
+
150
+ ```python
151
+ import asyncio
152
+
153
+ from crawlsmith import CurlCffiScraper
154
+
155
+
156
+ async def main() -> None:
157
+ scraper = CurlCffiScraper()
158
+ result = await scraper.fetch("https://example.com")
159
+
160
+ metadata = result.metadata or {}
161
+ document = metadata.get("document", {})
162
+ open_graph = metadata.get("open_graph", {})
163
+ twitter = metadata.get("twitter", {})
164
+ http = metadata.get("http", {})
165
+
166
+ print("Title:", document.get("title"))
167
+ print("Description:", document.get("description"))
168
+ print("Canonical URL:", document.get("canonical_url"))
169
+ print("OG Title:", open_graph.get("title"))
170
+ print("Twitter Card:", twitter.get("card"))
171
+ print("Final URL:", http.get("final_url"))
172
+
173
+
174
+ asyncio.run(main())
175
+ ```
176
+
177
+ ### Use Proxies
178
+
179
+ Pass a list of proxies. Crawlsmith will shuffle them, try up to three unique
180
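+ entries, and return as soon as one succeeds with enough content or responds with a non-retryable HTTP status. If no attempt qualifies, the result of the last attempt is returned.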
181
+
182
+ ```python
183
+ import asyncio
184
+
185
+ from crawlsmith import CurlCffiScraper
186
+
187
+
188
+ async def main() -> None:
189
+ scraper = CurlCffiScraper(
190
+ proxies=[
191
+ "http://user:pass@proxy-1.example:8080",
192
+ "http://user:pass@proxy-2.example:8080",
193
+ "proxy-3.example:8080",
194
+ ],
195
+ min_content_length=2000,
196
+ )
197
+
198
+ result = await scraper.fetch("https://example.com")
199
+ print(result.ok, result.via_proxy, result.proxy_url)
200
+
201
+
202
+ asyncio.run(main())
203
+ ```
204
+
205
+ ### Control Browser Impersonation
206
+
207
+ You can force a specific `curl_cffi` impersonation profile instead of using the
208
+ default randomized behavior.
209
+
210
+ ```python
211
+ import asyncio
212
+
213
+ from crawlsmith import CurlCffiScraper
214
+
215
+
216
+ async def main() -> None:
217
+ scraper = CurlCffiScraper(impersonate="chrome120")
218
+ result = await scraper.fetch("https://example.com")
219
+ print(result.status, result.error_type)
220
+
221
+
222
+ asyncio.run(main())
223
+ ```
224
+
225
+ ### Configure TLS and Timeouts
226
+
227
+ ```python
228
+ import asyncio
229
+
230
+ from crawlsmith import CurlCffiScraper
231
+
232
+
233
+ async def main() -> None:
234
+ scraper = CurlCffiScraper(
235
+ verify=True,
236
+ connect_timeout=5,
237
+ read_timeout=20,
238
+ )
239
+ result = await scraper.fetch("https://example.com")
240
+ print(result.to_dict())
241
+
242
+
243
+ asyncio.run(main())
244
+ ```
245
+
246
+ If you need to disable TLS certificate verification for a controlled internal
247
+ environment, set `verify=False`.
248
+
249
+ ### Handle Errors Explicitly
250
+
251
+ Failures are returned as structured results instead of raising request errors in
252
+ normal operation.
253
+
254
+ ```python
255
+ import asyncio
256
+
257
+ from crawlsmith import CurlCffiScraper
258
+
259
+
260
+ async def main() -> None:
261
+ scraper = CurlCffiScraper()
262
+ result = await scraper.fetch("https://example.com")
263
+
264
+ if result.ok:
265
+ print("Fetched successfully")
266
+ return
267
+
268
+ print("Error type:", result.error_type)
269
+ print("Error message:", result.error)
270
+ print("HTTP status:", result.status)
271
+ print("Blocked:", result.is_blocked)
272
+
273
+
274
+ asyncio.run(main())
275
+ ```
276
+
277
+ Common error types include:
278
+
279
+ - `TIMEOUT`
280
+ - `CONNECTION`
281
+ - `SSL`
282
+ - `INVALID_URL`
283
+ - `BLOCKED`
284
+ - `HTTP_403`
285
+ - `HTTP_429`
286
+ - `HTTP_4XX`
287
+ - `HTTP_5XX`
288
+ - `UNKNOWN`
289
+
290
+ ### Serialize Results
291
+
292
+ `FetchResult` can be converted directly into a plain dictionary for logging,
293
+ storage, or JSON serialization.
294
+
295
+ ```python
296
+ import asyncio
297
+ import json
298
+
299
+ from crawlsmith import CurlCffiScraper
300
+
301
+
302
+ async def main() -> None:
303
+ scraper = CurlCffiScraper()
304
+ result = await scraper.fetch("https://example.com")
305
+ print(json.dumps(result.to_dict(), indent=2))
306
+
307
+
308
+ asyncio.run(main())
309
+ ```
310
+
311
+ ## CLI Usage
312
+
313
+ The package installs a `crawlsmith` command for quick fetches from the terminal.
314
+
315
+ ### Basic CLI Request
316
+
317
+ ```bash
318
+ crawlsmith https://example.com
319
+ ```
320
+
321
+ The CLI prints a JSON-serialized `FetchResult` to stdout.
322
+
323
+ ### Print the Response Body
324
+
325
+ ```bash
326
+ crawlsmith https://example.com --print-content
327
+ ```
328
+
329
+ ### Use One or More Proxies
330
+
331
+ ```bash
332
+ crawlsmith https://example.com \
333
+ --proxy http://user:pass@proxy-1.example:8080 \
334
+ --proxy http://user:pass@proxy-2.example:8080 \
335
+ --min-content-length 2000
336
+ ```
337
+
338
+ ### Force an Impersonation Profile
339
+
340
+ ```bash
341
+ crawlsmith https://example.com --impersonate chrome120
342
+ ```
343
+
344
+ ### Change Timeout or Disable TLS Verification
345
+
346
+ ```bash
347
+ crawlsmith https://example.com --timeout 20
348
+ ```
349
+
350
+ ```bash
351
+ crawlsmith https://example.com --insecure
352
+ ```
353
+
354
+ ### CLI Exit Codes
355
+
356
+ - `0` when the request succeeds
357
+ - `1` when the request fails
358
+
359
+ ### CLI Help
360
+
361
+ ```bash
362
+ crawlsmith --help
363
+ ```
364
+
365
+ ## Result Model
366
+
367
+ `FetchResult` exposes the following fields:
368
+
369
+ - `ok`: whether the request was considered successful
370
+ - `url`: requested URL
371
+ - `status`: HTTP status code when available
372
+ - `content`: raw response text when successful
373
+ - `markdown`: Markdown conversion of the response body when successful
374
+ - `metadata`: extracted document and HTTP metadata
375
+ - `error_type`: normalized error classification
376
+ - `error`: human-readable error summary
377
+ - `via_proxy`: whether the final attempt (successful or not) used a proxy
378
+ - `proxy_url`: proxy used for the final attempt, if any
379
+ - `content_length`: UTF-8 byte length of the extracted text
380
+ - `is_blocked`: whether the response looks like an anti-bot interstitial
381
+
382
+
383
+ ## Support & Connect
384
+
385
+ * ⭐ **Star the repo** if you found it useful
386
+ * ☕ **Support me:** Say thanks by buying me a coffee! [https://buymeacoffee.com/juanmcristobal](https://buymeacoffee.com/juanmcristobal)
387
+ * 💼 **Open to work:** [https://www.linkedin.com/in/jmcristobal/](https://www.linkedin.com/in/jmcristobal/)
388
+
389
+
390
+ # History
391
+
392
+
393
+ ## 0.1.0 (2026-04-07)
394
+
395
+
396
+ * First release.
@@ -0,0 +1,9 @@
1
+ crawlsmith/__init__.py,sha256=E7DzsXgbN1dLolUj0mwmbNFwI8uO8QDD4d2Z8LbjM8M,247
2
+ crawlsmith/cli.py,sha256=4pfHq7xxXGGOZwKDsnXxq-UDxdqi7KkR-IjVwATKV0c,1822
3
+ crawlsmith/crawlsmith.py,sha256=WwPZ1_UMGm_AqC1KTqvSvUj54WCj-LTl6ubbLG1CcAI,30390
4
+ crawlsmith-0.1.0.dist-info/licenses/AUTHORS.md,sha256=_iiWmZhfEpCmhL904fE_TcL5lFcj72NTwzzaaXAQ2-s,132
5
+ crawlsmith-0.1.0.dist-info/METADATA,sha256=jdfGEpKhDiyJoCsdGiYB-QRUCQ4qW_kcuQkPEC6n1_o,9123
6
+ crawlsmith-0.1.0.dist-info/WHEEL,sha256=TdQ5LtNwLuxTCjgxN51AgdU5w-KkB9ttmLbzjTH02pg,109
7
+ crawlsmith-0.1.0.dist-info/entry_points.txt,sha256=9gEU7jvKLjF5d6GsOJn471DwTuT378bSGCjd04Pc_qk,51
8
+ crawlsmith-0.1.0.dist-info/top_level.txt,sha256=cC0xCz1cJd3GmiYwJyUTZENu0osLLW6HZWmvUyHLBWI,11
9
+ crawlsmith-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,6 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py2-none-any
5
+ Tag: py3-none-any
6
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ crawlsmith = crawlsmith.cli:main
@@ -0,0 +1,10 @@
1
+ # Credits
2
+
3
+
4
+ ## Development Lead
5
+
6
+ * Juan Manuel Cristóbal Moreno <juanmcristobal@gmail.com>
7
+
8
+ ## Contributors
9
+
10
+ None yet. Why not be the first?
@@ -0,0 +1 @@
1
+ crawlsmith