crawlsmith 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlsmith-0.1.0/AUTHORS.md +10 -0
- crawlsmith-0.1.0/HISTORY.md +7 -0
- crawlsmith-0.1.0/MANIFEST.in +10 -0
- crawlsmith-0.1.0/PKG-INFO +396 -0
- crawlsmith-0.1.0/README.md +354 -0
- crawlsmith-0.1.0/crawlsmith/__init__.py +9 -0
- crawlsmith-0.1.0/crawlsmith/cli.py +69 -0
- crawlsmith-0.1.0/crawlsmith/crawlsmith.py +667 -0
- crawlsmith-0.1.0/crawlsmith.egg-info/PKG-INFO +396 -0
- crawlsmith-0.1.0/crawlsmith.egg-info/SOURCES.txt +19 -0
- crawlsmith-0.1.0/crawlsmith.egg-info/dependency_links.txt +1 -0
- crawlsmith-0.1.0/crawlsmith.egg-info/entry_points.txt +2 -0
- crawlsmith-0.1.0/crawlsmith.egg-info/not-zip-safe +1 -0
- crawlsmith-0.1.0/crawlsmith.egg-info/requires.txt +3 -0
- crawlsmith-0.1.0/crawlsmith.egg-info/top_level.txt +1 -0
- crawlsmith-0.1.0/requirements.txt +3 -0
- crawlsmith-0.1.0/setup.cfg +29 -0
- crawlsmith-0.1.0/setup.py +52 -0
- crawlsmith-0.1.0/tests/__init__.py +1 -0
- crawlsmith-0.1.0/tests/test_crawlsmith.py +510 -0
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: crawlsmith
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Crawlsmith helps you craft reliable web crawlers in Python, combining page fetching, HTML parsing, link discovery, and content extraction into a simple and extensible toolkit.
|
|
5
|
+
Home-page: https://github.com/juanmcristobal/crawlsmith
|
|
6
|
+
Author: Juan Manuel Cristóbal Moreno
|
|
7
|
+
Author-email: juanmcristobal@gmail.com
|
|
8
|
+
Keywords: crawlsmith
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Natural Language :: English
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: AUTHORS.md
|
|
19
|
+
Requires-Dist: Click==8.1.7
|
|
20
|
+
Requires-Dist: curl_cffi>=0.7.0
|
|
21
|
+
Requires-Dist: markdownify>=0.13.1
|
|
22
|
+
Dynamic: author
|
|
23
|
+
Dynamic: author-email
|
|
24
|
+
Dynamic: classifier
|
|
25
|
+
Dynamic: description
|
|
26
|
+
Dynamic: description-content-type
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: keywords
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
Dynamic: requires-dist
|
|
31
|
+
Dynamic: requires-python
|
|
32
|
+
Dynamic: summary
|
|
33
|
+
|
|
34
|
+

|
|
35
|
+
|
|
36
|
+
# Crawlsmith
|
|
37
|
+
|
|
38
|
+
Crawlsmith is a Python scraping toolkit for fetching web pages with
|
|
39
|
+
`curl_cffi`, extracting readable content, detecting common anti-bot
|
|
40
|
+
interstitials, and returning structured metadata in a single result object.
|
|
41
|
+
|
|
42
|
+
It is designed for Python developers who want a small, pragmatic interface for:
|
|
43
|
+
|
|
44
|
+
- fetching HTML or XML content
|
|
45
|
+
- converting HTML to Markdown
|
|
46
|
+
- rotating browser impersonation profiles
|
|
47
|
+
- trying multiple proxies
|
|
48
|
+
- classifying HTTP and network failures
|
|
49
|
+
- extracting document, Open Graph, Twitter, and HTTP metadata
|
|
50
|
+
|
|
51
|
+
## Features
|
|
52
|
+
|
|
53
|
+
- Async-first Python API built around `CurlCffiScraper`
|
|
54
|
+
- Structured `FetchResult` object with success state, content, Markdown, and metadata
|
|
55
|
+
- Automatic browser fingerprint headers and `curl_cffi` impersonation support
|
|
56
|
+
- Proxy rotation with early success and retry limits
|
|
57
|
+
- Detection of common anti-bot challenge pages such as Cloudflare-style interstitials
|
|
58
|
+
- Gzip payload handling for compressed responses and feeds
|
|
59
|
+
- Built-in CLI for quick fetch, inspection, and debugging
|
|
60
|
+
|
|
61
|
+
## Installation
|
|
62
|
+
|
|
63
|
+
Install from PyPI:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install crawlsmith
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Requirements:
|
|
70
|
+
|
|
71
|
+
- Python 3.10+
|
|
72
|
+
|
|
73
|
+
## Quick Start
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import asyncio
|
|
77
|
+
|
|
78
|
+
from crawlsmith import CurlCffiScraper
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
async def main() -> None:
|
|
82
|
+
scraper = CurlCffiScraper()
|
|
83
|
+
result = await scraper.fetch("https://example.com")
|
|
84
|
+
|
|
85
|
+
if result.ok:
|
|
86
|
+
print(result.status)
|
|
87
|
+
print(result.content[:200])
|
|
88
|
+
print(result.markdown[:200])
|
|
89
|
+
else:
|
|
90
|
+
print(result.error_type, result.error)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
asyncio.run(main())
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Python Usage
|
|
97
|
+
|
|
98
|
+
### Basic Fetch
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
import asyncio
|
|
102
|
+
|
|
103
|
+
from crawlsmith import CurlCffiScraper
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
async def main() -> None:
|
|
107
|
+
scraper = CurlCffiScraper()
|
|
108
|
+
result = await scraper.fetch("https://example.com")
|
|
109
|
+
|
|
110
|
+
if not result.ok:
|
|
111
|
+
raise RuntimeError(f"{result.error_type}: {result.error}")
|
|
112
|
+
|
|
113
|
+
print("Status:", result.status)
|
|
114
|
+
print("URL:", result.url)
|
|
115
|
+
print("Content length:", result.content_length)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
asyncio.run(main())
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Read HTML and Markdown
|
|
122
|
+
|
|
123
|
+
When a request succeeds with HTTP `200`, Crawlsmith returns both the raw response
|
|
124
|
+
body and a Markdown representation.
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
import asyncio
|
|
128
|
+
|
|
129
|
+
from crawlsmith import CurlCffiScraper
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
async def main() -> None:
|
|
133
|
+
scraper = CurlCffiScraper()
|
|
134
|
+
result = await scraper.fetch("https://example.com")
|
|
135
|
+
|
|
136
|
+
if result.ok:
|
|
137
|
+
html = result.content
|
|
138
|
+
markdown = result.markdown
|
|
139
|
+
print(html[:300])
|
|
140
|
+
print(markdown[:300])
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
asyncio.run(main())
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### Access Structured Metadata
|
|
147
|
+
|
|
148
|
+
Each result includes metadata extracted from the response body and headers.
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
import asyncio
|
|
152
|
+
|
|
153
|
+
from crawlsmith import CurlCffiScraper
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
async def main() -> None:
|
|
157
|
+
scraper = CurlCffiScraper()
|
|
158
|
+
result = await scraper.fetch("https://example.com")
|
|
159
|
+
|
|
160
|
+
metadata = result.metadata or {}
|
|
161
|
+
document = metadata.get("document", {})
|
|
162
|
+
open_graph = metadata.get("open_graph", {})
|
|
163
|
+
twitter = metadata.get("twitter", {})
|
|
164
|
+
http = metadata.get("http", {})
|
|
165
|
+
|
|
166
|
+
print("Title:", document.get("title"))
|
|
167
|
+
print("Description:", document.get("description"))
|
|
168
|
+
print("Canonical URL:", document.get("canonical_url"))
|
|
169
|
+
print("OG Title:", open_graph.get("title"))
|
|
170
|
+
print("Twitter Card:", twitter.get("card"))
|
|
171
|
+
print("Final URL:", http.get("final_url"))
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
asyncio.run(main())
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Use Proxies
|
|
178
|
+
|
|
179
|
+
Pass a list of proxies. Crawlsmith will shuffle them, try up to three unique
|
|
180
|
+
entries, and return as soon as one succeeds with enough content.
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
import asyncio
|
|
184
|
+
|
|
185
|
+
from crawlsmith import CurlCffiScraper
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
async def main() -> None:
|
|
189
|
+
scraper = CurlCffiScraper(
|
|
190
|
+
proxies=[
|
|
191
|
+
"http://user:pass@proxy-1.example:8080",
|
|
192
|
+
"http://user:pass@proxy-2.example:8080",
|
|
193
|
+
"proxy-3.example:8080",
|
|
194
|
+
],
|
|
195
|
+
min_content_length=2000,
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
result = await scraper.fetch("https://example.com")
|
|
199
|
+
print(result.ok, result.via_proxy, result.proxy_url)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
asyncio.run(main())
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### Control Browser Impersonation
|
|
206
|
+
|
|
207
|
+
You can force a specific `curl_cffi` impersonation profile instead of using the
|
|
208
|
+
default randomized behavior.
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
import asyncio
|
|
212
|
+
|
|
213
|
+
from crawlsmith import CurlCffiScraper
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
async def main() -> None:
|
|
217
|
+
scraper = CurlCffiScraper(impersonate="chrome120")
|
|
218
|
+
result = await scraper.fetch("https://example.com")
|
|
219
|
+
print(result.status, result.error_type)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
asyncio.run(main())
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
### Configure TLS and Timeouts
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
import asyncio
|
|
229
|
+
|
|
230
|
+
from crawlsmith import CurlCffiScraper
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
async def main() -> None:
|
|
234
|
+
scraper = CurlCffiScraper(
|
|
235
|
+
verify=True,
|
|
236
|
+
connect_timeout=5,
|
|
237
|
+
read_timeout=20,
|
|
238
|
+
)
|
|
239
|
+
result = await scraper.fetch("https://example.com")
|
|
240
|
+
print(result.to_dict())
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
asyncio.run(main())
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
If you need to disable TLS certificate verification for a controlled internal
|
|
247
|
+
environment, set `verify=False`.
|
|
248
|
+
|
|
249
|
+
### Handle Errors Explicitly
|
|
250
|
+
|
|
251
|
+
Failures are returned as structured results instead of raising request errors in
|
|
252
|
+
normal operation.
|
|
253
|
+
|
|
254
|
+
```python
|
|
255
|
+
import asyncio
|
|
256
|
+
|
|
257
|
+
from crawlsmith import CurlCffiScraper
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
async def main() -> None:
|
|
261
|
+
scraper = CurlCffiScraper()
|
|
262
|
+
result = await scraper.fetch("https://example.com")
|
|
263
|
+
|
|
264
|
+
if result.ok:
|
|
265
|
+
print("Fetched successfully")
|
|
266
|
+
return
|
|
267
|
+
|
|
268
|
+
print("Error type:", result.error_type)
|
|
269
|
+
print("Error message:", result.error)
|
|
270
|
+
print("HTTP status:", result.status)
|
|
271
|
+
print("Blocked:", result.is_blocked)
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
asyncio.run(main())
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
Common error types include:
|
|
278
|
+
|
|
279
|
+
- `TIMEOUT`
|
|
280
|
+
- `CONNECTION`
|
|
281
|
+
- `SSL`
|
|
282
|
+
- `INVALID_URL`
|
|
283
|
+
- `BLOCKED`
|
|
284
|
+
- `HTTP_403`
|
|
285
|
+
- `HTTP_429`
|
|
286
|
+
- `HTTP_4XX`
|
|
287
|
+
- `HTTP_5XX`
|
|
288
|
+
- `UNKNOWN`
|
|
289
|
+
|
|
290
|
+
### Serialize Results
|
|
291
|
+
|
|
292
|
+
`FetchResult` can be converted directly into a plain dictionary for logging,
|
|
293
|
+
storage, or JSON serialization.
|
|
294
|
+
|
|
295
|
+
```python
|
|
296
|
+
import asyncio
|
|
297
|
+
import json
|
|
298
|
+
|
|
299
|
+
from crawlsmith import CurlCffiScraper
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
async def main() -> None:
|
|
303
|
+
scraper = CurlCffiScraper()
|
|
304
|
+
result = await scraper.fetch("https://example.com")
|
|
305
|
+
print(json.dumps(result.to_dict(), indent=2))
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
asyncio.run(main())
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
## CLI Usage
|
|
312
|
+
|
|
313
|
+
The package installs a `crawlsmith` command for quick fetches from the terminal.
|
|
314
|
+
|
|
315
|
+
### Basic CLI Request
|
|
316
|
+
|
|
317
|
+
```bash
|
|
318
|
+
crawlsmith https://example.com
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
The CLI prints a JSON-serialized `FetchResult` to stdout.
|
|
322
|
+
|
|
323
|
+
### Print the Response Body
|
|
324
|
+
|
|
325
|
+
```bash
|
|
326
|
+
crawlsmith https://example.com --print-content
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
### Use One or More Proxies
|
|
330
|
+
|
|
331
|
+
```bash
|
|
332
|
+
crawlsmith https://example.com \
|
|
333
|
+
--proxy http://user:pass@proxy-1.example:8080 \
|
|
334
|
+
--proxy http://user:pass@proxy-2.example:8080 \
|
|
335
|
+
--min-content-length 2000
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
### Force an Impersonation Profile
|
|
339
|
+
|
|
340
|
+
```bash
|
|
341
|
+
crawlsmith https://example.com --impersonate chrome120
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
### Change Timeout or Disable TLS Verification
|
|
345
|
+
|
|
346
|
+
```bash
|
|
347
|
+
crawlsmith https://example.com --timeout 20
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
```bash
|
|
351
|
+
crawlsmith https://example.com --insecure
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
### CLI Exit Codes
|
|
355
|
+
|
|
356
|
+
- `0` when the request succeeds
|
|
357
|
+
- `1` when the request fails
|
|
358
|
+
|
|
359
|
+
### CLI Help
|
|
360
|
+
|
|
361
|
+
```bash
|
|
362
|
+
crawlsmith --help
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
## Result Model
|
|
366
|
+
|
|
367
|
+
`FetchResult` exposes the following fields:
|
|
368
|
+
|
|
369
|
+
- `ok`: whether the request was considered successful
|
|
370
|
+
- `url`: requested URL
|
|
371
|
+
- `status`: HTTP status code when available
|
|
372
|
+
- `content`: raw response text when successful
|
|
373
|
+
- `markdown`: Markdown conversion of the response body when successful
|
|
374
|
+
- `metadata`: extracted document and HTTP metadata
|
|
375
|
+
- `error_type`: normalized error classification
|
|
376
|
+
- `error`: human-readable error summary
|
|
377
|
+
- `via_proxy`: whether the final attempt (successful or failed) used a proxy
|
|
378
|
+
- `proxy_url`: proxy used for the final attempt, if any
|
|
379
|
+
- `content_length`: UTF-8 byte length of the extracted text
|
|
380
|
+
- `is_blocked`: whether the response looks like an anti-bot interstitial
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
## Support & Connect
|
|
384
|
+
|
|
385
|
+
* ⭐ **Star the repo** if you found it useful
|
|
386
|
+
* ☕ **Support me:** Say thanks by buying me a coffee! [https://buymeacoffee.com/juanmcristobal](https://buymeacoffee.com/juanmcristobal)
|
|
387
|
+
* 💼 **Open to work:** [https://www.linkedin.com/in/jmcristobal/](https://www.linkedin.com/in/jmcristobal/)
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
# History
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
## 0.1.0 (2026-04-07)
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
* First release.
|