crawlsmith 0.1.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlsmith/__init__.py +9 -0
- crawlsmith/cli.py +69 -0
- crawlsmith/crawlsmith.py +667 -0
- crawlsmith-0.1.0.dist-info/METADATA +396 -0
- crawlsmith-0.1.0.dist-info/RECORD +9 -0
- crawlsmith-0.1.0.dist-info/WHEEL +6 -0
- crawlsmith-0.1.0.dist-info/entry_points.txt +2 -0
- crawlsmith-0.1.0.dist-info/licenses/AUTHORS.md +10 -0
- crawlsmith-0.1.0.dist-info/top_level.txt +1 -0
crawlsmith/__init__.py
ADDED
crawlsmith/cli.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Console script for crawlsmith."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
import click
|
|
10
|
+
|
|
11
|
+
from crawlsmith.crawlsmith import (DEFAULT_READ_TIMEOUT_SECONDS,
|
|
12
|
+
MIN_CONTENT_LENGTH, CurlCffiScraper)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@click.command()
@click.argument("url", required=False)
@click.option("--proxy", multiple=True, help="Proxy URL. Can be passed multiple times.")
@click.option("--impersonate", help="curl_cffi impersonation, e.g. chrome120")
@click.option(
    "--timeout",
    default=DEFAULT_READ_TIMEOUT_SECONDS,
    type=int,
    show_default=True,
    help="Read timeout in seconds",
)
@click.option(
    "--min-content-length",
    default=MIN_CONTENT_LENGTH,
    type=int,
    show_default=True,
    help="Minimum content length for proxy success",
)
@click.option("--insecure", is_flag=True, help="Disable TLS verification")
@click.option("--print-content", is_flag=True, help="Print the response body")
def main(
    url: str | None,
    proxy: tuple[str, ...],
    impersonate: str | None,
    timeout: int,
    min_content_length: int,
    insecure: bool,
    print_content: bool,
) -> int:
    """Fetch a URL using the library scraper.

    URL to fetch.
    """
    # NOTE: the docstring above is what click shows as the command help, so
    # its wording is part of the CLI's user-facing output.

    # Without a URL there is nothing to fetch: print usage and finish cleanly.
    if not url:
        help_text = click.get_current_context().get_help()
        click.echo(help_text)
        return 0

    # Map the CLI flags onto the scraper's constructor arguments.
    fetcher = CurlCffiScraper(
        proxies=list(proxy),
        impersonate=impersonate,
        verify=not insecure,
        min_content_length=min_content_length,
        read_timeout=timeout,
    )
    outcome = asyncio.run(fetcher.fetch(url))

    # Always emit the JSON summary; the body only on request and when present.
    summary = json.dumps(outcome.to_dict(), ensure_ascii=True)
    click.echo(summary)
    if print_content and outcome.content:
        click.echo(outcome.content)

    # Exit status 0 on a successful fetch, 1 otherwise.
    exit_code = 0 if outcome.ok else 1
    raise SystemExit(exit_code)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
if __name__ == "__main__":
    # Executed as a script: run the command and hand its result to the
    # interpreter as the process exit status.
    exit_status = main()  # pragma: no cover
    sys.exit(exit_status)  # pragma: no cover
|
crawlsmith/crawlsmith.py
ADDED
|
@@ -0,0 +1,667 @@
|
|
|
1
|
+
"""Public scraping library built on top of curl_cffi."""
|
|
2
|
+
|
|
3
|
+
# flake8: noqa: E501
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import gzip
|
|
9
|
+
import random
|
|
10
|
+
import re
|
|
11
|
+
import ssl
|
|
12
|
+
from dataclasses import asdict, dataclass
|
|
13
|
+
from html import unescape
|
|
14
|
+
from html.parser import HTMLParser
|
|
15
|
+
from typing import Any, Optional, cast
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
from curl_cffi import requests as curl_requests
|
|
19
|
+
from curl_cffi.requests import errors as curl_errors
|
|
20
|
+
except ImportError: # pragma: no cover - exercised only without dependency installed
|
|
21
|
+
curl_requests = None
|
|
22
|
+
curl_errors = None
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
from markdownify import markdownify as html_to_markdown
|
|
26
|
+
except ImportError: # pragma: no cover - exercised only without dependency installed
|
|
27
|
+
html_to_markdown = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Failure categories stored in ``FetchResult.error_type``.
# Transport-level failures:
ERROR_TYPE_TIMEOUT = "TIMEOUT"
ERROR_TYPE_CONNECTION = "CONNECTION"
ERROR_TYPE_SSL = "SSL"
ERROR_TYPE_INVALID_URL = "INVALID_URL"
# The response looked like an anti-bot challenge or block page:
ERROR_TYPE_BLOCKED = "BLOCKED"
# HTTP status families (see ``_classify_status``):
ERROR_TYPE_HTTP_403 = "HTTP_403"
ERROR_TYPE_HTTP_429 = "HTTP_429"
ERROR_TYPE_HTTP_4XX = "HTTP_4XX"
ERROR_TYPE_HTTP_5XX = "HTTP_5XX"
# Anything that fits none of the categories above:
ERROR_TYPE_UNKNOWN = "UNKNOWN"
# NOTE(review): presumably statuses for which another attempt is pointless;
# the consumer of this set is not visible in this part of the file.
NON_RETRYABLE_HTTP_STATUSES = {404, 410}

# Default timeouts in seconds, and the default minimum body size.
DEFAULT_CONNECT_TIMEOUT_SECONDS = 5
DEFAULT_READ_TIMEOUT_SECONDS = 15
MIN_CONTENT_LENGTH = 5000
|
|
45
|
+
|
|
46
|
+
# Pool of User-Agent strings sampled by ``generate_fingerprint``.
# Every entry is unique so that ``random.choice`` weights them equally.
DEFAULT_USER_AGENTS = [
    # Windows, macOS and Linux desktop User Agents
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.2520.81",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 OPR/109.0.0.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.5; rv:125.0) Gecko/20100101 Firefox/125.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5.1 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 OPR/109.0.0.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux i686; rv:125.0) Gecko/20100101 Firefox/125.0",
    # Android User Agents
    "Mozilla/5.0 (Linux; Android 14; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36,gzip(gfe)",
    "Mozilla/5.0 (Linux; Android 14; SM-S901B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-S901U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-S908B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-S908U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-G991U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-G998U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-A536B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-A536U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-A515F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-A515U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-G973U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Pixel 6a) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Pixel 6 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Pixel 7 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; moto g pure) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; moto g stylus 5G) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; moto g stylus 5G (2022)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; moto g 5G (2022)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; moto g power (2022)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Redmi Note 9 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Redmi Note 8 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; VOG-L29) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; MAR-LX1A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; M2101K6G) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; M2102J20SG) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; 2201116SG) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; DE2118) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Mobile Safari/537.36",
    # iPhone User Agents
    "Mozilla/5.0 (iPhone16,6; U; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/19E241 Safari/602.1",
    "Mozilla/5.0 (iPhone16,3; U; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/19A346 Safari/602.1",
    "Mozilla/5.0 (iPhone15,2; U; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/15E148 Safari/602.1",
    "Mozilla/5.0 (iPhone14,1; U; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/15E148 Safari/602.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/69.0.3497.105 Mobile/15E148 Safari/605.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/13.2b11866 Mobile/16A366 Safari/605.1.15",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A5370a Safari/604.1",
    "Mozilla/5.0 (iPhone9,3; U; CPU iPhone OS 12_0_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14A403 Safari/602.1",
    "Mozilla/5.0 (iPhone9,4; U; CPU iPhone OS 12_0_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14A403 Safari/602.1",
    "Mozilla/5.0 (Apple-iPhone7C2/1202.466; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1A543 Safari/419.3",
    # Windows Phone User Agents
    "Mozilla/5.0 (Windows Phone 10.0; Android 6.0.1; Microsoft; RM-1152) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Mobile Safari/537.36 Edge/15.15254",
    "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; RM-1127_16056) AppleWebKit/537.36(KHTML, like Gecko) Chrome/42.0.2311.135 Mobile Safari/537.36 Edge/12.10536",
    "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/13.1058",
    # Tablet User Agents
    "Mozilla/5.0 (Linux; Android 14; SM-X906C Build/QP1A.190711.020; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/80.0.3987.119 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 13; Lenovo YT-J706X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 9; Pixel C Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.98 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 8.1.0; SGP771 Build/32.2.A.0.253; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.98 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 8.1.0; SHIELD Tablet K1 Build/MRA58K; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 9; SM-T827R4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.116 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 7.0; SAMSUNG SM-T550 Build/LRX22G) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/3.3 Chrome/38.0.2125.102 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 4.4.3; KFTHWI Build/KTU84M) AppleWebKit/537.36 (KHTML, like Gecko) Silk/47.1.79 like Chrome/47.0.2526.80 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 7.0; LG-V410/V41020c Build/LRX22G) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/34.0.1847.118 Safari/537.36",
    # Older desktop User Agents
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
    "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
]
|
|
130
|
+
|
|
131
|
+
# curl_cffi exception classes are looked up by name with ``getattr`` so that a
# missing attribute (or ``curl_errors`` being ``None`` when curl_cffi is not
# installed) simply yields ``None`` instead of failing at import time.

# Timeout-like exception classes that actually exist on ``curl_errors``.
_CURL_TIMEOUT_TYPES = tuple(
    filter(
        None,
        (
            getattr(curl_errors, attr_name, None)
            for attr_name in (
                "Timeout",
                "TimeoutError",
                "ConnectTimeout",
                "ReadTimeout",
                "RequestsTimeout",
            )
        ),
    )
)
# Individual exception classes; each is ``None`` when unavailable.
_CURL_INVALID_URL = getattr(curl_errors, "InvalidURL", None)
_CURL_SSL_ERROR = getattr(curl_errors, "SSLError", None)
_CURL_PROXY_ERROR = getattr(curl_errors, "ProxyError", None)
_CURL_CONNECTION_ERROR = getattr(curl_errors, "ConnectionError", None)
# Generic request-error classes, under either spelling.
_CURL_REQUESTS_ERROR = tuple(
    filter(
        None,
        (
            getattr(curl_errors, attr_name, None)
            for attr_name in ("RequestsError", "RequestError")
        ),
    )
)
# Falls back to catching every ``Exception`` when no curl class was found.
_CURL_REQUESTS_ERROR_TYPES = (
    _CURL_REQUESTS_ERROR if _CURL_REQUESTS_ERROR else (Exception,)
)
# The request errors above plus the stdlib timeout and TLS errors.
_CURL_REQUESTS_AND_TIMEOUT_TYPES = (
    *_CURL_REQUESTS_ERROR_TYPES,
    asyncio.TimeoutError,
    ssl.SSLError,
)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
@dataclass(frozen=True)
class FetchResult:
    """Immutable record describing the outcome of one fetch."""

    # Whether the fetch is considered successful.
    ok: bool
    # The URL that was requested.
    url: str
    # HTTP status code, when a response was received.
    status: int | None = None
    # Response body text and its Markdown rendering.
    content: str | None = None
    markdown: str | None = None
    # Metadata extracted from the document and the HTTP response.
    metadata: dict[str, Any] | None = None
    # One of the ``ERROR_TYPE_*`` values plus a human-readable message.
    error_type: str | None = None
    error: str | None = None
    # Proxy usage for the attempt that produced this result.
    via_proxy: bool = False
    proxy_url: str | None = None
    # Size of the response text, in bytes.
    content_length: int = 0
    # True when the response was classified as an anti-bot block page.
    is_blocked: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dictionary (nested values are copied)."""
        snapshot = asdict(self)
        return snapshot
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
class BlockDetector:
    """Heuristics for spotting anti-bot challenge and block pages.

    Patterns are grouped by how much they prove: a strong marker is enough on
    its own, while the soft, JS-challenge and suspicious markers only count
    when the page also looks like a thin interstitial rather than real
    content.
    """

    # Markers that are conclusive by themselves.
    STRONG_PATTERNS = [
        re.compile(r"Attention Required!\s*\|\s*Cloudflare", re.I),
        re.compile(r"Checking your browser", re.I),
        re.compile(r"Just a moment\.\.\.", re.I),
        re.compile(r"cf-chl-", re.I),
        re.compile(r"access\.denied\.\.DDoS\.Guard", re.I),
        re.compile(r"px-captcha", re.I),
        re.compile(r"distil_r_captcha", re.I),
        re.compile(r"distilnetworks", re.I),
        re.compile(r"whoa there, pardner!", re.I),
        re.compile(r"blocked due to a network policy", re.I),
    ]
    # CAPTCHA widgets that also appear on ordinary pages (e.g. login forms).
    SOFT_PATTERNS = [
        re.compile(r"cf_challenge", re.I),
        re.compile(r"cf-turnstile", re.I),
        re.compile(r"g-recaptcha", re.I),
        re.compile(r"www\.google\.com/recaptcha", re.I),
        re.compile(r"recaptcha/api\.js", re.I),
        re.compile(r"hcaptcha\.com/1/api\.js", re.I),
        re.compile(r"hcaptcha", re.I),
    ]
    # Script/form fragments typical of JavaScript challenges.
    JS_CHALLENGE_PATTERNS = [
        re.compile(r"window\.location\.href\s*=.*challenge", re.I),
        re.compile(r"document\.cookie\s*=.*challenge", re.I),
        re.compile(r"challenge-platform", re.I),
        re.compile(r"challenge-form", re.I),
        re.compile(r"jschal-answer", re.I),
        re.compile(r"captcha-solution", re.I),
        re.compile(r"verification-token", re.I),
    ]
    # Generic "access denied" wording.
    SUSPICIOUS_PATTERNS = [
        re.compile(r"<title>Access Denied", re.I),
        re.compile(r"<title>403 Forbidden", re.I),
        re.compile(r"<title>429 Too Many Requests", re.I),
        re.compile(r"<title>You don't have permission", re.I),
        re.compile(r"please verify you are human", re.I),
        re.compile(r"please complete the security check", re.I),
    ]

    # Lower-cased openings of XML, feed and sitemap documents; such payloads
    # are never classified as block pages.
    _XML_PREFIXES = (
        "<?xml",
        "<rss",
        "<feed",
        "<rdf:rdf",
        "<sitemapindex",
        "<urlset",
    )

    @classmethod
    def is_blocked(cls, response_text: str, status_code: Optional[int] = None) -> bool:
        """Return True when the response looks like an anti-bot block.

        Args:
            response_text: Decoded response body; may be empty.
            status_code: HTTP status of the response, if known.

        Returns:
            True for status 403/429, for any strong marker, or for a weaker
            marker on a page that looks like an interstitial; False otherwise
            (including statuses 520-527 and XML-like payloads).
        """
        # 403/429 count as blocked regardless of the body.
        if status_code in (403, 429):
            return True
        if not response_text:
            return False

        # Statuses 520-527 are reported as not blocked, whatever the body says.
        if status_code in (520, 521, 522, 523, 524, 525, 526, 527):
            return False

        head = response_text.lstrip()[:500].lower()
        if head.startswith(cls._XML_PREFIXES):
            return False

        if any(pattern.search(response_text) for pattern in cls.STRONG_PATTERNS):
            return True

        weak_patterns = (
            cls.SOFT_PATTERNS + cls.JS_CHALLENGE_PATTERNS + cls.SUSPICIOUS_PATTERNS
        )
        if not any(pattern.search(response_text) for pattern in weak_patterns):
            return False

        # A weak marker is present: decide by how much visible text the page
        # carries once tags are removed. Computed only here because stripping
        # tags from a large document is the expensive part.
        text_only = re.sub(r"<[^>]+>", " ", response_text)
        text_only = re.sub(r"\s+", " ", text_only).strip()
        text_len = len(text_only)
        text_ratio = text_len / max(len(response_text), 1)
        looks_like_real_content = (text_len >= 1500) or (
            (text_len >= 800) and (text_ratio >= 0.01)
        )
        looks_like_interstitial = (text_len < 600) or (text_ratio < 0.008)
        return looks_like_interstitial and not looks_like_real_content

    @classmethod
    def get_block_reason(cls, response_text: str) -> str | None:
        """Return the source of the first block pattern found in the text.

        All four pattern families are scanned, strongest first, so that every
        marker ``is_blocked`` can act on also yields a reason.

        Returns:
            The matching regular expression's source string, or None when the
            text is empty or nothing matches.
        """
        if not response_text:
            return None
        candidates = (
            cls.STRONG_PATTERNS
            + cls.SOFT_PATTERNS
            + cls.JS_CHALLENGE_PATTERNS
            + cls.SUSPICIOUS_PATTERNS
        )
        for pattern in candidates:
            if pattern.search(response_text):
                return pattern.pattern
        return None
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _classify_status(status: int) -> str:
    """Map an HTTP status code to one of the ``ERROR_TYPE_*`` labels.

    403 and 429 get dedicated labels; the rest of the 4xx and the 5xx ranges
    get family labels; anything else is reported as unknown.
    """
    dedicated = {403: ERROR_TYPE_HTTP_403, 429: ERROR_TYPE_HTTP_429}
    if status in dedicated:
        return dedicated[status]

    families = (
        (400, 499, ERROR_TYPE_HTTP_4XX),
        (500, 599, ERROR_TYPE_HTTP_5XX),
    )
    for lowest, highest, label in families:
        if lowest <= status <= highest:
            return label
    return ERROR_TYPE_UNKNOWN
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _header_map(response: Any) -> dict[str, str]:
|
|
289
|
+
headers = getattr(response, "headers", {}) or {}
|
|
290
|
+
return {str(k).lower(): str(v) for k, v in dict(headers).items()}
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def _is_gzip_payload(url: str, headers: dict[str, str], body: bytes) -> bool:
|
|
294
|
+
if body.startswith(b"\x1f\x8b"):
|
|
295
|
+
return True
|
|
296
|
+
if "gzip" in headers.get("content-type", "").lower():
|
|
297
|
+
return True
|
|
298
|
+
if ".gz" in headers.get("content-disposition", "").lower():
|
|
299
|
+
return True
|
|
300
|
+
if url.lower().endswith(".gz"):
|
|
301
|
+
return True
|
|
302
|
+
return False
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def _extract_response_text(response: Any, url: str) -> str:
|
|
306
|
+
body = getattr(response, "content", None)
|
|
307
|
+
if isinstance(body, bytearray):
|
|
308
|
+
body = bytes(body)
|
|
309
|
+
|
|
310
|
+
if isinstance(body, bytes) and body:
|
|
311
|
+
headers = _header_map(response)
|
|
312
|
+
if _is_gzip_payload(url, headers, body):
|
|
313
|
+
try:
|
|
314
|
+
return gzip.decompress(body).decode("utf-8", errors="replace")
|
|
315
|
+
except OSError:
|
|
316
|
+
pass
|
|
317
|
+
|
|
318
|
+
text = getattr(response, "text", None)
|
|
319
|
+
if isinstance(text, str):
|
|
320
|
+
return text
|
|
321
|
+
|
|
322
|
+
if isinstance(body, bytes):
|
|
323
|
+
return body.decode("utf-8", errors="replace")
|
|
324
|
+
|
|
325
|
+
return ""
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def _looks_like_xml_document(content: str) -> bool:
|
|
329
|
+
head = content.lstrip()[:500].lower()
|
|
330
|
+
return (
|
|
331
|
+
head.startswith("<?xml")
|
|
332
|
+
or head.startswith("<rss")
|
|
333
|
+
or head.startswith("<feed")
|
|
334
|
+
or head.startswith("<rdf:rdf")
|
|
335
|
+
or head.startswith("<sitemapindex")
|
|
336
|
+
or head.startswith("<urlset")
|
|
337
|
+
)
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def _convert_html_to_markdown(content: str) -> str:
    """Render HTML as Markdown text.

    markdownify is used when it is installed and the content does not look
    like an XML document. Otherwise a small regex-based fallback converts
    h1/h2, bold, italic, paragraph and line-break tags, removes every other
    tag, decodes HTML entities and tidies blank lines.
    """
    use_markdownify = html_to_markdown is not None and not _looks_like_xml_document(
        content
    )
    if use_markdownify:
        return html_to_markdown(content, heading_style="ATX").strip()

    # Fallback: applied in this order, case-insensitively, across newlines.
    tag_rules = (
        (r"<h1[^>]*>(.*?)</h1>", r"# \1\n\n"),
        (r"<h2[^>]*>(.*?)</h2>", r"## \1\n\n"),
        (r"<strong[^>]*>(.*?)</strong>", r"**\1**"),
        (r"<b[^>]*>(.*?)</b>", r"**\1**"),
        (r"<em[^>]*>(.*?)</em>", r"*\1*"),
        (r"<i[^>]*>(.*?)</i>", r"*\1*"),
        (r"<p[^>]*>(.*?)</p>", r"\1\n\n"),
        (r"<br\s*/?>", "\n"),
    )
    rule_flags = re.I | re.S
    markdown = content
    for tag_pattern, template in tag_rules:
        markdown = re.compile(tag_pattern, rule_flags).sub(template, markdown)

    # Drop whatever markup is left, then decode entities.
    markdown = unescape(re.sub(r"<[^>]+>", "", markdown))
    # Collapse runs of blank lines and strip trailing spaces on each line.
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)
    markdown = re.sub(r"[ \t]+\n", "\n", markdown)
    return markdown.strip()
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
class _MetadataHTMLParser(HTMLParser):
|
|
365
|
+
def __init__(self) -> None:
|
|
366
|
+
super().__init__()
|
|
367
|
+
self.document: dict[str, Any] = {}
|
|
368
|
+
self.open_graph: dict[str, Any] = {}
|
|
369
|
+
self.twitter: dict[str, Any] = {}
|
|
370
|
+
self._in_title = False
|
|
371
|
+
self._title_parts: list[str] = []
|
|
372
|
+
|
|
373
|
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
|
374
|
+
attr_map = {key.lower(): (value or "") for key, value in attrs}
|
|
375
|
+
|
|
376
|
+
if tag == "html" and attr_map.get("lang"):
|
|
377
|
+
self.document["lang"] = attr_map["lang"]
|
|
378
|
+
|
|
379
|
+
if tag == "title":
|
|
380
|
+
self._in_title = True
|
|
381
|
+
|
|
382
|
+
if tag == "meta":
|
|
383
|
+
name = attr_map.get("name", "").strip().lower()
|
|
384
|
+
prop = attr_map.get("property", "").strip().lower()
|
|
385
|
+
content = attr_map.get("content", "").strip()
|
|
386
|
+
if not content:
|
|
387
|
+
return
|
|
388
|
+
if name in {
|
|
389
|
+
"description",
|
|
390
|
+
"author",
|
|
391
|
+
"keywords",
|
|
392
|
+
"published_time",
|
|
393
|
+
"modified_time",
|
|
394
|
+
}:
|
|
395
|
+
self.document[name] = content
|
|
396
|
+
elif name.startswith("twitter:"):
|
|
397
|
+
self.twitter[name.removeprefix("twitter:")] = content
|
|
398
|
+
elif prop.startswith("og:"):
|
|
399
|
+
self.open_graph[prop.removeprefix("og:")] = content
|
|
400
|
+
|
|
401
|
+
if tag == "link":
|
|
402
|
+
rel = attr_map.get("rel", "").strip().lower()
|
|
403
|
+
href = attr_map.get("href", "").strip()
|
|
404
|
+
if rel == "canonical" and href:
|
|
405
|
+
self.document["canonical_url"] = href
|
|
406
|
+
|
|
407
|
+
def handle_endtag(self, tag: str) -> None:
|
|
408
|
+
if tag == "title":
|
|
409
|
+
self._in_title = False
|
|
410
|
+
title = "".join(self._title_parts).strip()
|
|
411
|
+
if title:
|
|
412
|
+
self.document["title"] = unescape(title)
|
|
413
|
+
self._title_parts.clear()
|
|
414
|
+
|
|
415
|
+
def handle_data(self, data: str) -> None:
|
|
416
|
+
if self._in_title:
|
|
417
|
+
self._title_parts.append(data)
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def _extract_metadata(
    response: Any,
    html: str,
    *,
    requested_url: str,
    final_url: str | None,
    status: int | None,
) -> dict[str, Any]:
    """Build the metadata dictionary attached to a fetch result.

    Args:
        response: Response object whose headers are reported.
        html: Response body to scan for document metadata.
        requested_url: URL the caller asked for.
        final_url: URL reported by the response, if any; the requested URL is
            used when it is missing.
        status: HTTP status code, if known.

    Returns:
        A dict with ``document``, ``open_graph`` and ``twitter`` sections from
        the HTML, plus an ``http`` section describing the response.
    """
    meta_parser = _MetadataHTMLParser()
    try:
        meta_parser.feed(html)
    except Exception:
        # Best effort: keep whatever was collected before parsing failed.
        pass

    response_headers = _header_map(response)
    http_section = {
        "status": status,
        "requested_url": requested_url,
        "final_url": final_url or requested_url,
        "content_type": response_headers.get("content-type"),
        "content_encoding": response_headers.get("content-encoding"),
        "headers": response_headers,
    }
    return {
        "document": meta_parser.document,
        "open_graph": meta_parser.open_graph,
        "twitter": meta_parser.twitter,
        "http": http_section,
    }
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
def _classify_exception(exc: BaseException) -> tuple[str, str]:
    """Translate an exception into an ``(error_type, message)`` pair.

    Rules are tried in order and the first match wins: timeout, invalid URL,
    TLS/SSL, proxy, connection, then generic curl request errors. Anything
    else is reported as unknown.
    """
    # Each rule: (stdlib classes, curl_cffi classes, error type, message).
    # The curl_cffi entry may be None or an empty tuple when the class is not
    # available, so it is only passed to isinstance() when truthy.
    rules = (
        ((asyncio.TimeoutError,), _CURL_TIMEOUT_TYPES, ERROR_TYPE_TIMEOUT, "Request timed out"),
        ((), _CURL_INVALID_URL, ERROR_TYPE_INVALID_URL, "Invalid URL"),
        ((ssl.SSLError,), _CURL_SSL_ERROR, ERROR_TYPE_SSL, "TLS/SSL error"),
        ((), _CURL_PROXY_ERROR, ERROR_TYPE_CONNECTION, "Proxy connection failed"),
        ((), _CURL_CONNECTION_ERROR, ERROR_TYPE_CONNECTION, "Connection failed"),
        ((), _CURL_REQUESTS_ERROR, ERROR_TYPE_CONNECTION, "HTTP client error"),
    )
    for stdlib_kinds, curl_kinds, error_type, message in rules:
        if isinstance(exc, stdlib_kinds) or (curl_kinds and isinstance(exc, curl_kinds)):
            return error_type, message
    return ERROR_TYPE_UNKNOWN, "Unknown error"
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
def generate_fingerprint() -> dict[str, Any]:
    """Build a randomized set of browser-like request headers.

    Returns:
        A dict with a ``headers`` mapping and an ``http2`` flag set to True.
        Each randomized header value is drawn independently of the others.
    """
    # The random draws happen in the same order as the headers are listed.
    user_agent = random.choice(DEFAULT_USER_AGENTS)
    language = random.choice(["en", "es", "fr"])
    region = random.choice(["US", "ES", "CA"])
    quality_digit = random.randint(5, 9)
    chromium_major = random.randint(120, 124)
    platform = random.choice(['"Windows"', '"macOS"', '"Linux"'])
    max_age = random.randint(0, 3600)

    headers = {
        "User-Agent": user_agent,
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": f"{language}-{region};q=0.{quality_digit}",
        "Sec-Ch-Ua": f'"Chromium";v="{chromium_major}", "Not.A/Brand";v="24"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": platform,
        "DNT": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": f"max-age={max_age}",
    }
    return {"headers": headers, "http2": True}
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
class StealthRequest:
    """Fetch a URL with ``curl_cffi`` using browser impersonation and optional proxies.

    One impersonation profile is chosen when the instance is created and is
    reused for every request made through it. When proxies are configured,
    ``fetch`` tries a shuffled, de-duplicated subset of them; otherwise it
    performs a single direct request.
    """

    BROWSER_IMPERSONATIONS = [
        "chrome120",
        "chrome119",
        "chrome116",
        "safari15_5",
        "safari15_3",
    ]

    # Upper bound on how many distinct proxies a single fetch() call will try.
    _MAX_PROXY_ATTEMPTS = 3

    def __init__(
        self,
        proxies: Optional[list[str]] = None,
        verify: bool = True,
        impersonate: Optional[str] = None,
        min_content_length: int = MIN_CONTENT_LENGTH,
        connect_timeout: int = DEFAULT_CONNECT_TIMEOUT_SECONDS,
        read_timeout: int = DEFAULT_READ_TIMEOUT_SECONDS,
    ) -> None:
        """Store the request configuration.

        Args:
            proxies: Proxy URLs to rotate through. Entries without a scheme
                are treated as HTTP proxies. ``None`` or an empty list means
                requests go out directly.
            verify: Passed to ``curl_cffi`` as its ``verify`` argument.
            impersonate: ``curl_cffi`` impersonation profile. When falsy, one
                of ``BROWSER_IMPERSONATIONS`` is picked at random.
            min_content_length: Minimum UTF-8 byte length a proxied response
                must reach for the proxy loop to stop early on it.
            connect_timeout: Connect timeout, first element of the timeout
                tuple handed to ``curl_cffi``.
            read_timeout: Read timeout, second element of that tuple.
        """
        # Copy so later mutation of the caller's list cannot affect us.
        self.proxies = list(proxies or [])
        self.verify = verify
        self.impersonate = impersonate or random.choice(self.BROWSER_IMPERSONATIONS)
        self.min_content_length = min_content_length
        self.connect_timeout = connect_timeout
        self.read_timeout = read_timeout

    @staticmethod
    def _normalize_proxy(proxy_url: Optional[str]) -> Optional[str]:
        """Return ``proxy_url`` with an explicit scheme, or ``None`` if it is empty.

        Only entries that carry no scheme at all get the ``http://`` default.
        Checking for ``"://"`` (rather than a leading ``"http"``) keeps
        ``socks5://...`` and upper-case schemes intact and still prefixes bare
        hosts whose name happens to start with "http".
        """
        if not proxy_url:
            return None
        if "://" in proxy_url:
            return proxy_url
        return f"http://{proxy_url}"

    async def fetch(self, url: str) -> FetchResult:
        """Fetch ``url`` and describe the outcome as a ``FetchResult``.

        Failures matching ``_CURL_REQUESTS_AND_TIMEOUT_TYPES`` are converted
        into a result with ``ok=False`` instead of propagating. A response is
        reported as ``ok=True`` only when it is not flagged by
        ``BlockDetector`` and its status is exactly 200.

        With proxies configured, up to ``_MAX_PROXY_ATTEMPTS`` unique proxies
        are tried in random order. The loop stops at the first result whose
        status is in ``NON_RETRYABLE_HTTP_STATUSES`` or that succeeded with at
        least ``min_content_length`` bytes; otherwise the last attempt's
        result is returned.

        Args:
            url: The URL to request.

        Returns:
            The result of the decisive attempt.
        """
        if curl_requests is None:
            return FetchResult(
                ok=False,
                url=url,
                error_type=ERROR_TYPE_CONNECTION,
                error="curl_cffi is not installed",
            )

        # One header set per fetch() call, shared by every proxy attempt.
        headers = generate_fingerprint()["headers"]
        last_result: FetchResult | None = None

        async def _attempt(proxy_url: str | None) -> FetchResult:
            """Perform one request, optionally through ``proxy_url``."""
            proxy = self._normalize_proxy(proxy_url)

            try:
                proxy_dict = cast(
                    Any, {"http": proxy, "https": proxy} if proxy else None
                )
                # The curl_cffi call is blocking; run it in a worker thread so
                # the event loop stays responsive.
                response = await asyncio.to_thread(
                    curl_requests.get,
                    url,
                    headers=headers,
                    proxies=proxy_dict,
                    timeout=(self.connect_timeout, self.read_timeout),
                    verify=self.verify,
                    allow_redirects=True,
                    impersonate=cast(Any, self.impersonate),
                )
                status = response.status_code
                text = _extract_response_text(response, url)
                final_url = cast(str | None, getattr(response, "url", None))
                # Size is measured on the extracted text, in UTF-8 bytes.
                content_length = len(text.encode("utf-8"))
                is_blocked = BlockDetector.is_blocked(text, status)
                metadata = _extract_metadata(
                    response,
                    text,
                    requested_url=url,
                    final_url=final_url,
                    status=status,
                )

                # Block detection takes precedence over the status code, so a
                # 200 challenge page is still reported as a failure.
                if is_blocked:
                    return FetchResult(
                        ok=False,
                        url=url,
                        status=status,
                        error_type=ERROR_TYPE_BLOCKED,
                        error=BlockDetector.get_block_reason(text)
                        or "Anti-bot challenge detected",
                        metadata=metadata,
                        via_proxy=proxy is not None,
                        proxy_url=proxy_url,
                        content_length=content_length,
                        is_blocked=True,
                    )

                if status == 200:
                    return FetchResult(
                        ok=True,
                        url=url,
                        status=status,
                        content=text,
                        markdown=_convert_html_to_markdown(text),
                        metadata=metadata,
                        via_proxy=proxy is not None,
                        proxy_url=proxy_url,
                        content_length=content_length,
                        is_blocked=False,
                    )

                return FetchResult(
                    ok=False,
                    url=url,
                    status=status,
                    error_type=_classify_status(status),
                    error=f"HTTP {status}",
                    metadata=metadata,
                    via_proxy=proxy is not None,
                    proxy_url=proxy_url,
                    content_length=content_length,
                    is_blocked=False,
                )
            except _CURL_REQUESTS_AND_TIMEOUT_TYPES as exc:
                error_type, message = _classify_exception(exc)
                return FetchResult(
                    ok=False,
                    url=url,
                    status=None,
                    error_type=error_type,
                    error=message,
                    via_proxy=proxy is not None,
                    proxy_url=proxy_url,
                    is_blocked=False,
                )

        if self.proxies:
            # dict.fromkeys() removes duplicates while keeping first-seen order.
            to_try = list(dict.fromkeys(self.proxies))
            random.shuffle(to_try)
            for proxy_url in to_try[: self._MAX_PROXY_ATTEMPTS]:
                result = await _attempt(proxy_url)
                last_result = result
                # Another proxy will not help for these statuses.
                if result.status in NON_RETRYABLE_HTTP_STATUSES:
                    return result
                if (
                    result.ok
                    and not result.is_blocked
                    and result.content_length >= self.min_content_length
                ):
                    return result
            return last_result or FetchResult(
                ok=False,
                url=url,
                error_type=ERROR_TYPE_CONNECTION,
                error="All proxy attempts failed",
                via_proxy=True,
                proxy_url=to_try[0] if to_try else None,
            )

        return await _attempt(None)
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
class CurlCffiScraper:
    """Public scraper facade that delegates every fetch to a ``StealthRequest``."""

    def __init__(
        self,
        proxies: Optional[list[str]] = None,
        impersonate: Optional[str] = None,
        verify: bool = True,
        min_content_length: int = MIN_CONTENT_LENGTH,
        connect_timeout: int = DEFAULT_CONNECT_TIMEOUT_SECONDS,
        read_timeout: int = DEFAULT_READ_TIMEOUT_SECONDS,
    ) -> None:
        """Build the underlying ``StealthRequest`` from the given options.

        Every argument is forwarded unchanged, by keyword, to
        ``StealthRequest``; this class keeps no other state.
        """
        settings = {
            "proxies": proxies,
            "verify": verify,
            "impersonate": impersonate,
            "min_content_length": min_content_length,
            "connect_timeout": connect_timeout,
            "read_timeout": read_timeout,
        }
        self.stealth_request = StealthRequest(**settings)

    async def fetch(self, url: str) -> FetchResult:
        """Fetch ``url`` via the wrapped ``StealthRequest`` and return its result."""
        outcome = await self.stealth_request.fetch(url)
        return outcome
|
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: crawlsmith
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Crawlsmith helps you craft reliable web crawlers in Python, combining page fetching, HTML parsing, link discovery, and content extraction into a simple and extensible toolkit.
|
|
5
|
+
Home-page: https://github.com/juanmcristobal/crawlsmith
|
|
6
|
+
Author: Juan Manuel Cristóbal Moreno
|
|
7
|
+
Author-email: juanmcristobal@gmail.com
|
|
8
|
+
Keywords: crawlsmith
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Natural Language :: English
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: AUTHORS.md
|
|
19
|
+
Requires-Dist: Click==8.1.7
|
|
20
|
+
Requires-Dist: curl_cffi>=0.7.0
|
|
21
|
+
Requires-Dist: markdownify>=0.13.1
|
|
22
|
+
Dynamic: author
|
|
23
|
+
Dynamic: author-email
|
|
24
|
+
Dynamic: classifier
|
|
25
|
+
Dynamic: description
|
|
26
|
+
Dynamic: description-content-type
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: keywords
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
Dynamic: requires-dist
|
|
31
|
+
Dynamic: requires-python
|
|
32
|
+
Dynamic: summary
|
|
33
|
+
|
|
34
|
+

|
|
35
|
+
|
|
36
|
+
# CrawlSmith
|
|
37
|
+
|
|
38
|
+
Crawlsmith is a Python scraping toolkit for fetching web pages with
|
|
39
|
+
`curl_cffi`, extracting readable content, detecting common anti-bot
|
|
40
|
+
interstitials, and returning structured metadata in a single result object.
|
|
41
|
+
|
|
42
|
+
It is designed for Python developers who want a small, pragmatic interface for:
|
|
43
|
+
|
|
44
|
+
- fetching HTML or XML content
|
|
45
|
+
- converting HTML to Markdown
|
|
46
|
+
- rotating browser impersonation profiles
|
|
47
|
+
- trying multiple proxies
|
|
48
|
+
- classifying HTTP and network failures
|
|
49
|
+
- extracting document, Open Graph, Twitter, and HTTP metadata
|
|
50
|
+
|
|
51
|
+
## Features
|
|
52
|
+
|
|
53
|
+
- Async-first Python API built around `CurlCffiScraper`
|
|
54
|
+
- Structured `FetchResult` object with success state, content, Markdown, and metadata
|
|
55
|
+
- Automatic browser fingerprint headers and `curl_cffi` impersonation support
|
|
56
|
+
- Proxy rotation with early success and retry limits
|
|
57
|
+
- Detection of common anti-bot challenge pages such as Cloudflare-style interstitials
|
|
58
|
+
- Gzip payload handling for compressed responses and feeds
|
|
59
|
+
- Built-in CLI for quick fetch, inspection, and debugging
|
|
60
|
+
|
|
61
|
+
## Installation
|
|
62
|
+
|
|
63
|
+
Install from PyPI:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install crawlsmith
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Requirements:
|
|
70
|
+
|
|
71
|
+
- Python 3.10+
|
|
72
|
+
|
|
73
|
+
## Quick Start
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import asyncio
|
|
77
|
+
|
|
78
|
+
from crawlsmith import CurlCffiScraper
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
async def main() -> None:
|
|
82
|
+
scraper = CurlCffiScraper()
|
|
83
|
+
result = await scraper.fetch("https://example.com")
|
|
84
|
+
|
|
85
|
+
if result.ok:
|
|
86
|
+
print(result.status)
|
|
87
|
+
print(result.content[:200])
|
|
88
|
+
print(result.markdown[:200])
|
|
89
|
+
else:
|
|
90
|
+
print(result.error_type, result.error)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
asyncio.run(main())
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Python Usage
|
|
97
|
+
|
|
98
|
+
### Basic Fetch
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
import asyncio
|
|
102
|
+
|
|
103
|
+
from crawlsmith import CurlCffiScraper
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
async def main() -> None:
|
|
107
|
+
scraper = CurlCffiScraper()
|
|
108
|
+
result = await scraper.fetch("https://example.com")
|
|
109
|
+
|
|
110
|
+
if not result.ok:
|
|
111
|
+
raise RuntimeError(f"{result.error_type}: {result.error}")
|
|
112
|
+
|
|
113
|
+
print("Status:", result.status)
|
|
114
|
+
print("URL:", result.url)
|
|
115
|
+
print("Content length:", result.content_length)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
asyncio.run(main())
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Read HTML and Markdown
|
|
122
|
+
|
|
123
|
+
When a request succeeds with HTTP `200`, Crawlsmith returns both the raw response
|
|
124
|
+
body and a Markdown representation.
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
import asyncio
|
|
128
|
+
|
|
129
|
+
from crawlsmith import CurlCffiScraper
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
async def main() -> None:
|
|
133
|
+
scraper = CurlCffiScraper()
|
|
134
|
+
result = await scraper.fetch("https://example.com")
|
|
135
|
+
|
|
136
|
+
if result.ok:
|
|
137
|
+
html = result.content
|
|
138
|
+
markdown = result.markdown
|
|
139
|
+
print(html[:300])
|
|
140
|
+
print(markdown[:300])
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
asyncio.run(main())
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### Access Structured Metadata
|
|
147
|
+
|
|
148
|
+
Each result includes metadata extracted from the response body and headers.
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
import asyncio
|
|
152
|
+
|
|
153
|
+
from crawlsmith import CurlCffiScraper
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
async def main() -> None:
|
|
157
|
+
scraper = CurlCffiScraper()
|
|
158
|
+
result = await scraper.fetch("https://example.com")
|
|
159
|
+
|
|
160
|
+
metadata = result.metadata or {}
|
|
161
|
+
document = metadata.get("document", {})
|
|
162
|
+
open_graph = metadata.get("open_graph", {})
|
|
163
|
+
twitter = metadata.get("twitter", {})
|
|
164
|
+
http = metadata.get("http", {})
|
|
165
|
+
|
|
166
|
+
print("Title:", document.get("title"))
|
|
167
|
+
print("Description:", document.get("description"))
|
|
168
|
+
print("Canonical URL:", document.get("canonical_url"))
|
|
169
|
+
print("OG Title:", open_graph.get("title"))
|
|
170
|
+
print("Twitter Card:", twitter.get("card"))
|
|
171
|
+
print("Final URL:", http.get("final_url"))
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
asyncio.run(main())
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Use Proxies
|
|
178
|
+
|
|
179
|
+
Pass a list of proxies. Crawlsmith will shuffle them, try up to three unique
|
|
180
|
+
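entries, and return as soon as one succeeds with enough content or responds with a non-retryable HTTP status. If no attempt qualifies, the result of the last attempt is returned.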
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
import asyncio
|
|
184
|
+
|
|
185
|
+
from crawlsmith import CurlCffiScraper
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
async def main() -> None:
|
|
189
|
+
scraper = CurlCffiScraper(
|
|
190
|
+
proxies=[
|
|
191
|
+
"http://user:pass@proxy-1.example:8080",
|
|
192
|
+
"http://user:pass@proxy-2.example:8080",
|
|
193
|
+
"proxy-3.example:8080",
|
|
194
|
+
],
|
|
195
|
+
min_content_length=2000,
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
result = await scraper.fetch("https://example.com")
|
|
199
|
+
print(result.ok, result.via_proxy, result.proxy_url)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
asyncio.run(main())
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### Control Browser Impersonation
|
|
206
|
+
|
|
207
|
+
You can force a specific `curl_cffi` impersonation profile instead of using the
|
|
208
|
+
default randomized behavior.
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
import asyncio
|
|
212
|
+
|
|
213
|
+
from crawlsmith import CurlCffiScraper
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
async def main() -> None:
|
|
217
|
+
scraper = CurlCffiScraper(impersonate="chrome120")
|
|
218
|
+
result = await scraper.fetch("https://example.com")
|
|
219
|
+
print(result.status, result.error_type)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
asyncio.run(main())
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
### Configure TLS and Timeouts
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
import asyncio
|
|
229
|
+
|
|
230
|
+
from crawlsmith import CurlCffiScraper
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
async def main() -> None:
|
|
234
|
+
scraper = CurlCffiScraper(
|
|
235
|
+
verify=True,
|
|
236
|
+
connect_timeout=5,
|
|
237
|
+
read_timeout=20,
|
|
238
|
+
)
|
|
239
|
+
result = await scraper.fetch("https://example.com")
|
|
240
|
+
print(result.to_dict())
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
asyncio.run(main())
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
If you need to disable TLS certificate verification for a controlled internal
|
|
247
|
+
environment, set `verify=False`.
|
|
248
|
+
|
|
249
|
+
### Handle Errors Explicitly
|
|
250
|
+
|
|
251
|
+
Failures are returned as structured results instead of raising request errors in
|
|
252
|
+
normal operation.
|
|
253
|
+
|
|
254
|
+
```python
|
|
255
|
+
import asyncio
|
|
256
|
+
|
|
257
|
+
from crawlsmith import CurlCffiScraper
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
async def main() -> None:
|
|
261
|
+
scraper = CurlCffiScraper()
|
|
262
|
+
result = await scraper.fetch("https://example.com")
|
|
263
|
+
|
|
264
|
+
if result.ok:
|
|
265
|
+
print("Fetched successfully")
|
|
266
|
+
return
|
|
267
|
+
|
|
268
|
+
print("Error type:", result.error_type)
|
|
269
|
+
print("Error message:", result.error)
|
|
270
|
+
print("HTTP status:", result.status)
|
|
271
|
+
print("Blocked:", result.is_blocked)
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
asyncio.run(main())
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
Common error types include:
|
|
278
|
+
|
|
279
|
+
- `TIMEOUT`
|
|
280
|
+
- `CONNECTION`
|
|
281
|
+
- `SSL`
|
|
282
|
+
- `INVALID_URL`
|
|
283
|
+
- `BLOCKED`
|
|
284
|
+
- `HTTP_403`
|
|
285
|
+
- `HTTP_429`
|
|
286
|
+
- `HTTP_4XX`
|
|
287
|
+
- `HTTP_5XX`
|
|
288
|
+
- `UNKNOWN`
|
|
289
|
+
|
|
290
|
+
### Serialize Results
|
|
291
|
+
|
|
292
|
+
`FetchResult` can be converted directly into a plain dictionary for logging,
|
|
293
|
+
storage, or JSON serialization.
|
|
294
|
+
|
|
295
|
+
```python
|
|
296
|
+
import asyncio
|
|
297
|
+
import json
|
|
298
|
+
|
|
299
|
+
from crawlsmith import CurlCffiScraper
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
async def main() -> None:
|
|
303
|
+
scraper = CurlCffiScraper()
|
|
304
|
+
result = await scraper.fetch("https://example.com")
|
|
305
|
+
print(json.dumps(result.to_dict(), indent=2))
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
asyncio.run(main())
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
## CLI Usage
|
|
312
|
+
|
|
313
|
+
The package installs a `crawlsmith` command for quick fetches from the terminal.
|
|
314
|
+
|
|
315
|
+
### Basic CLI Request
|
|
316
|
+
|
|
317
|
+
```bash
|
|
318
|
+
crawlsmith https://example.com
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
The CLI prints a JSON-serialized `FetchResult` to stdout.
|
|
322
|
+
|
|
323
|
+
### Print the Response Body
|
|
324
|
+
|
|
325
|
+
```bash
|
|
326
|
+
crawlsmith https://example.com --print-content
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
### Use One or More Proxies
|
|
330
|
+
|
|
331
|
+
```bash
|
|
332
|
+
crawlsmith https://example.com \
|
|
333
|
+
--proxy http://user:pass@proxy-1.example:8080 \
|
|
334
|
+
--proxy http://user:pass@proxy-2.example:8080 \
|
|
335
|
+
--min-content-length 2000
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
### Force an Impersonation Profile
|
|
339
|
+
|
|
340
|
+
```bash
|
|
341
|
+
crawlsmith https://example.com --impersonate chrome120
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
### Change Timeout or Disable TLS Verification
|
|
345
|
+
|
|
346
|
+
```bash
|
|
347
|
+
crawlsmith https://example.com --timeout 20
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
```bash
|
|
351
|
+
crawlsmith https://example.com --insecure
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
### CLI Exit Codes
|
|
355
|
+
|
|
356
|
+
- `0` when the request succeeds
|
|
357
|
+
- `1` when the request fails
|
|
358
|
+
|
|
359
|
+
### CLI Help
|
|
360
|
+
|
|
361
|
+
```bash
|
|
362
|
+
crawlsmith --help
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
## Result Model
|
|
366
|
+
|
|
367
|
+
`FetchResult` exposes the following fields:
|
|
368
|
+
|
|
369
|
+
- `ok`: whether the request was considered successful
|
|
370
|
+
- `url`: requested URL
|
|
371
|
+
- `status`: HTTP status code when available
|
|
372
|
+
- `content`: raw response text when successful
|
|
373
|
+
- `markdown`: Markdown conversion of the response body when successful
|
|
374
|
+
- `metadata`: extracted document and HTTP metadata
|
|
375
|
+
- `error_type`: normalized error classification
|
|
376
|
+
- `error`: human-readable error summary
|
|
377
|
+
- `via_proxy`: whether the attempt that produced this result (successful or not) went through a proxy
|
|
378
|
+
- `proxy_url`: proxy used for the final attempt, if any
|
|
379
|
+
- `content_length`: UTF-8 byte length of the extracted text
|
|
380
|
+
- `is_blocked`: whether the response looks like an anti-bot interstitial
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
## Support & Connect
|
|
384
|
+
|
|
385
|
+
* ⭐ **Star the repo** if you found it useful
|
|
386
|
+
* ☕ **Support me:** Say thanks by buying me a coffee! [https://buymeacoffee.com/juanmcristobal](https://buymeacoffee.com/juanmcristobal)
|
|
387
|
+
* 💼 **Open to work:** [https://www.linkedin.com/in/jmcristobal/](https://www.linkedin.com/in/jmcristobal/)
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
# History
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
## 0.1.0 (2026-04-07)
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
* First release.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
crawlsmith/__init__.py,sha256=E7DzsXgbN1dLolUj0mwmbNFwI8uO8QDD4d2Z8LbjM8M,247
|
|
2
|
+
crawlsmith/cli.py,sha256=4pfHq7xxXGGOZwKDsnXxq-UDxdqi7KkR-IjVwATKV0c,1822
|
|
3
|
+
crawlsmith/crawlsmith.py,sha256=WwPZ1_UMGm_AqC1KTqvSvUj54WCj-LTl6ubbLG1CcAI,30390
|
|
4
|
+
crawlsmith-0.1.0.dist-info/licenses/AUTHORS.md,sha256=_iiWmZhfEpCmhL904fE_TcL5lFcj72NTwzzaaXAQ2-s,132
|
|
5
|
+
crawlsmith-0.1.0.dist-info/METADATA,sha256=jdfGEpKhDiyJoCsdGiYB-QRUCQ4qW_kcuQkPEC6n1_o,9123
|
|
6
|
+
crawlsmith-0.1.0.dist-info/WHEEL,sha256=TdQ5LtNwLuxTCjgxN51AgdU5w-KkB9ttmLbzjTH02pg,109
|
|
7
|
+
crawlsmith-0.1.0.dist-info/entry_points.txt,sha256=9gEU7jvKLjF5d6GsOJn471DwTuT378bSGCjd04Pc_qk,51
|
|
8
|
+
crawlsmith-0.1.0.dist-info/top_level.txt,sha256=cC0xCz1cJd3GmiYwJyUTZENu0osLLW6HZWmvUyHLBWI,11
|
|
9
|
+
crawlsmith-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
crawlsmith
|