scrapling 0.2.99__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +759 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +644 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +170 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +158 -175
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +239 -333
- scrapling/parser.py +781 -449
- scrapling-0.3.1.dist-info/METADATA +411 -0
- scrapling-0.3.1.dist-info/RECORD +41 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -25
- scrapling/engines/camo.py +0 -339
- scrapling/engines/pw.py +0 -465
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.99.dist-info/METADATA +0 -290
- scrapling-0.2.99.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -97
- tests/fetchers/async/test_httpx.py +0 -85
- tests/fetchers/async/test_playwright.py +0 -101
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -70
- tests/fetchers/sync/test_httpx.py +0 -84
- tests/fetchers/sync/test_playwright.py +0 -89
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
scrapling/engines/static.py
CHANGED
@@ -1,156 +1,713 @@
|
|
1
|
-
import
|
2
|
-
from
|
1
|
+
from time import sleep as time_sleep
|
2
|
+
from asyncio import sleep as asyncio_sleep
|
3
3
|
|
4
|
-
from
|
5
|
-
from
|
4
|
+
from curl_cffi.requests.session import CurlError
|
5
|
+
from curl_cffi import CurlHttpVersion
|
6
|
+
from curl_cffi.requests.impersonate import DEFAULT_CHROME
|
7
|
+
from curl_cffi.requests import (
|
8
|
+
ProxySpec,
|
9
|
+
CookieTypes,
|
10
|
+
BrowserTypeLiteral,
|
11
|
+
Session as CurlSession,
|
12
|
+
AsyncSession as AsyncCurlSession,
|
13
|
+
)
|
6
14
|
|
7
|
-
from .
|
15
|
+
from scrapling.core.utils import log
|
16
|
+
from scrapling.core._types import (
|
17
|
+
Dict,
|
18
|
+
Optional,
|
19
|
+
Tuple,
|
20
|
+
Mapping,
|
21
|
+
SUPPORTED_HTTP_METHODS,
|
22
|
+
Awaitable,
|
23
|
+
List,
|
24
|
+
Any,
|
25
|
+
)
|
8
26
|
|
27
|
+
from .toolbelt import (
|
28
|
+
Response,
|
29
|
+
generate_convincing_referer,
|
30
|
+
generate_headers,
|
31
|
+
ResponseFactory,
|
32
|
+
__default_useragent__,
|
33
|
+
)
|
34
|
+
|
35
|
+
_UNSET = object()
|
36
|
+
|
37
|
+
|
38
|
+
class FetcherSession:
|
39
|
+
"""
|
40
|
+
A context manager that provides configured Fetcher sessions.
|
41
|
+
|
42
|
+
When this manager is used in a 'with' or 'async with' block,
|
43
|
+
it yields a new session configured with the manager's defaults.
|
44
|
+
A single instance of this manager should ideally be used for one active
|
45
|
+
session at a time (or sequentially). Re-entering a context with the
|
46
|
+
same manager instance while a session is already active is disallowed.
|
47
|
+
"""
|
9
48
|
|
10
|
-
@lru_cache(2, typed=True) # Singleton easily
|
11
|
-
class StaticEngine:
|
12
49
|
def __init__(
|
13
|
-
|
14
|
-
|
50
|
+
self,
|
51
|
+
impersonate: Optional[BrowserTypeLiteral] = DEFAULT_CHROME,
|
52
|
+
http3: Optional[bool] = False,
|
53
|
+
stealthy_headers: Optional[bool] = True,
|
54
|
+
proxies: Optional[Dict[str, str]] = None,
|
55
|
+
proxy: Optional[str] = None,
|
56
|
+
proxy_auth: Optional[Tuple[str, str]] = None,
|
57
|
+
timeout: Optional[int | float] = 30,
|
58
|
+
headers: Optional[Dict[str, str]] = None,
|
59
|
+
retries: Optional[int] = 3,
|
60
|
+
retry_delay: Optional[int] = 1,
|
61
|
+
follow_redirects: bool = True,
|
62
|
+
max_redirects: int = 30,
|
63
|
+
verify: bool = True,
|
64
|
+
cert: Optional[str | Tuple[str, str]] = None,
|
65
|
+
selector_config: Optional[Dict] = None,
|
15
66
|
):
|
16
|
-
"""
|
17
|
-
|
18
|
-
:param
|
19
|
-
:param stealthy_headers: If enabled (default),
|
20
|
-
|
21
|
-
:param proxy:
|
22
|
-
|
23
|
-
:param
|
24
|
-
:param
|
25
|
-
|
26
|
-
|
27
|
-
|
67
|
+
"""
|
68
|
+
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
69
|
+
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
70
|
+
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
71
|
+
:param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
|
72
|
+
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
73
|
+
Cannot be used together with the `proxies` parameter.
|
74
|
+
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
75
|
+
:param timeout: Number of seconds to wait before timing out.
|
76
|
+
:param headers: Headers to include in the session with every request.
|
77
|
+
:param retries: Number of retry attempts. Defaults to 3.
|
78
|
+
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
79
|
+
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
80
|
+
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
81
|
+
:param verify: Whether to verify HTTPS certificates. Defaults to True.
|
82
|
+
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
83
|
+
:param selector_config: Arguments passed when creating the final Selector class.
|
84
|
+
"""
|
85
|
+
self.default_impersonate = impersonate
|
28
86
|
self.stealth = stealthy_headers
|
29
|
-
self.
|
30
|
-
self.
|
31
|
-
self.
|
32
|
-
self.
|
33
|
-
|
34
|
-
|
35
|
-
self.
|
36
|
-
|
37
|
-
|
87
|
+
self.default_proxies = proxies or {}
|
88
|
+
self.default_proxy = proxy or None
|
89
|
+
self.default_proxy_auth = proxy_auth or None
|
90
|
+
self.default_timeout = timeout
|
91
|
+
self.default_headers = headers or {}
|
92
|
+
self.default_retries = retries
|
93
|
+
self.default_retry_delay = retry_delay
|
94
|
+
self.default_follow_redirects = follow_redirects
|
95
|
+
self.default_max_redirects = max_redirects
|
96
|
+
self.default_verify = verify
|
97
|
+
self.default_cert = cert
|
98
|
+
self.default_http3 = http3
|
99
|
+
self.selector_config = selector_config or {}
|
100
|
+
|
101
|
+
self._curl_session: Optional[CurlSession] = None
|
102
|
+
self._async_curl_session: Optional[AsyncCurlSession] = None
|
103
|
+
|
104
|
+
def _merge_request_args(self, **kwargs) -> Dict[str, Any]:
|
105
|
+
"""Merge request-specific arguments with default session arguments."""
|
106
|
+
url = kwargs.pop("url")
|
107
|
+
request_args = {}
|
108
|
+
|
109
|
+
headers = self.get_with_precedence(kwargs, "headers", self.default_headers)
|
110
|
+
stealth = self.get_with_precedence(kwargs, "stealth", self.stealth)
|
111
|
+
impersonate = self.get_with_precedence(
|
112
|
+
kwargs, "impersonate", self.default_impersonate
|
113
|
+
)
|
114
|
+
|
115
|
+
if self.get_with_precedence(
|
116
|
+
kwargs, "http3", self.default_http3
|
117
|
+
): # pragma: no cover
|
118
|
+
request_args["http_version"] = CurlHttpVersion.V3ONLY
|
119
|
+
if impersonate:
|
120
|
+
log.warning(
|
121
|
+
"The argument `http3` might cause errors if used with `impersonate` argument, try switching it off if you encounter any curl errors."
|
122
|
+
)
|
123
|
+
|
124
|
+
request_args.update(
|
125
|
+
{
|
126
|
+
"url": url,
|
127
|
+
# Curl automatically generates the suitable browser headers when you use `impersonate`
|
128
|
+
"headers": self._headers_job(url, headers, stealth, bool(impersonate)),
|
129
|
+
"proxies": self.get_with_precedence(
|
130
|
+
kwargs, "proxies", self.default_proxies
|
131
|
+
),
|
132
|
+
"proxy": self.get_with_precedence(kwargs, "proxy", self.default_proxy),
|
133
|
+
"proxy_auth": self.get_with_precedence(
|
134
|
+
kwargs, "proxy_auth", self.default_proxy_auth
|
135
|
+
),
|
136
|
+
"timeout": self.get_with_precedence(
|
137
|
+
kwargs, "timeout", self.default_timeout
|
138
|
+
),
|
139
|
+
"allow_redirects": self.get_with_precedence(
|
140
|
+
kwargs, "allow_redirects", self.default_follow_redirects
|
141
|
+
),
|
142
|
+
"max_redirects": self.get_with_precedence(
|
143
|
+
kwargs, "max_redirects", self.default_max_redirects
|
144
|
+
),
|
145
|
+
"verify": self.get_with_precedence(
|
146
|
+
kwargs, "verify", self.default_verify
|
147
|
+
),
|
148
|
+
"cert": self.get_with_precedence(kwargs, "cert", self.default_cert),
|
149
|
+
"impersonate": impersonate,
|
150
|
+
**{
|
151
|
+
k: v
|
152
|
+
for k, v in kwargs.items()
|
153
|
+
if v
|
154
|
+
not in (
|
155
|
+
_UNSET,
|
156
|
+
None,
|
157
|
+
)
|
158
|
+
}, # Add any remaining parameters (after all known ones are popped)
|
159
|
+
}
|
160
|
+
)
|
161
|
+
return request_args
|
162
|
+
|
163
|
+
def _headers_job(
|
164
|
+
self,
|
165
|
+
url,
|
166
|
+
headers: Optional[Dict],
|
167
|
+
stealth: Optional[bool],
|
168
|
+
impersonate_enabled: bool,
|
169
|
+
) -> Dict:
|
38
170
|
"""Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
|
39
171
|
finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
|
40
172
|
|
41
173
|
:param headers: Current headers in the request if the user passed any
|
174
|
+
:param stealth: Whether to enable the `stealthy_headers` argument to this request or not. If `None`, it defaults to the session default value.
|
175
|
+
:param impersonate_enabled: Whether the browser impersonation is enabled or not.
|
42
176
|
:return: A dictionary of the new headers.
|
43
177
|
"""
|
44
|
-
|
178
|
+
# Handle headers - if it was _UNSET, use default_headers
|
179
|
+
if headers is _UNSET:
|
180
|
+
headers = self.default_headers.copy()
|
181
|
+
else:
|
182
|
+
# Merge session headers with request headers, request takes precedence
|
183
|
+
headers = {**self.default_headers, **(headers or {})}
|
184
|
+
|
45
185
|
headers_keys = set(map(str.lower, headers.keys()))
|
186
|
+
if stealth:
|
187
|
+
if "referer" not in headers_keys:
|
188
|
+
headers.update({"referer": generate_convincing_referer(url)})
|
189
|
+
|
190
|
+
if impersonate_enabled: # Curl will generate the suitable headers
|
191
|
+
return headers
|
46
192
|
|
47
|
-
if self.stealth:
|
48
193
|
extra_headers = generate_headers(browser_mode=False)
|
49
|
-
# Don't overwrite user
|
50
|
-
extra_headers = {
|
194
|
+
# Don't overwrite user-supplied headers
|
195
|
+
extra_headers = {
|
196
|
+
key: value
|
197
|
+
for key, value in extra_headers.items()
|
198
|
+
if key.lower() not in headers_keys
|
199
|
+
}
|
51
200
|
headers.update(extra_headers)
|
52
|
-
if 'referer' not in headers_keys:
|
53
|
-
headers.update({'referer': generate_convincing_referer(self.url)})
|
54
201
|
|
55
|
-
elif
|
56
|
-
headers[
|
57
|
-
log.debug(
|
202
|
+
elif "user-agent" not in headers_keys and not impersonate_enabled:
|
203
|
+
headers["User-Agent"] = __default_useragent__
|
204
|
+
log.debug(
|
205
|
+
f"Can't find useragent in headers so '{headers['User-Agent']}' was used."
|
206
|
+
)
|
58
207
|
|
59
208
|
return headers
|
60
209
|
|
61
|
-
def
|
62
|
-
"""
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
request_headers=dict(response.request.headers),
|
77
|
-
method=response.request.method,
|
78
|
-
history=[self._prepare_response(redirection) for redirection in response.history],
|
79
|
-
**self.adaptor_arguments
|
80
|
-
)
|
210
|
+
def __enter__(self):
|
211
|
+
"""Creates and returns a new synchronous Fetcher Session"""
|
212
|
+
if self._curl_session:
|
213
|
+
raise RuntimeError(
|
214
|
+
"This FetcherSession instance already has an active synchronous session. "
|
215
|
+
"Create a new FetcherSession instance for a new independent session, "
|
216
|
+
"or use the current instance sequentially after the previous context has exited."
|
217
|
+
)
|
218
|
+
if (
|
219
|
+
self._async_curl_session
|
220
|
+
): # Prevent mixing if async is active from this instance
|
221
|
+
raise RuntimeError(
|
222
|
+
"This FetcherSession instance has an active asynchronous session. "
|
223
|
+
"Cannot enter a synchronous context simultaneously with the same manager instance."
|
224
|
+
)
|
81
225
|
|
82
|
-
|
83
|
-
|
84
|
-
with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
|
85
|
-
request = getattr(client, method)(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
|
86
|
-
return self._prepare_response(request)
|
226
|
+
self._curl_session = CurlSession()
|
227
|
+
return self
|
87
228
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
229
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
230
|
+
"""Closes the active synchronous session managed by this instance, if any."""
|
231
|
+
if self._curl_session:
|
232
|
+
self._curl_session.close()
|
233
|
+
self._curl_session = None
|
93
234
|
|
94
|
-
def
|
95
|
-
"""
|
235
|
+
async def __aenter__(self):
|
236
|
+
"""Creates and returns a new asynchronous Session."""
|
237
|
+
if self._async_curl_session:
|
238
|
+
raise RuntimeError(
|
239
|
+
"This FetcherSession instance already has an active asynchronous session. "
|
240
|
+
"Create a new FetcherSession instance for a new independent session, "
|
241
|
+
"or use the current instance sequentially after the previous context has exited."
|
242
|
+
)
|
243
|
+
if self._curl_session: # Prevent mixing if sync is active from this instance
|
244
|
+
raise RuntimeError(
|
245
|
+
"This FetcherSession instance has an active synchronous session. "
|
246
|
+
"Cannot enter an asynchronous context simultaneously with the same manager instance."
|
247
|
+
)
|
96
248
|
|
97
|
-
|
98
|
-
|
99
|
-
"""
|
100
|
-
return self._make_request('get', **kwargs)
|
249
|
+
self._async_curl_session = AsyncCurlSession()
|
250
|
+
return self
|
101
251
|
|
102
|
-
async def
|
103
|
-
"""
|
252
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
253
|
+
"""Closes the active asynchronous session managed by this instance, if any."""
|
254
|
+
if self._async_curl_session:
|
255
|
+
await self._async_curl_session.close()
|
256
|
+
self._async_curl_session = None
|
104
257
|
|
105
|
-
|
106
|
-
|
258
|
+
def __make_request(
|
259
|
+
self,
|
260
|
+
method: SUPPORTED_HTTP_METHODS,
|
261
|
+
request_args: Dict[str, Any],
|
262
|
+
max_retries: int,
|
263
|
+
retry_delay: int,
|
264
|
+
selector_config: Optional[Dict] = None,
|
265
|
+
) -> Response:
|
107
266
|
"""
|
108
|
-
|
267
|
+
Perform an HTTP request using the configured session.
|
268
|
+
|
269
|
+
:param method: HTTP method to be used, supported methods are ["GET", "POST", "PUT", "DELETE"]
|
270
|
+
:param url: Target URL for the request.
|
271
|
+
:param request_args: Arguments to be passed to the session's `request()` method.
|
272
|
+
:param max_retries: Maximum number of retries for the request.
|
273
|
+
:param retry_delay: Number of seconds to wait between retries.
|
274
|
+
:param selector_config: Arguments passed when creating the final Selector class.
|
275
|
+
:return: A `Response` object for synchronous requests or an awaitable for asynchronous.
|
276
|
+
"""
|
277
|
+
session = self._curl_session
|
278
|
+
if session is True and not any(
|
279
|
+
(self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)
|
280
|
+
):
|
281
|
+
# For usage inside FetcherClient
|
282
|
+
# It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
|
283
|
+
session = CurlSession()
|
284
|
+
|
285
|
+
if session:
|
286
|
+
for attempt in range(max_retries):
|
287
|
+
try:
|
288
|
+
response = session.request(method, **request_args)
|
289
|
+
# response.raise_for_status() # Retry responses with a status code between 200-400
|
290
|
+
return ResponseFactory.from_http_request(response, selector_config)
|
291
|
+
except CurlError as e: # pragma: no cover
|
292
|
+
if attempt < max_retries - 1:
|
293
|
+
log.error(
|
294
|
+
f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds..."
|
295
|
+
)
|
296
|
+
time_sleep(retry_delay)
|
297
|
+
else:
|
298
|
+
log.error(f"Failed after {max_retries} attempts: {e}")
|
299
|
+
raise # Raise the exception if all retries fail
|
109
300
|
|
110
|
-
|
111
|
-
"""Make basic HTTP POST request for you but with some added flavors.
|
301
|
+
raise RuntimeError("No active session available.") # pragma: no cover
|
112
302
|
|
113
|
-
|
114
|
-
|
303
|
+
async def __make_async_request(
|
304
|
+
self,
|
305
|
+
method: SUPPORTED_HTTP_METHODS,
|
306
|
+
request_args: Dict[str, Any],
|
307
|
+
max_retries: int,
|
308
|
+
retry_delay: int,
|
309
|
+
selector_config: Optional[Dict] = None,
|
310
|
+
) -> Response:
|
115
311
|
"""
|
116
|
-
|
312
|
+
Perform an HTTP request using the configured session.
|
117
313
|
|
118
|
-
|
119
|
-
|
314
|
+
:param method: HTTP method to be used, supported methods are ["GET", "POST", "PUT", "DELETE"]
|
315
|
+
:param url: Target URL for the request.
|
316
|
+
:param request_args: Arguments to be passed to the session's `request()` method.
|
317
|
+
:param max_retries: Maximum number of retries for the request.
|
318
|
+
:param retry_delay: Number of seconds to wait between retries.
|
319
|
+
:param selector_config: Arguments passed when creating the final Selector class.
|
320
|
+
:return: A `Response` object for synchronous requests or an awaitable for asynchronous.
|
321
|
+
"""
|
322
|
+
session = self._async_curl_session
|
323
|
+
if session is True and not any(
|
324
|
+
(self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)
|
325
|
+
):
|
326
|
+
# For usage inside the ` AsyncFetcherClient ` class, and that's for several reasons
|
327
|
+
# 1. It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
|
328
|
+
# 2. `curl_cffi` doesn't support making async requests without sessions
|
329
|
+
# 3. Using a single session for many requests at the same time in async doesn't sit well with curl_cffi.
|
330
|
+
session = AsyncCurlSession()
|
331
|
+
|
332
|
+
if session:
|
333
|
+
for attempt in range(max_retries):
|
334
|
+
try:
|
335
|
+
response = await session.request(method, **request_args)
|
336
|
+
# response.raise_for_status() # Retry responses with a status code between 200-400
|
337
|
+
return ResponseFactory.from_http_request(response, selector_config)
|
338
|
+
except CurlError as e: # pragma: no cover
|
339
|
+
if attempt < max_retries - 1:
|
340
|
+
log.error(
|
341
|
+
f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds..."
|
342
|
+
)
|
343
|
+
await asyncio_sleep(retry_delay)
|
344
|
+
else:
|
345
|
+
log.error(f"Failed after {max_retries} attempts: {e}")
|
346
|
+
raise # Raise the exception if all retries fail
|
347
|
+
|
348
|
+
raise RuntimeError("No active session available.") # pragma: no cover
|
120
349
|
|
121
|
-
|
122
|
-
|
350
|
+
@staticmethod
|
351
|
+
def get_with_precedence(kwargs, key, default_value):
|
352
|
+
"""Get value with request-level priority over session-level"""
|
353
|
+
request_value = kwargs.pop(key, _UNSET)
|
354
|
+
return request_value if request_value is not _UNSET else default_value
|
355
|
+
|
356
|
+
def __prepare_and_dispatch(
|
357
|
+
self,
|
358
|
+
method: SUPPORTED_HTTP_METHODS,
|
359
|
+
stealth: Optional[bool] = None,
|
360
|
+
**kwargs,
|
361
|
+
) -> Response | Awaitable[Response]:
|
123
362
|
"""
|
124
|
-
|
363
|
+
Internal dispatcher. Prepares arguments and calls sync or async request helper.
|
125
364
|
|
126
|
-
|
127
|
-
|
365
|
+
:param method: HTTP method to be used, supported methods are ["GET", "POST", "PUT", "DELETE"]
|
366
|
+
:param stealth: Whether to enable the `stealthy_headers` argument to this request or not. If `None`, it defaults to the session default value.
|
367
|
+
:param url: Target URL for the request.
|
368
|
+
:param kwargs: Additional request-specific arguments.
|
369
|
+
:return: A `Response` object for synchronous requests or an awaitable for asynchronous.
|
370
|
+
"""
|
371
|
+
stealth = self.stealth if stealth is None else stealth
|
128
372
|
|
129
|
-
|
130
|
-
|
373
|
+
selector_config = kwargs.pop("selector_config", {}) or self.selector_config
|
374
|
+
max_retries = self.get_with_precedence(kwargs, "retries", self.default_retries)
|
375
|
+
retry_delay = self.get_with_precedence(
|
376
|
+
kwargs, "retry_delay", self.default_retry_delay
|
377
|
+
)
|
378
|
+
request_args = self._merge_request_args(stealth=stealth, **kwargs)
|
379
|
+
if self._curl_session:
|
380
|
+
return self.__make_request(
|
381
|
+
method, request_args, max_retries, retry_delay, selector_config
|
382
|
+
)
|
383
|
+
elif self._async_curl_session:
|
384
|
+
# The returned value is a Coroutine
|
385
|
+
return self.__make_async_request(
|
386
|
+
method, request_args, max_retries, retry_delay, selector_config
|
387
|
+
)
|
388
|
+
|
389
|
+
raise RuntimeError("No active session available.")
|
390
|
+
|
391
|
+
def get(
|
392
|
+
self,
|
393
|
+
url: str,
|
394
|
+
params: Optional[Dict | List | Tuple] = None,
|
395
|
+
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
396
|
+
cookies: Optional[CookieTypes] = None,
|
397
|
+
timeout: Optional[int | float] = _UNSET,
|
398
|
+
follow_redirects: Optional[bool] = _UNSET,
|
399
|
+
max_redirects: Optional[int] = _UNSET,
|
400
|
+
retries: Optional[int] = _UNSET,
|
401
|
+
retry_delay: Optional[int] = _UNSET,
|
402
|
+
proxies: Optional[ProxySpec] = _UNSET,
|
403
|
+
proxy: Optional[str] = _UNSET,
|
404
|
+
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
405
|
+
auth: Optional[Tuple[str, str]] = None,
|
406
|
+
verify: Optional[bool] = _UNSET,
|
407
|
+
cert: Optional[str | Tuple[str, str]] = _UNSET,
|
408
|
+
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
409
|
+
http3: Optional[bool] = _UNSET,
|
410
|
+
stealthy_headers: Optional[bool] = _UNSET,
|
411
|
+
**kwargs,
|
412
|
+
) -> Response | Awaitable[Response]:
|
131
413
|
"""
|
132
|
-
|
414
|
+
Perform a GET request.
|
133
415
|
|
134
|
-
|
135
|
-
|
416
|
+
:param url: Target URL for the request.
|
417
|
+
:param params: Query string parameters for the request.
|
418
|
+
:param headers: Headers to include in the request.
|
419
|
+
:param cookies: Cookies to use in the request.
|
420
|
+
:param timeout: Number of seconds to wait before timing out.
|
421
|
+
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
422
|
+
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
423
|
+
:param retries: Number of retry attempts. Defaults to 3.
|
424
|
+
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
425
|
+
:param proxies: Dict of proxies to use.
|
426
|
+
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
427
|
+
Cannot be used together with the `proxies` parameter.
|
428
|
+
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
429
|
+
:param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
|
430
|
+
:param verify: Whether to verify HTTPS certificates.
|
431
|
+
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
432
|
+
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
433
|
+
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
434
|
+
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
435
|
+
:param kwargs: Additional keyword arguments to pass to the [`curl_cffi.requests.Session().request()`, `curl_cffi.requests.AsyncSession().request()`] method.
|
436
|
+
:return: A `Response` object or an awaitable for async.
|
437
|
+
"""
|
438
|
+
request_args = {
|
439
|
+
"url": url,
|
440
|
+
"params": params,
|
441
|
+
"headers": headers,
|
442
|
+
"cookies": cookies,
|
443
|
+
"timeout": timeout,
|
444
|
+
"retry_delay": retry_delay,
|
445
|
+
"allow_redirects": follow_redirects,
|
446
|
+
"max_redirects": max_redirects,
|
447
|
+
"retries": retries,
|
448
|
+
"proxies": proxies,
|
449
|
+
"proxy": proxy,
|
450
|
+
"proxy_auth": proxy_auth,
|
451
|
+
"auth": auth,
|
452
|
+
"verify": verify,
|
453
|
+
"cert": cert,
|
454
|
+
"impersonate": impersonate,
|
455
|
+
"http3": http3,
|
456
|
+
**kwargs,
|
457
|
+
}
|
458
|
+
return self.__prepare_and_dispatch(
|
459
|
+
"GET", stealth=stealthy_headers, **request_args
|
460
|
+
)
|
136
461
|
|
137
|
-
|
138
|
-
|
462
|
+
def post(
|
463
|
+
self,
|
464
|
+
url: str,
|
465
|
+
data: Optional[Dict | str] = None,
|
466
|
+
json: Optional[Dict | List] = None,
|
467
|
+
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
468
|
+
params: Optional[Dict | List | Tuple] = None,
|
469
|
+
cookies: Optional[CookieTypes] = None,
|
470
|
+
timeout: Optional[int | float] = _UNSET,
|
471
|
+
follow_redirects: Optional[bool] = _UNSET,
|
472
|
+
max_redirects: Optional[int] = _UNSET,
|
473
|
+
retries: Optional[int] = _UNSET,
|
474
|
+
retry_delay: Optional[int] = _UNSET,
|
475
|
+
proxies: Optional[ProxySpec] = _UNSET,
|
476
|
+
proxy: Optional[str] = _UNSET,
|
477
|
+
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
478
|
+
auth: Optional[Tuple[str, str]] = None,
|
479
|
+
verify: Optional[bool] = _UNSET,
|
480
|
+
cert: Optional[str | Tuple[str, str]] = _UNSET,
|
481
|
+
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
482
|
+
http3: Optional[bool] = _UNSET,
|
483
|
+
stealthy_headers: Optional[bool] = _UNSET,
|
484
|
+
**kwargs,
|
485
|
+
) -> Response | Awaitable[Response]:
|
139
486
|
"""
|
140
|
-
|
487
|
+
Perform a POST request.
|
141
488
|
|
142
|
-
|
143
|
-
|
489
|
+
:param url: Target URL for the request.
|
490
|
+
:param data: Form data to include in the request body.
|
491
|
+
:param json: A JSON serializable object to include in the body of the request.
|
492
|
+
:param headers: Headers to include in the request.
|
493
|
+
:param params: Query string parameters for the request.
|
494
|
+
:param cookies: Cookies to use in the request.
|
495
|
+
:param timeout: Number of seconds to wait before timing out.
|
496
|
+
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
497
|
+
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
498
|
+
:param retries: Number of retry attempts. Defaults to 3.
|
499
|
+
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
500
|
+
:param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
|
501
|
+
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
502
|
+
Cannot be used together with the `proxies` parameter.
|
503
|
+
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
504
|
+
:param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
|
505
|
+
:param verify: Whether to verify HTTPS certificates. Defaults to True.
|
506
|
+
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
507
|
+
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
508
|
+
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
509
|
+
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
510
|
+
:param kwargs: Additional keyword arguments to pass to the [`curl_cffi.requests.Session().request()`, `curl_cffi.requests.AsyncSession().request()`] method.
|
511
|
+
:return: A `Response` object or an awaitable for async.
|
512
|
+
"""
|
513
|
+
request_args = {
|
514
|
+
"url": url,
|
515
|
+
"data": data,
|
516
|
+
"json": json,
|
517
|
+
"headers": headers,
|
518
|
+
"params": params,
|
519
|
+
"cookies": cookies,
|
520
|
+
"timeout": timeout,
|
521
|
+
"retry_delay": retry_delay,
|
522
|
+
"proxy": proxy,
|
523
|
+
"impersonate": impersonate,
|
524
|
+
"allow_redirects": follow_redirects,
|
525
|
+
"max_redirects": max_redirects,
|
526
|
+
"retries": retries,
|
527
|
+
"proxies": proxies,
|
528
|
+
"proxy_auth": proxy_auth,
|
529
|
+
"auth": auth,
|
530
|
+
"verify": verify,
|
531
|
+
"cert": cert,
|
532
|
+
"http3": http3,
|
533
|
+
**kwargs,
|
534
|
+
}
|
535
|
+
return self.__prepare_and_dispatch(
|
536
|
+
"POST", stealth=stealthy_headers, **request_args
|
537
|
+
)
|
144
538
|
|
145
|
-
|
146
|
-
|
539
|
+
def put(
    self,
    url: str,
    data: Optional[Dict | str] = None,
    json: Optional[Dict | List] = None,
    headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
    params: Optional[Dict | List | Tuple] = None,
    cookies: Optional[CookieTypes] = None,
    timeout: Optional[int | float] = _UNSET,
    follow_redirects: Optional[bool] = _UNSET,
    max_redirects: Optional[int] = _UNSET,
    retries: Optional[int] = _UNSET,
    retry_delay: Optional[int] = _UNSET,
    proxies: Optional[ProxySpec] = _UNSET,
    proxy: Optional[str] = _UNSET,
    proxy_auth: Optional[Tuple[str, str]] = _UNSET,
    auth: Optional[Tuple[str, str]] = None,
    verify: Optional[bool] = _UNSET,
    cert: Optional[str | Tuple[str, str]] = _UNSET,
    impersonate: Optional[BrowserTypeLiteral] = _UNSET,
    http3: Optional[bool] = _UNSET,
    stealthy_headers: Optional[bool] = _UNSET,
    **kwargs,
) -> Response | Awaitable[Response]:
    """
    Send a PUT request.

    :param url: The URL to request.
    :param data: Form-encoded payload for the request body.
    :param json: JSON-serializable payload for the request body.
    :param headers: Extra headers to send with the request.
    :param params: Query-string parameters.
    :param cookies: Cookies to attach to the request.
    :param timeout: Seconds to wait before the request times out.
    :param follow_redirects: Follow redirects automatically. Defaults to True.
    :param max_redirects: Redirect limit; 30 by default, -1 means unlimited.
    :param retries: How many times to retry a failed request. Defaults to 3.
    :param retry_delay: Seconds to sleep between retry attempts. Defaults to 1.
    :param proxies: Mapping of scheme to proxy URL, e.g. {"http": proxy_url, "https": proxy_url}.
    :param proxy: Single proxy URL ("http://username:password@localhost:8030").
        Mutually exclusive with the `proxies` parameter.
    :param proxy_auth: (username, password) tuple for proxy basic auth.
    :param auth: (username, password) tuple for HTTP basic auth (basic auth only).
    :param verify: Verify HTTPS certificates. Defaults to True.
    :param cert: Client certificate: a single filename or a (cert, key) tuple of filenames.
    :param impersonate: Browser version to impersonate; defaults to the latest
        available Chrome version.
    :param http3: Enable HTTP/3 (defaults to False); may be problematic when
        combined with `impersonate`.
    :param stealthy_headers: When enabled (default), realistic browser headers are
        created and added, including a referer that mimics a Google search for the
        URL's domain.
    :param kwargs: Forwarded to `curl_cffi.requests.Session().request()` /
        `curl_cffi.requests.AsyncSession().request()`.
    :return: A `Response` object, or an awaitable of one in async mode.
    """
    # Collect the dispatcher-compatible arguments first, then let any
    # caller-supplied kwargs override them (same precedence as a dict literal
    # ending with **kwargs).
    dispatch_args = dict(
        url=url,
        data=data,
        json=json,
        headers=headers,
        params=params,
        cookies=cookies,
        timeout=timeout,
        retry_delay=retry_delay,
        proxy=proxy,
        impersonate=impersonate,
        # `follow_redirects` is our public name; curl_cffi expects `allow_redirects`.
        allow_redirects=follow_redirects,
        max_redirects=max_redirects,
        retries=retries,
        proxies=proxies,
        proxy_auth=proxy_auth,
        auth=auth,
        verify=verify,
        cert=cert,
        http3=http3,
    )
    dispatch_args.update(kwargs)
    return self.__prepare_and_dispatch("PUT", stealth=stealthy_headers, **dispatch_args)
|
615
|
+
|
616
|
+
def delete(
    self,
    url: str,
    data: Optional[Dict | str] = None,
    json: Optional[Dict | List] = None,
    headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
    params: Optional[Dict | List | Tuple] = None,
    cookies: Optional[CookieTypes] = None,
    timeout: Optional[int | float] = _UNSET,
    follow_redirects: Optional[bool] = _UNSET,
    max_redirects: Optional[int] = _UNSET,
    retries: Optional[int] = _UNSET,
    retry_delay: Optional[int] = _UNSET,
    proxies: Optional[ProxySpec] = _UNSET,
    proxy: Optional[str] = _UNSET,
    proxy_auth: Optional[Tuple[str, str]] = _UNSET,
    auth: Optional[Tuple[str, str]] = None,
    verify: Optional[bool] = _UNSET,
    cert: Optional[str | Tuple[str, str]] = _UNSET,
    impersonate: Optional[BrowserTypeLiteral] = _UNSET,
    http3: Optional[bool] = _UNSET,
    stealthy_headers: Optional[bool] = _UNSET,
    **kwargs,
) -> Response | Awaitable[Response]:
    """
    Send a DELETE request.

    :param url: The URL to request.
    :param data: Form-encoded payload for the request body.
    :param json: JSON-serializable payload for the request body.
    :param headers: Extra headers to send with the request.
    :param params: Query-string parameters.
    :param cookies: Cookies to attach to the request.
    :param timeout: Seconds to wait before the request times out.
    :param follow_redirects: Follow redirects automatically. Defaults to True.
    :param max_redirects: Redirect limit; 30 by default, -1 means unlimited.
    :param retries: How many times to retry a failed request. Defaults to 3.
    :param retry_delay: Seconds to sleep between retry attempts. Defaults to 1.
    :param proxies: Mapping of scheme to proxy URL, e.g. {"http": proxy_url, "https": proxy_url}.
    :param proxy: Single proxy URL ("http://username:password@localhost:8030").
        Mutually exclusive with the `proxies` parameter.
    :param proxy_auth: (username, password) tuple for proxy basic auth.
    :param auth: (username, password) tuple for HTTP basic auth (basic auth only).
    :param verify: Verify HTTPS certificates. Defaults to True.
    :param cert: Client certificate: a single filename or a (cert, key) tuple of filenames.
    :param impersonate: Browser version to impersonate; defaults to the latest
        available Chrome version.
    :param http3: Enable HTTP/3 (defaults to False); may be problematic when
        combined with `impersonate`.
    :param stealthy_headers: When enabled (default), realistic browser headers are
        created and added, including a referer that mimics a Google search for the
        URL's domain.
    :param kwargs: Forwarded to `curl_cffi.requests.Session().request()` /
        `curl_cffi.requests.AsyncSession().request()`.
    :return: A `Response` object, or an awaitable of one in async mode.
    """
    # Build the dispatcher arguments, then allow caller kwargs to win on
    # any key collision.
    dispatch_args = dict(
        url=url,
        # NOTE: RFC 7231 section 4.3.5 leaves the semantics of a body in a
        # DELETE request undefined, so some websites reject such requests while
        # others accept them -- it depends on the server implementation.
        data=data,
        json=json,
        headers=headers,
        params=params,
        cookies=cookies,
        timeout=timeout,
        retry_delay=retry_delay,
        proxy=proxy,
        impersonate=impersonate,
        # `follow_redirects` is our public name; curl_cffi expects `allow_redirects`.
        allow_redirects=follow_redirects,
        max_redirects=max_redirects,
        retries=retries,
        proxies=proxies,
        proxy_auth=proxy_auth,
        auth=auth,
        verify=verify,
        cert=cert,
        http3=http3,
    )
    dispatch_args.update(kwargs)
    return self.__prepare_and_dispatch("DELETE", stealth=stealthy_headers, **dispatch_args)
|
694
|
+
|
695
|
+
|
696
|
+
class FetcherClient(FetcherSession):
    """
    `FetcherSession` subclass intended for direct, non-context-managed use.

    Sets `_curl_session` to True (presumably consumed by `FetcherSession` to
    dispatch each request through a fresh curl session -- TODO confirm against
    the parent class) and rejects `with`/`async with` usage.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the parent session and mark it for per-request dispatch."""
        super().__init__(*args, **kwargs)
        self._curl_session = True

    # BUGFIX: the original assigned `self.__enter__ = None` (and friends) in
    # __init__, but that cannot disable the `with` statement -- Python resolves
    # special methods on the type, not the instance, so the inherited
    # FetcherSession context-manager methods would still run. Overriding the
    # dunders at class level actually blocks context-manager usage.
    def __enter__(self):
        raise RuntimeError("FetcherClient cannot be used as a context manager")

    def __exit__(self, *_):
        raise RuntimeError("FetcherClient cannot be used as a context manager")

    async def __aenter__(self):
        raise RuntimeError("FetcherClient cannot be used as a context manager")

    async def __aexit__(self, *_):
        raise RuntimeError("FetcherClient cannot be used as a context manager")
|
704
|
+
|
705
|
+
|
706
|
+
class AsyncFetcherClient(FetcherSession):
    # Async one-shot variant of `FetcherSession`: flags the instance for
    # per-request async curl dispatch instead of a long-lived session.
    def __init__(self, *args, **kwargs):
        """Initialize the parent session and mark it for async per-request use."""
        super().__init__(*args, **kwargs)
        # NOTE(review): these instance attributes do NOT disable the
        # `with`/`async with` statements -- Python resolves special methods on
        # the type, not the instance, so the inherited FetcherSession
        # context-manager methods still run. They only make an explicit
        # `obj.__enter__()` call fail (None is not callable). Confirm intent.
        self.__enter__ = None
        self.__exit__ = None
        self.__aenter__ = None
        self.__aexit__ = None
        # Presumably read by FetcherSession to dispatch each request through a
        # fresh async curl session -- TODO confirm against the parent class.
        self._async_curl_session = True
|