scrapling 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +29 -19
- scrapling/cli.py +21 -4
- scrapling/core/_types.py +3 -2
- scrapling/core/ai.py +24 -15
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +6 -4
- scrapling/core/storage.py +7 -6
- scrapling/core/translator.py +13 -8
- scrapling/core/utils/__init__.py +0 -1
- scrapling/engines/_browsers/__init__.py +0 -2
- scrapling/engines/_browsers/_base.py +45 -21
- scrapling/engines/_browsers/_camoufox.py +98 -43
- scrapling/engines/_browsers/_config_tools.py +1 -1
- scrapling/engines/_browsers/_controllers.py +34 -13
- scrapling/engines/_browsers/_validators.py +31 -10
- scrapling/engines/constants.py +0 -15
- scrapling/engines/static.py +749 -336
- scrapling/engines/toolbelt/convertor.py +13 -15
- scrapling/engines/toolbelt/custom.py +6 -9
- scrapling/engines/toolbelt/fingerprints.py +17 -10
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +46 -0
- scrapling/fetchers/chrome.py +210 -0
- scrapling/fetchers/firefox.py +212 -0
- scrapling/fetchers/requests.py +28 -0
- scrapling/parser.py +109 -84
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/METADATA +17 -16
- scrapling-0.3.7.dist-info/RECORD +47 -0
- scrapling/fetchers.py +0 -444
- scrapling-0.3.5.dist-info/RECORD +0 -44
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
scrapling/engines/static.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
+
from abc import ABC
|
1
2
|
from time import sleep as time_sleep
|
2
3
|
from asyncio import sleep as asyncio_sleep
|
3
4
|
|
4
|
-
from curl_cffi.
|
5
|
+
from curl_cffi.curl import CurlError
|
5
6
|
from curl_cffi import CurlHttpVersion
|
6
|
-
from curl_cffi.requests.impersonate import DEFAULT_CHROME
|
7
7
|
from curl_cffi.requests import (
|
8
8
|
ProxySpec,
|
9
9
|
CookieTypes,
|
@@ -28,23 +28,15 @@ from .toolbelt.custom import Response
|
|
28
28
|
from .toolbelt.convertor import ResponseFactory
|
29
29
|
from .toolbelt.fingerprints import generate_convincing_referer, generate_headers, __default_useragent__
|
30
30
|
|
31
|
-
_UNSET = object()
|
31
|
+
_UNSET: Any = object()
|
32
|
+
_NO_SESSION: Any = object()
|
32
33
|
|
33
34
|
|
34
|
-
class
|
35
|
-
|
36
|
-
A context manager that provides configured Fetcher sessions.
|
37
|
-
|
38
|
-
When this manager is used in a 'with' or 'async with' block,
|
39
|
-
it yields a new session configured with the manager's defaults.
|
40
|
-
A single instance of this manager should ideally be used for one active
|
41
|
-
session at a time (or sequentially). Re-entering a context with the
|
42
|
-
same manager instance while a session is already active is disallowed.
|
43
|
-
"""
|
44
|
-
|
35
|
+
class _ConfigurationLogic(ABC):
|
36
|
+
# Core Logic Handler (Internal Engine)
|
45
37
|
def __init__(
|
46
38
|
self,
|
47
|
-
impersonate: Optional[BrowserTypeLiteral] =
|
39
|
+
impersonate: Optional[BrowserTypeLiteral] = "chrome",
|
48
40
|
http3: Optional[bool] = False,
|
49
41
|
stealthy_headers: Optional[bool] = True,
|
50
42
|
proxies: Optional[Dict[str, str]] = None,
|
@@ -60,203 +52,185 @@ class FetcherSession:
|
|
60
52
|
cert: Optional[str | Tuple[str, str]] = None,
|
61
53
|
selector_config: Optional[Dict] = None,
|
62
54
|
):
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
:param verify: Whether to verify HTTPS certificates. Defaults to True.
|
78
|
-
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
79
|
-
:param selector_config: Arguments passed when creating the final Selector class.
|
80
|
-
"""
|
81
|
-
self.default_impersonate = impersonate
|
82
|
-
self.stealth = stealthy_headers
|
83
|
-
self.default_proxies = proxies or {}
|
84
|
-
self.default_proxy = proxy or None
|
85
|
-
self.default_proxy_auth = proxy_auth or None
|
86
|
-
self.default_timeout = timeout
|
87
|
-
self.default_headers = headers or {}
|
88
|
-
self.default_retries = retries
|
89
|
-
self.default_retry_delay = retry_delay
|
90
|
-
self.default_follow_redirects = follow_redirects
|
91
|
-
self.default_max_redirects = max_redirects
|
92
|
-
self.default_verify = verify
|
93
|
-
self.default_cert = cert
|
94
|
-
self.default_http3 = http3
|
55
|
+
self._default_impersonate = impersonate
|
56
|
+
self._stealth = stealthy_headers
|
57
|
+
self._default_proxies = proxies or {}
|
58
|
+
self._default_proxy = proxy or None
|
59
|
+
self._default_proxy_auth = proxy_auth or None
|
60
|
+
self._default_timeout = timeout
|
61
|
+
self._default_headers = headers or {}
|
62
|
+
self._default_retries = retries
|
63
|
+
self._default_retry_delay = retry_delay
|
64
|
+
self._default_follow_redirects = follow_redirects
|
65
|
+
self._default_max_redirects = max_redirects
|
66
|
+
self._default_verify = verify
|
67
|
+
self._default_cert = cert
|
68
|
+
self._default_http3 = http3
|
95
69
|
self.selector_config = selector_config or {}
|
96
70
|
|
97
|
-
|
98
|
-
|
71
|
+
@staticmethod
|
72
|
+
def _get_with_precedence(request_val: Any, default_val: Any) -> Any:
|
73
|
+
"""Get value with request-level priority over session-level"""
|
74
|
+
return request_val if request_val is not _UNSET else default_val
|
99
75
|
|
100
|
-
def _merge_request_args(self, **
|
76
|
+
def _merge_request_args(self, **method_kwargs) -> Dict[str, Any]:
|
101
77
|
"""Merge request-specific arguments with default session arguments."""
|
102
|
-
url =
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
78
|
+
url = method_kwargs.pop("url")
|
79
|
+
impersonate = self._get_with_precedence(method_kwargs.pop("impersonate"), self._default_impersonate)
|
80
|
+
http3_enabled = self._get_with_precedence(method_kwargs.pop("http3"), self._default_http3)
|
81
|
+
final_args = {
|
82
|
+
"url": url,
|
83
|
+
# Curl automatically generates the suitable browser headers when you use `impersonate`
|
84
|
+
"headers": self._headers_job(
|
85
|
+
url,
|
86
|
+
self._get_with_precedence(method_kwargs.pop("headers"), self._default_headers),
|
87
|
+
self._get_with_precedence(method_kwargs.pop("stealth"), self._stealth),
|
88
|
+
bool(impersonate),
|
89
|
+
),
|
90
|
+
"proxies": self._get_with_precedence(method_kwargs.pop("proxies"), self._default_proxies),
|
91
|
+
"proxy": self._get_with_precedence(method_kwargs.pop("proxy"), self._default_proxy),
|
92
|
+
"proxy_auth": self._get_with_precedence(method_kwargs.pop("proxy_auth"), self._default_proxy_auth),
|
93
|
+
"timeout": self._get_with_precedence(method_kwargs.pop("timeout"), self._default_timeout),
|
94
|
+
"allow_redirects": self._get_with_precedence(
|
95
|
+
method_kwargs.pop("follow_redirects"), self._default_follow_redirects
|
96
|
+
),
|
97
|
+
"max_redirects": self._get_with_precedence(method_kwargs.pop("max_redirects"), self._default_max_redirects),
|
98
|
+
"verify": self._get_with_precedence(method_kwargs.pop("verify"), self._default_verify),
|
99
|
+
"cert": self._get_with_precedence(method_kwargs.pop("cert"), self._default_cert),
|
100
|
+
"impersonate": impersonate,
|
101
|
+
**{
|
102
|
+
k: v
|
103
|
+
for k, v in method_kwargs.items()
|
104
|
+
if v
|
105
|
+
not in (
|
106
|
+
_UNSET,
|
107
|
+
None,
|
108
|
+
)
|
109
|
+
}, # Add any remaining parameters (after all known ones are popped)
|
110
|
+
}
|
111
|
+
if http3_enabled: # pragma: no cover
|
112
|
+
final_args["http_version"] = CurlHttpVersion.V3ONLY
|
111
113
|
if impersonate:
|
112
114
|
log.warning(
|
113
115
|
"The argument `http3` might cause errors if used with `impersonate` argument, try switching it off if you encounter any curl errors."
|
114
116
|
)
|
115
117
|
|
116
|
-
|
117
|
-
{
|
118
|
-
"url": url,
|
119
|
-
# Curl automatically generates the suitable browser headers when you use `impersonate`
|
120
|
-
"headers": self._headers_job(url, headers, stealth, bool(impersonate)),
|
121
|
-
"proxies": self.get_with_precedence(kwargs, "proxies", self.default_proxies),
|
122
|
-
"proxy": self.get_with_precedence(kwargs, "proxy", self.default_proxy),
|
123
|
-
"proxy_auth": self.get_with_precedence(kwargs, "proxy_auth", self.default_proxy_auth),
|
124
|
-
"timeout": self.get_with_precedence(kwargs, "timeout", self.default_timeout),
|
125
|
-
"allow_redirects": self.get_with_precedence(kwargs, "allow_redirects", self.default_follow_redirects),
|
126
|
-
"max_redirects": self.get_with_precedence(kwargs, "max_redirects", self.default_max_redirects),
|
127
|
-
"verify": self.get_with_precedence(kwargs, "verify", self.default_verify),
|
128
|
-
"cert": self.get_with_precedence(kwargs, "cert", self.default_cert),
|
129
|
-
"impersonate": impersonate,
|
130
|
-
**{
|
131
|
-
k: v
|
132
|
-
for k, v in kwargs.items()
|
133
|
-
if v
|
134
|
-
not in (
|
135
|
-
_UNSET,
|
136
|
-
None,
|
137
|
-
)
|
138
|
-
}, # Add any remaining parameters (after all known ones are popped)
|
139
|
-
}
|
140
|
-
)
|
141
|
-
return request_args
|
118
|
+
return final_args
|
142
119
|
|
143
|
-
def _headers_job(
|
144
|
-
|
145
|
-
|
146
|
-
headers
|
147
|
-
|
148
|
-
impersonate_enabled: bool,
|
149
|
-
) -> Dict:
|
150
|
-
"""Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
|
151
|
-
finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
|
152
|
-
|
153
|
-
:param headers: Current headers in the request if the user passed any
|
154
|
-
:param stealth: Whether to enable the `stealthy_headers` argument to this request or not. If `None`, it defaults to the session default value.
|
155
|
-
:param impersonate_enabled: Whether the browser impersonation is enabled or not.
|
156
|
-
:return: A dictionary of the new headers.
|
120
|
+
def _headers_job(self, url, headers: Dict, stealth: bool, impersonate_enabled: bool) -> Dict:
|
121
|
+
"""
|
122
|
+
1. Adds a useragent to the headers if it doesn't have one
|
123
|
+
2. Generates real headers and append them to current headers
|
124
|
+
3. Generates a referer header that looks like as if this request came from a Google's search of the current URL's domain.
|
157
125
|
"""
|
158
|
-
#
|
159
|
-
if headers is _UNSET
|
160
|
-
|
161
|
-
else:
|
162
|
-
# Merge session headers with request headers, request takes precedence
|
163
|
-
headers = {**self.default_headers, **(headers or {})}
|
164
|
-
|
165
|
-
headers_keys = set(map(str.lower, headers.keys()))
|
126
|
+
# Merge session headers with request headers, request takes precedence (if it was set)
|
127
|
+
final_headers = {**self._default_headers, **(headers if headers and headers is not _UNSET else {})}
|
128
|
+
headers_keys = {k.lower() for k in final_headers}
|
166
129
|
if stealth:
|
167
130
|
if "referer" not in headers_keys:
|
168
|
-
|
131
|
+
final_headers["referer"] = generate_convincing_referer(url)
|
169
132
|
|
170
|
-
if impersonate_enabled: # Curl will generate the suitable headers
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
extra_headers = {key: value for key, value in extra_headers.items() if key.lower() not in headers_keys}
|
176
|
-
headers.update(extra_headers)
|
133
|
+
if not impersonate_enabled: # Curl will generate the suitable headers
|
134
|
+
extra_headers = generate_headers(browser_mode=False)
|
135
|
+
final_headers.update(
|
136
|
+
{k: v for k, v in extra_headers.items() if k.lower() not in headers_keys}
|
137
|
+
) # Don't overwrite user-supplied headers
|
177
138
|
|
178
139
|
elif "user-agent" not in headers_keys and not impersonate_enabled:
|
179
|
-
|
180
|
-
log.debug(f"Can't find useragent in headers so '{
|
140
|
+
final_headers["User-Agent"] = __default_useragent__
|
141
|
+
log.debug(f"Can't find useragent in headers so '{final_headers['User-Agent']}' was used.")
|
142
|
+
|
143
|
+
return final_headers
|
144
|
+
|
181
145
|
|
182
|
-
|
146
|
+
class _SyncSessionLogic(_ConfigurationLogic):
|
147
|
+
def __init__(
|
148
|
+
self,
|
149
|
+
impersonate: Optional[BrowserTypeLiteral] = "chrome",
|
150
|
+
http3: Optional[bool] = False,
|
151
|
+
stealthy_headers: Optional[bool] = True,
|
152
|
+
proxies: Optional[Dict[str, str]] = None,
|
153
|
+
proxy: Optional[str] = None,
|
154
|
+
proxy_auth: Optional[Tuple[str, str]] = None,
|
155
|
+
timeout: Optional[int | float] = 30,
|
156
|
+
headers: Optional[Dict[str, str]] = None,
|
157
|
+
retries: Optional[int] = 3,
|
158
|
+
retry_delay: Optional[int] = 1,
|
159
|
+
follow_redirects: bool = True,
|
160
|
+
max_redirects: int = 30,
|
161
|
+
verify: bool = True,
|
162
|
+
cert: Optional[str | Tuple[str, str]] = None,
|
163
|
+
selector_config: Optional[Dict] = None,
|
164
|
+
):
|
165
|
+
super().__init__(
|
166
|
+
impersonate,
|
167
|
+
http3,
|
168
|
+
stealthy_headers,
|
169
|
+
proxies,
|
170
|
+
proxy,
|
171
|
+
proxy_auth,
|
172
|
+
timeout,
|
173
|
+
headers,
|
174
|
+
retries,
|
175
|
+
retry_delay,
|
176
|
+
follow_redirects,
|
177
|
+
max_redirects,
|
178
|
+
verify,
|
179
|
+
cert,
|
180
|
+
selector_config,
|
181
|
+
)
|
182
|
+
self._curl_session: Optional[CurlSession] = None
|
183
183
|
|
184
184
|
def __enter__(self):
|
185
185
|
"""Creates and returns a new synchronous Fetcher Session"""
|
186
186
|
if self._curl_session:
|
187
|
-
raise RuntimeError(
|
188
|
-
"This FetcherSession instance already has an active synchronous session. "
|
189
|
-
"Create a new FetcherSession instance for a new independent session, "
|
190
|
-
"or use the current instance sequentially after the previous context has exited."
|
191
|
-
)
|
192
|
-
if self._async_curl_session: # Prevent mixing if async is active from this instance
|
193
|
-
raise RuntimeError(
|
194
|
-
"This FetcherSession instance has an active asynchronous session. "
|
195
|
-
"Cannot enter a synchronous context simultaneously with the same manager instance."
|
196
|
-
)
|
187
|
+
raise RuntimeError("This FetcherSession instance already has an active synchronous session.")
|
197
188
|
|
198
189
|
self._curl_session = CurlSession()
|
199
190
|
return self
|
200
191
|
|
201
192
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
202
193
|
"""Closes the active synchronous session managed by this instance, if any."""
|
194
|
+
# For type checking (not accessed error)
|
195
|
+
_ = (
|
196
|
+
exc_type,
|
197
|
+
exc_val,
|
198
|
+
exc_tb,
|
199
|
+
)
|
203
200
|
if self._curl_session:
|
204
201
|
self._curl_session.close()
|
205
202
|
self._curl_session = None
|
206
203
|
|
207
|
-
async def __aenter__(self):
|
208
|
-
"""Creates and returns a new asynchronous Session."""
|
209
|
-
if self._async_curl_session:
|
210
|
-
raise RuntimeError(
|
211
|
-
"This FetcherSession instance already has an active asynchronous session. "
|
212
|
-
"Create a new FetcherSession instance for a new independent session, "
|
213
|
-
"or use the current instance sequentially after the previous context has exited."
|
214
|
-
)
|
215
|
-
if self._curl_session: # Prevent mixing if sync is active from this instance
|
216
|
-
raise RuntimeError(
|
217
|
-
"This FetcherSession instance has an active synchronous session. "
|
218
|
-
"Cannot enter an asynchronous context simultaneously with the same manager instance."
|
219
|
-
)
|
220
|
-
|
221
|
-
self._async_curl_session = AsyncCurlSession()
|
222
|
-
return self
|
223
|
-
|
224
|
-
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
225
|
-
"""Closes the active asynchronous session managed by this instance, if any."""
|
226
|
-
if self._async_curl_session:
|
227
|
-
await self._async_curl_session.close()
|
228
|
-
self._async_curl_session = None
|
229
|
-
|
230
204
|
def __make_request(
|
231
205
|
self,
|
232
206
|
method: SUPPORTED_HTTP_METHODS,
|
233
|
-
|
234
|
-
|
235
|
-
retry_delay: int,
|
236
|
-
selector_config: Optional[Dict] = None,
|
207
|
+
stealth: Optional[bool] = None,
|
208
|
+
**kwargs,
|
237
209
|
) -> Response:
|
238
210
|
"""
|
239
211
|
Perform an HTTP request using the configured session.
|
240
|
-
|
241
|
-
:param method: HTTP method to be used, supported methods are ["GET", "POST", "PUT", "DELETE"]
|
242
|
-
:param request_args: Arguments to be passed to the session's `request()` method.
|
243
|
-
:param max_retries: Maximum number of retries for the request.
|
244
|
-
:param retry_delay: Number of seconds to wait between retries.
|
245
|
-
:param selector_config: Arguments passed when creating the final Selector class.
|
246
|
-
:return: A `Response` object for synchronous requests or an awaitable for asynchronous.
|
247
212
|
"""
|
213
|
+
stealth = self._stealth if stealth is None else stealth
|
214
|
+
|
215
|
+
selector_config = kwargs.pop("selector_config", {}) or self.selector_config
|
216
|
+
max_retries = self._get_with_precedence(kwargs.pop("retries"), self._default_retries)
|
217
|
+
retry_delay = self._get_with_precedence(kwargs.pop("retry_delay"), self._default_retry_delay)
|
218
|
+
request_args = self._merge_request_args(stealth=stealth, **kwargs)
|
219
|
+
|
248
220
|
session = self._curl_session
|
249
|
-
|
221
|
+
one_off_request = False
|
222
|
+
if session is _NO_SESSION and self.__enter__ is None:
|
250
223
|
# For usage inside FetcherClient
|
251
224
|
# It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
|
252
225
|
session = CurlSession()
|
226
|
+
one_off_request = True
|
253
227
|
|
254
228
|
if session:
|
255
229
|
for attempt in range(max_retries):
|
256
230
|
try:
|
257
231
|
response = session.request(method, **request_args)
|
258
|
-
|
259
|
-
return
|
232
|
+
result = ResponseFactory.from_http_request(response, selector_config)
|
233
|
+
return result
|
260
234
|
except CurlError as e: # pragma: no cover
|
261
235
|
if attempt < max_retries - 1:
|
262
236
|
log.error(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
|
@@ -264,41 +238,407 @@ class FetcherSession:
|
|
264
238
|
else:
|
265
239
|
log.error(f"Failed after {max_retries} attempts: {e}")
|
266
240
|
raise # Raise the exception if all retries fail
|
241
|
+
finally:
|
242
|
+
if session and one_off_request:
|
243
|
+
session.close()
|
267
244
|
|
268
245
|
raise RuntimeError("No active session available.") # pragma: no cover
|
269
246
|
|
270
|
-
|
247
|
+
def get(
|
271
248
|
self,
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
249
|
+
url: str,
|
250
|
+
params: Optional[Dict | List | Tuple] = None,
|
251
|
+
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
252
|
+
cookies: Optional[CookieTypes] = None,
|
253
|
+
timeout: Optional[int | float] = _UNSET,
|
254
|
+
follow_redirects: Optional[bool] = _UNSET,
|
255
|
+
max_redirects: Optional[int] = _UNSET,
|
256
|
+
retries: Optional[int] = _UNSET,
|
257
|
+
retry_delay: Optional[int] = _UNSET,
|
258
|
+
proxies: Optional[ProxySpec] = _UNSET,
|
259
|
+
proxy: Optional[str] = _UNSET,
|
260
|
+
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
261
|
+
auth: Optional[Tuple[str, str]] = None,
|
262
|
+
verify: Optional[bool] = _UNSET,
|
263
|
+
cert: Optional[str | Tuple[str, str]] = _UNSET,
|
264
|
+
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
265
|
+
http3: Optional[bool] = _UNSET,
|
266
|
+
stealthy_headers: Optional[bool] = _UNSET,
|
267
|
+
**kwargs,
|
268
|
+
) -> Response:
|
269
|
+
"""
|
270
|
+
Perform a GET request.
|
271
|
+
|
272
|
+
:param url: Target URL for the request.
|
273
|
+
:param params: Query string parameters for the request.
|
274
|
+
:param headers: Headers to include in the request.
|
275
|
+
:param cookies: Cookies to use in the request.
|
276
|
+
:param timeout: Number of seconds to wait before timing out.
|
277
|
+
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
278
|
+
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
279
|
+
:param retries: Number of retry attempts. Defaults to 3.
|
280
|
+
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
281
|
+
:param proxies: Dict of proxies to use.
|
282
|
+
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
283
|
+
Cannot be used together with the `proxies` parameter.
|
284
|
+
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
285
|
+
:param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
|
286
|
+
:param verify: Whether to verify HTTPS certificates.
|
287
|
+
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
288
|
+
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
289
|
+
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
290
|
+
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
291
|
+
:param kwargs: Additional keyword arguments to pass to the [`curl_cffi.requests.Session().request()`, `curl_cffi.requests.AsyncSession().request()`] method.
|
292
|
+
:return: A `Response` object.
|
293
|
+
"""
|
294
|
+
method_args = {k: v for k, v in locals().items() if k not in ("self", "stealthy_headers", "kwargs")}
|
295
|
+
method_args.update(kwargs)
|
296
|
+
# For type checking (not accessed error)
|
297
|
+
_ = (
|
298
|
+
url,
|
299
|
+
params,
|
300
|
+
headers,
|
301
|
+
cookies,
|
302
|
+
timeout,
|
303
|
+
follow_redirects,
|
304
|
+
max_redirects,
|
305
|
+
retries,
|
306
|
+
retry_delay,
|
307
|
+
proxies,
|
308
|
+
proxy,
|
309
|
+
proxy_auth,
|
310
|
+
auth,
|
311
|
+
verify,
|
312
|
+
cert,
|
313
|
+
impersonate,
|
314
|
+
http3,
|
315
|
+
)
|
316
|
+
return self.__make_request("GET", stealth=stealthy_headers, **method_args)
|
317
|
+
|
318
|
+
def post(
|
319
|
+
self,
|
320
|
+
url: str,
|
321
|
+
data: Optional[Dict | str] = None,
|
322
|
+
json: Optional[Dict | List] = None,
|
323
|
+
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
324
|
+
params: Optional[Dict | List | Tuple] = None,
|
325
|
+
cookies: Optional[CookieTypes] = None,
|
326
|
+
timeout: Optional[int | float] = _UNSET,
|
327
|
+
follow_redirects: Optional[bool] = _UNSET,
|
328
|
+
max_redirects: Optional[int] = _UNSET,
|
329
|
+
retries: Optional[int] = _UNSET,
|
330
|
+
retry_delay: Optional[int] = _UNSET,
|
331
|
+
proxies: Optional[ProxySpec] = _UNSET,
|
332
|
+
proxy: Optional[str] = _UNSET,
|
333
|
+
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
334
|
+
auth: Optional[Tuple[str, str]] = None,
|
335
|
+
verify: Optional[bool] = _UNSET,
|
336
|
+
cert: Optional[str | Tuple[str, str]] = _UNSET,
|
337
|
+
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
338
|
+
http3: Optional[bool] = _UNSET,
|
339
|
+
stealthy_headers: Optional[bool] = _UNSET,
|
340
|
+
**kwargs,
|
341
|
+
) -> Response:
|
342
|
+
"""
|
343
|
+
Perform a POST request.
|
344
|
+
|
345
|
+
:param url: Target URL for the request.
|
346
|
+
:param data: Form data to include in the request body.
|
347
|
+
:param json: A JSON serializable object to include in the body of the request.
|
348
|
+
:param params: Query string parameters for the request.
|
349
|
+
:param headers: Headers to include in the request.
|
350
|
+
:param cookies: Cookies to use in the request.
|
351
|
+
:param timeout: Number of seconds to wait before timing out.
|
352
|
+
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
353
|
+
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
354
|
+
:param retries: Number of retry attempts. Defaults to 3.
|
355
|
+
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
356
|
+
:param proxies: Dict of proxies to use.
|
357
|
+
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
358
|
+
Cannot be used together with the `proxies` parameter.
|
359
|
+
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
360
|
+
:param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
|
361
|
+
:param verify: Whether to verify HTTPS certificates.
|
362
|
+
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
363
|
+
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
364
|
+
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
365
|
+
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
366
|
+
:param kwargs: Additional keyword arguments to pass to the [`curl_cffi.requests.Session().request()`, `curl_cffi.requests.AsyncSession().request()`] method.
|
367
|
+
:return: A `Response` object.
|
368
|
+
"""
|
369
|
+
method_args = {k: v for k, v in locals().items() if k not in ("self", "stealthy_headers", "kwargs")}
|
370
|
+
method_args.update(kwargs)
|
371
|
+
# For type checking (not accessed error)
|
372
|
+
_ = (
|
373
|
+
url,
|
374
|
+
params,
|
375
|
+
headers,
|
376
|
+
data,
|
377
|
+
json,
|
378
|
+
cookies,
|
379
|
+
timeout,
|
380
|
+
follow_redirects,
|
381
|
+
max_redirects,
|
382
|
+
retries,
|
383
|
+
retry_delay,
|
384
|
+
proxies,
|
385
|
+
proxy,
|
386
|
+
proxy_auth,
|
387
|
+
auth,
|
388
|
+
verify,
|
389
|
+
cert,
|
390
|
+
impersonate,
|
391
|
+
http3,
|
392
|
+
)
|
393
|
+
return self.__make_request("POST", stealth=stealthy_headers, **method_args)
|
394
|
+
|
395
|
+
def put(
|
396
|
+
self,
|
397
|
+
url: str,
|
398
|
+
data: Optional[Dict | str] = None,
|
399
|
+
json: Optional[Dict | List] = None,
|
400
|
+
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
401
|
+
params: Optional[Dict | List | Tuple] = None,
|
402
|
+
cookies: Optional[CookieTypes] = None,
|
403
|
+
timeout: Optional[int | float] = _UNSET,
|
404
|
+
follow_redirects: Optional[bool] = _UNSET,
|
405
|
+
max_redirects: Optional[int] = _UNSET,
|
406
|
+
retries: Optional[int] = _UNSET,
|
407
|
+
retry_delay: Optional[int] = _UNSET,
|
408
|
+
proxies: Optional[ProxySpec] = _UNSET,
|
409
|
+
proxy: Optional[str] = _UNSET,
|
410
|
+
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
411
|
+
auth: Optional[Tuple[str, str]] = None,
|
412
|
+
verify: Optional[bool] = _UNSET,
|
413
|
+
cert: Optional[str | Tuple[str, str]] = _UNSET,
|
414
|
+
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
415
|
+
http3: Optional[bool] = _UNSET,
|
416
|
+
stealthy_headers: Optional[bool] = _UNSET,
|
417
|
+
**kwargs,
|
418
|
+
) -> Response:
|
419
|
+
"""
|
420
|
+
Perform a PUT request.
|
421
|
+
|
422
|
+
:param url: Target URL for the request.
|
423
|
+
:param data: Form data to include in the request body.
|
424
|
+
:param json: A JSON serializable object to include in the body of the request.
|
425
|
+
:param params: Query string parameters for the request.
|
426
|
+
:param headers: Headers to include in the request.
|
427
|
+
:param cookies: Cookies to use in the request.
|
428
|
+
:param timeout: Number of seconds to wait before timing out.
|
429
|
+
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
430
|
+
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
431
|
+
:param retries: Number of retry attempts. Defaults to 3.
|
432
|
+
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
433
|
+
:param proxies: Dict of proxies to use.
|
434
|
+
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
435
|
+
Cannot be used together with the `proxies` parameter.
|
436
|
+
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
437
|
+
:param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
|
438
|
+
:param verify: Whether to verify HTTPS certificates.
|
439
|
+
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
440
|
+
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
441
|
+
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
442
|
+
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
443
|
+
:param kwargs: Additional keyword arguments to pass to the [`curl_cffi.requests.Session().request()`, `curl_cffi.requests.AsyncSession().request()`] method.
|
444
|
+
:return: A `Response` object.
|
445
|
+
"""
|
446
|
+
method_args = {k: v for k, v in locals().items() if k not in ("self", "stealthy_headers", "kwargs")}
|
447
|
+
method_args.update(kwargs)
|
448
|
+
# For type checking (not accessed error)
|
449
|
+
_ = (
|
450
|
+
url,
|
451
|
+
params,
|
452
|
+
headers,
|
453
|
+
data,
|
454
|
+
json,
|
455
|
+
cookies,
|
456
|
+
timeout,
|
457
|
+
follow_redirects,
|
458
|
+
max_redirects,
|
459
|
+
retries,
|
460
|
+
retry_delay,
|
461
|
+
proxies,
|
462
|
+
proxy,
|
463
|
+
proxy_auth,
|
464
|
+
auth,
|
465
|
+
verify,
|
466
|
+
cert,
|
467
|
+
impersonate,
|
468
|
+
http3,
|
469
|
+
)
|
470
|
+
return self.__make_request("PUT", stealth=stealthy_headers, **method_args)
|
471
|
+
|
472
|
+
def delete(
|
473
|
+
self,
|
474
|
+
url: str,
|
475
|
+
data: Optional[Dict | str] = None,
|
476
|
+
json: Optional[Dict | List] = None,
|
477
|
+
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
478
|
+
params: Optional[Dict | List | Tuple] = None,
|
479
|
+
cookies: Optional[CookieTypes] = None,
|
480
|
+
timeout: Optional[int | float] = _UNSET,
|
481
|
+
follow_redirects: Optional[bool] = _UNSET,
|
482
|
+
max_redirects: Optional[int] = _UNSET,
|
483
|
+
retries: Optional[int] = _UNSET,
|
484
|
+
retry_delay: Optional[int] = _UNSET,
|
485
|
+
proxies: Optional[ProxySpec] = _UNSET,
|
486
|
+
proxy: Optional[str] = _UNSET,
|
487
|
+
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
488
|
+
auth: Optional[Tuple[str, str]] = None,
|
489
|
+
verify: Optional[bool] = _UNSET,
|
490
|
+
cert: Optional[str | Tuple[str, str]] = _UNSET,
|
491
|
+
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
492
|
+
http3: Optional[bool] = _UNSET,
|
493
|
+
stealthy_headers: Optional[bool] = _UNSET,
|
494
|
+
**kwargs,
|
495
|
+
) -> Response:
|
496
|
+
"""
|
497
|
+
Perform a DELETE request.
|
498
|
+
|
499
|
+
:param url: Target URL for the request.
|
500
|
+
:param data: Form data to include in the request body.
|
501
|
+
:param json: A JSON serializable object to include in the body of the request.
|
502
|
+
:param params: Query string parameters for the request.
|
503
|
+
:param headers: Headers to include in the request.
|
504
|
+
:param cookies: Cookies to use in the request.
|
505
|
+
:param timeout: Number of seconds to wait before timing out.
|
506
|
+
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
507
|
+
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
508
|
+
:param retries: Number of retry attempts. Defaults to 3.
|
509
|
+
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
510
|
+
:param proxies: Dict of proxies to use.
|
511
|
+
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
512
|
+
Cannot be used together with the `proxies` parameter.
|
513
|
+
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
514
|
+
:param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
|
515
|
+
:param verify: Whether to verify HTTPS certificates.
|
516
|
+
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
517
|
+
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
518
|
+
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
519
|
+
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
520
|
+
:param kwargs: Additional keyword arguments to pass to the [`curl_cffi.requests.Session().request()`, `curl_cffi.requests.AsyncSession().request()`] method.
|
521
|
+
:return: A `Response` object.
|
522
|
+
"""
|
523
|
+
# Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
|
524
|
+
# But some websites accept it, it depends on the implementation used.
|
525
|
+
method_args = {k: v for k, v in locals().items() if k not in ("self", "stealthy_headers", "kwargs")}
|
526
|
+
method_args.update(kwargs)
|
527
|
+
# For type checking (not accessed error)
|
528
|
+
_ = (
|
529
|
+
url,
|
530
|
+
params,
|
531
|
+
headers,
|
532
|
+
data,
|
533
|
+
json,
|
534
|
+
cookies,
|
535
|
+
timeout,
|
536
|
+
follow_redirects,
|
537
|
+
max_redirects,
|
538
|
+
retries,
|
539
|
+
retry_delay,
|
540
|
+
proxies,
|
541
|
+
proxy,
|
542
|
+
proxy_auth,
|
543
|
+
auth,
|
544
|
+
verify,
|
545
|
+
cert,
|
546
|
+
impersonate,
|
547
|
+
http3,
|
548
|
+
)
|
549
|
+
return self.__make_request("DELETE", stealth=stealthy_headers, **method_args)
|
550
|
+
|
551
|
+
|
552
|
+
class _ASyncSessionLogic(_ConfigurationLogic):
|
553
|
+
def __init__(
|
554
|
+
self,
|
555
|
+
impersonate: Optional[BrowserTypeLiteral] = "chrome",
|
556
|
+
http3: Optional[bool] = False,
|
557
|
+
stealthy_headers: Optional[bool] = True,
|
558
|
+
proxies: Optional[Dict[str, str]] = None,
|
559
|
+
proxy: Optional[str] = None,
|
560
|
+
proxy_auth: Optional[Tuple[str, str]] = None,
|
561
|
+
timeout: Optional[int | float] = 30,
|
562
|
+
headers: Optional[Dict[str, str]] = None,
|
563
|
+
retries: Optional[int] = 3,
|
564
|
+
retry_delay: Optional[int] = 1,
|
565
|
+
follow_redirects: bool = True,
|
566
|
+
max_redirects: int = 30,
|
567
|
+
verify: bool = True,
|
568
|
+
cert: Optional[str | Tuple[str, str]] = None,
|
276
569
|
selector_config: Optional[Dict] = None,
|
570
|
+
):
|
571
|
+
super().__init__(
|
572
|
+
impersonate,
|
573
|
+
http3,
|
574
|
+
stealthy_headers,
|
575
|
+
proxies,
|
576
|
+
proxy,
|
577
|
+
proxy_auth,
|
578
|
+
timeout,
|
579
|
+
headers,
|
580
|
+
retries,
|
581
|
+
retry_delay,
|
582
|
+
follow_redirects,
|
583
|
+
max_redirects,
|
584
|
+
verify,
|
585
|
+
cert,
|
586
|
+
selector_config,
|
587
|
+
)
|
588
|
+
self._async_curl_session: Optional[AsyncCurlSession] = None
|
589
|
+
|
590
|
+
async def __aenter__(self):
|
591
|
+
"""Creates and returns a new asynchronous Session."""
|
592
|
+
if self._async_curl_session:
|
593
|
+
raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")
|
594
|
+
|
595
|
+
self._async_curl_session = AsyncCurlSession()
|
596
|
+
return self
|
597
|
+
|
598
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
599
|
+
"""Closes the active asynchronous session managed by this instance, if any."""
|
600
|
+
# For type checking (not accessed error)
|
601
|
+
_ = (
|
602
|
+
exc_type,
|
603
|
+
exc_val,
|
604
|
+
exc_tb,
|
605
|
+
)
|
606
|
+
if self._async_curl_session:
|
607
|
+
await self._async_curl_session.close()
|
608
|
+
self._async_curl_session = None
|
609
|
+
|
610
|
+
async def __make_request(
|
611
|
+
self,
|
612
|
+
method: SUPPORTED_HTTP_METHODS,
|
613
|
+
stealth: Optional[bool] = None,
|
614
|
+
**kwargs,
|
277
615
|
) -> Response:
|
278
616
|
"""
|
279
617
|
Perform an HTTP request using the configured session.
|
280
|
-
|
281
|
-
:param method: HTTP method to be used, supported methods are ["GET", "POST", "PUT", "DELETE"]
|
282
|
-
:param request_args: Arguments to be passed to the session's `request()` method.
|
283
|
-
:param max_retries: Maximum number of retries for the request.
|
284
|
-
:param retry_delay: Number of seconds to wait between retries.
|
285
|
-
:param selector_config: Arguments passed when creating the final Selector class.
|
286
|
-
:return: A `Response` object for synchronous requests or an awaitable for asynchronous.
|
287
618
|
"""
|
619
|
+
stealth = self._stealth if stealth is None else stealth
|
620
|
+
|
621
|
+
selector_config = kwargs.pop("selector_config", {}) or self.selector_config
|
622
|
+
max_retries = self._get_with_precedence(kwargs.pop("retries"), self._default_retries)
|
623
|
+
retry_delay = self._get_with_precedence(kwargs.pop("retry_delay"), self._default_retry_delay)
|
624
|
+
request_args = self._merge_request_args(stealth=stealth, **kwargs)
|
625
|
+
|
288
626
|
session = self._async_curl_session
|
289
|
-
|
627
|
+
one_off_request = False
|
628
|
+
if session is _NO_SESSION and self.__aenter__ is None:
|
290
629
|
# For usage inside the ` AsyncFetcherClient ` class, and that's for several reasons
|
291
630
|
# 1. It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
|
292
631
|
# 2. `curl_cffi` doesn't support making async requests without sessions
|
293
632
|
# 3. Using a single session for many requests at the same time in async doesn't sit well with curl_cffi.
|
294
633
|
session = AsyncCurlSession()
|
634
|
+
one_off_request = True
|
295
635
|
|
296
636
|
if session:
|
297
637
|
for attempt in range(max_retries):
|
298
638
|
try:
|
299
639
|
response = await session.request(method, **request_args)
|
300
|
-
|
301
|
-
return
|
640
|
+
result = ResponseFactory.from_http_request(response, selector_config)
|
641
|
+
return result
|
302
642
|
except CurlError as e: # pragma: no cover
|
303
643
|
if attempt < max_retries - 1:
|
304
644
|
log.error(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
|
@@ -306,44 +646,12 @@ class FetcherSession:
|
|
306
646
|
else:
|
307
647
|
log.error(f"Failed after {max_retries} attempts: {e}")
|
308
648
|
raise # Raise the exception if all retries fail
|
649
|
+
finally:
|
650
|
+
if session and one_off_request:
|
651
|
+
await session.close()
|
309
652
|
|
310
653
|
raise RuntimeError("No active session available.") # pragma: no cover
|
311
654
|
|
312
|
-
@staticmethod
|
313
|
-
def get_with_precedence(kwargs, key, default_value):
|
314
|
-
"""Get value with request-level priority over session-level"""
|
315
|
-
request_value = kwargs.pop(key, _UNSET)
|
316
|
-
return request_value if request_value is not _UNSET else default_value
|
317
|
-
|
318
|
-
def __prepare_and_dispatch(
|
319
|
-
self,
|
320
|
-
method: SUPPORTED_HTTP_METHODS,
|
321
|
-
stealth: Optional[bool] = None,
|
322
|
-
**kwargs,
|
323
|
-
) -> Response | Awaitable[Response]:
|
324
|
-
"""
|
325
|
-
Internal dispatcher. Prepares arguments and calls sync or async request helper.
|
326
|
-
|
327
|
-
:param method: HTTP method to be used, supported methods are ["GET", "POST", "PUT", "DELETE"]
|
328
|
-
:param stealth: Whether to enable the `stealthy_headers` argument to this request or not. If `None`, it defaults to the session default value.
|
329
|
-
:param url: Target URL for the request.
|
330
|
-
:param kwargs: Additional request-specific arguments.
|
331
|
-
:return: A `Response` object for synchronous requests or an awaitable for asynchronous.
|
332
|
-
"""
|
333
|
-
stealth = self.stealth if stealth is None else stealth
|
334
|
-
|
335
|
-
selector_config = kwargs.pop("selector_config", {}) or self.selector_config
|
336
|
-
max_retries = self.get_with_precedence(kwargs, "retries", self.default_retries)
|
337
|
-
retry_delay = self.get_with_precedence(kwargs, "retry_delay", self.default_retry_delay)
|
338
|
-
request_args = self._merge_request_args(stealth=stealth, **kwargs)
|
339
|
-
if self._curl_session:
|
340
|
-
return self.__make_request(method, request_args, max_retries, retry_delay, selector_config)
|
341
|
-
elif self._async_curl_session:
|
342
|
-
# The returned value is a Coroutine
|
343
|
-
return self.__make_async_request(method, request_args, max_retries, retry_delay, selector_config)
|
344
|
-
|
345
|
-
raise RuntimeError("No active session available.")
|
346
|
-
|
347
655
|
def get(
|
348
656
|
self,
|
349
657
|
url: str,
|
@@ -365,7 +673,7 @@ class FetcherSession:
|
|
365
673
|
http3: Optional[bool] = _UNSET,
|
366
674
|
stealthy_headers: Optional[bool] = _UNSET,
|
367
675
|
**kwargs,
|
368
|
-
) ->
|
676
|
+
) -> Awaitable[Response]:
|
369
677
|
"""
|
370
678
|
Perform a GET request.
|
371
679
|
|
@@ -389,29 +697,31 @@ class FetcherSession:
|
|
389
697
|
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
390
698
|
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
391
699
|
:param kwargs: Additional keyword arguments to pass to the [`curl_cffi.requests.Session().request()`, `curl_cffi.requests.AsyncSession().request()`] method.
|
392
|
-
:return: A `Response` object
|
700
|
+
:return: A `Response` object.
|
393
701
|
"""
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
702
|
+
method_args = {k: v for k, v in locals().items() if k not in ("self", "stealthy_headers", "kwargs")}
|
703
|
+
method_args.update(kwargs)
|
704
|
+
# For type checking (not accessed error)
|
705
|
+
_ = (
|
706
|
+
url,
|
707
|
+
params,
|
708
|
+
headers,
|
709
|
+
cookies,
|
710
|
+
timeout,
|
711
|
+
follow_redirects,
|
712
|
+
max_redirects,
|
713
|
+
retries,
|
714
|
+
retry_delay,
|
715
|
+
proxies,
|
716
|
+
proxy,
|
717
|
+
proxy_auth,
|
718
|
+
auth,
|
719
|
+
verify,
|
720
|
+
cert,
|
721
|
+
impersonate,
|
722
|
+
http3,
|
723
|
+
)
|
724
|
+
return self.__make_request("GET", stealth=stealthy_headers, **method_args)
|
415
725
|
|
416
726
|
def post(
|
417
727
|
self,
|
@@ -436,57 +746,59 @@ class FetcherSession:
|
|
436
746
|
http3: Optional[bool] = _UNSET,
|
437
747
|
stealthy_headers: Optional[bool] = _UNSET,
|
438
748
|
**kwargs,
|
439
|
-
) ->
|
749
|
+
) -> Awaitable[Response]:
|
440
750
|
"""
|
441
751
|
Perform a POST request.
|
442
752
|
|
443
753
|
:param url: Target URL for the request.
|
444
754
|
:param data: Form data to include in the request body.
|
445
755
|
:param json: A JSON serializable object to include in the body of the request.
|
446
|
-
:param headers: Headers to include in the request.
|
447
756
|
:param params: Query string parameters for the request.
|
757
|
+
:param headers: Headers to include in the request.
|
448
758
|
:param cookies: Cookies to use in the request.
|
449
759
|
:param timeout: Number of seconds to wait before timing out.
|
450
760
|
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
451
761
|
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
452
762
|
:param retries: Number of retry attempts. Defaults to 3.
|
453
763
|
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
454
|
-
:param proxies: Dict of proxies to use.
|
764
|
+
:param proxies: Dict of proxies to use.
|
455
765
|
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
456
766
|
Cannot be used together with the `proxies` parameter.
|
457
767
|
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
458
768
|
:param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
|
459
|
-
:param verify: Whether to verify HTTPS certificates.
|
769
|
+
:param verify: Whether to verify HTTPS certificates.
|
460
770
|
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
461
771
|
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
462
772
|
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
463
773
|
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
464
774
|
:param kwargs: Additional keyword arguments to pass to the [`curl_cffi.requests.Session().request()`, `curl_cffi.requests.AsyncSession().request()`] method.
|
465
|
-
:return: A `Response` object
|
775
|
+
:return: A `Response` object.
|
466
776
|
"""
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
777
|
+
method_args = {k: v for k, v in locals().items() if k not in ("self", "stealthy_headers", "kwargs")}
|
778
|
+
method_args.update(kwargs)
|
779
|
+
# For type checking (not accessed error)
|
780
|
+
_ = (
|
781
|
+
url,
|
782
|
+
params,
|
783
|
+
headers,
|
784
|
+
data,
|
785
|
+
json,
|
786
|
+
cookies,
|
787
|
+
timeout,
|
788
|
+
follow_redirects,
|
789
|
+
max_redirects,
|
790
|
+
retries,
|
791
|
+
retry_delay,
|
792
|
+
proxies,
|
793
|
+
proxy,
|
794
|
+
proxy_auth,
|
795
|
+
auth,
|
796
|
+
verify,
|
797
|
+
cert,
|
798
|
+
impersonate,
|
799
|
+
http3,
|
800
|
+
)
|
801
|
+
return self.__make_request("POST", stealth=stealthy_headers, **method_args)
|
490
802
|
|
491
803
|
def put(
|
492
804
|
self,
|
@@ -511,57 +823,59 @@ class FetcherSession:
|
|
511
823
|
http3: Optional[bool] = _UNSET,
|
512
824
|
stealthy_headers: Optional[bool] = _UNSET,
|
513
825
|
**kwargs,
|
514
|
-
) ->
|
826
|
+
) -> Awaitable[Response]:
|
515
827
|
"""
|
516
828
|
Perform a PUT request.
|
517
829
|
|
518
830
|
:param url: Target URL for the request.
|
519
831
|
:param data: Form data to include in the request body.
|
520
832
|
:param json: A JSON serializable object to include in the body of the request.
|
521
|
-
:param headers: Headers to include in the request.
|
522
833
|
:param params: Query string parameters for the request.
|
834
|
+
:param headers: Headers to include in the request.
|
523
835
|
:param cookies: Cookies to use in the request.
|
524
836
|
:param timeout: Number of seconds to wait before timing out.
|
525
837
|
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
526
838
|
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
527
839
|
:param retries: Number of retry attempts. Defaults to 3.
|
528
840
|
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
529
|
-
:param proxies: Dict of proxies to use.
|
841
|
+
:param proxies: Dict of proxies to use.
|
530
842
|
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
531
843
|
Cannot be used together with the `proxies` parameter.
|
532
844
|
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
533
845
|
:param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
|
534
|
-
:param verify: Whether to verify HTTPS certificates.
|
846
|
+
:param verify: Whether to verify HTTPS certificates.
|
535
847
|
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
536
848
|
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
537
849
|
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
538
850
|
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
539
851
|
:param kwargs: Additional keyword arguments to pass to the [`curl_cffi.requests.Session().request()`, `curl_cffi.requests.AsyncSession().request()`] method.
|
540
|
-
:return: A `Response` object
|
852
|
+
:return: A `Response` object.
|
541
853
|
"""
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
854
|
+
method_args = {k: v for k, v in locals().items() if k not in ("self", "stealthy_headers", "kwargs")}
|
855
|
+
method_args.update(kwargs)
|
856
|
+
# For type checking (not accessed error)
|
857
|
+
_ = (
|
858
|
+
url,
|
859
|
+
params,
|
860
|
+
headers,
|
861
|
+
data,
|
862
|
+
json,
|
863
|
+
cookies,
|
864
|
+
timeout,
|
865
|
+
follow_redirects,
|
866
|
+
max_redirects,
|
867
|
+
retries,
|
868
|
+
retry_delay,
|
869
|
+
proxies,
|
870
|
+
proxy,
|
871
|
+
proxy_auth,
|
872
|
+
auth,
|
873
|
+
verify,
|
874
|
+
cert,
|
875
|
+
impersonate,
|
876
|
+
http3,
|
877
|
+
)
|
878
|
+
return self.__make_request("PUT", stealth=stealthy_headers, **method_args)
|
565
879
|
|
566
880
|
def delete(
|
567
881
|
self,
|
@@ -586,76 +900,175 @@ class FetcherSession:
|
|
586
900
|
http3: Optional[bool] = _UNSET,
|
587
901
|
stealthy_headers: Optional[bool] = _UNSET,
|
588
902
|
**kwargs,
|
589
|
-
) ->
|
903
|
+
) -> Awaitable[Response]:
|
590
904
|
"""
|
591
905
|
Perform a DELETE request.
|
592
906
|
|
593
907
|
:param url: Target URL for the request.
|
594
908
|
:param data: Form data to include in the request body.
|
595
909
|
:param json: A JSON serializable object to include in the body of the request.
|
596
|
-
:param headers: Headers to include in the request.
|
597
910
|
:param params: Query string parameters for the request.
|
911
|
+
:param headers: Headers to include in the request.
|
598
912
|
:param cookies: Cookies to use in the request.
|
599
913
|
:param timeout: Number of seconds to wait before timing out.
|
600
914
|
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
601
915
|
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
602
916
|
:param retries: Number of retry attempts. Defaults to 3.
|
603
917
|
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
604
|
-
:param proxies: Dict of proxies to use.
|
918
|
+
:param proxies: Dict of proxies to use.
|
605
919
|
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
606
920
|
Cannot be used together with the `proxies` parameter.
|
607
921
|
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
608
922
|
:param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
|
609
|
-
:param verify: Whether to verify HTTPS certificates.
|
923
|
+
:param verify: Whether to verify HTTPS certificates.
|
610
924
|
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
611
925
|
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
612
926
|
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
613
927
|
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
614
928
|
:param kwargs: Additional keyword arguments to pass to the [`curl_cffi.requests.Session().request()`, `curl_cffi.requests.AsyncSession().request()`] method.
|
615
|
-
:return: A `Response` object
|
929
|
+
:return: A `Response` object.
|
616
930
|
"""
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
931
|
+
# Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
|
932
|
+
# But some websites accept it, it depends on the implementation used.
|
933
|
+
method_args = {k: v for k, v in locals().items() if k not in ("self", "stealthy_headers", "kwargs")}
|
934
|
+
method_args.update(kwargs)
|
935
|
+
# For type checking (not accessed error)
|
936
|
+
_ = (
|
937
|
+
url,
|
938
|
+
params,
|
939
|
+
headers,
|
940
|
+
data,
|
941
|
+
json,
|
942
|
+
cookies,
|
943
|
+
timeout,
|
944
|
+
follow_redirects,
|
945
|
+
max_redirects,
|
946
|
+
retries,
|
947
|
+
retry_delay,
|
948
|
+
proxies,
|
949
|
+
proxy,
|
950
|
+
proxy_auth,
|
951
|
+
auth,
|
952
|
+
verify,
|
953
|
+
cert,
|
954
|
+
impersonate,
|
955
|
+
http3,
|
956
|
+
)
|
957
|
+
return self.__make_request("DELETE", stealth=stealthy_headers, **method_args)
|
958
|
+
|
959
|
+
|
960
|
+
class FetcherSession:
|
961
|
+
"""
|
962
|
+
A factory context manager that provides configured Fetcher sessions.
|
963
|
+
|
964
|
+
When this manager is used in a 'with' or 'async with' block,
|
965
|
+
it yields a new session configured with the manager's defaults.
|
966
|
+
A single instance of this manager should ideally be used for one active
|
967
|
+
session at a time (or sequentially). Re-entering a context with the
|
968
|
+
same manager instance while a session is already active is disallowed.
|
969
|
+
"""
|
970
|
+
|
971
|
+
def __init__(
|
972
|
+
self,
|
973
|
+
impersonate: Optional[BrowserTypeLiteral] = "chrome",
|
974
|
+
http3: Optional[bool] = False,
|
975
|
+
stealthy_headers: Optional[bool] = True,
|
976
|
+
proxies: Optional[Dict[str, str]] = None,
|
977
|
+
proxy: Optional[str] = None,
|
978
|
+
proxy_auth: Optional[Tuple[str, str]] = None,
|
979
|
+
timeout: Optional[int | float] = 30,
|
980
|
+
headers: Optional[Dict[str, str]] = None,
|
981
|
+
retries: Optional[int] = 3,
|
982
|
+
retry_delay: Optional[int] = 1,
|
983
|
+
follow_redirects: bool = True,
|
984
|
+
max_redirects: int = 30,
|
985
|
+
verify: bool = True,
|
986
|
+
cert: Optional[str | Tuple[str, str]] = None,
|
987
|
+
selector_config: Optional[Dict] = None,
|
988
|
+
):
|
989
|
+
"""
|
990
|
+
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
991
|
+
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
992
|
+
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
993
|
+
:param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
|
994
|
+
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
995
|
+
Cannot be used together with the `proxies` parameter.
|
996
|
+
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
997
|
+
:param timeout: Number of seconds to wait before timing out.
|
998
|
+
:param headers: Headers to include in the session with every request.
|
999
|
+
:param retries: Number of retry attempts. Defaults to 3.
|
1000
|
+
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
1001
|
+
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
1002
|
+
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
1003
|
+
:param verify: Whether to verify HTTPS certificates. Defaults to True.
|
1004
|
+
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
1005
|
+
:param selector_config: Arguments passed when creating the final Selector class.
|
1006
|
+
"""
|
1007
|
+
self._default_impersonate: Optional[BrowserTypeLiteral] = impersonate
|
1008
|
+
self._stealth = stealthy_headers
|
1009
|
+
self._default_proxies = proxies or {}
|
1010
|
+
self._default_proxy = proxy or None
|
1011
|
+
self._default_proxy_auth = proxy_auth or None
|
1012
|
+
self._default_timeout = timeout
|
1013
|
+
self._default_headers = headers or {}
|
1014
|
+
self._default_retries = retries
|
1015
|
+
self._default_retry_delay = retry_delay
|
1016
|
+
self._default_follow_redirects = follow_redirects
|
1017
|
+
self._default_max_redirects = max_redirects
|
1018
|
+
self._default_verify = verify
|
1019
|
+
self._default_cert = cert
|
1020
|
+
self._default_http3 = http3
|
1021
|
+
self.selector_config = selector_config or {}
|
1022
|
+
self._client: _SyncSessionLogic | _ASyncSessionLogic | None = None
|
1023
|
+
|
1024
|
+
def __enter__(self) -> _SyncSessionLogic:
|
1025
|
+
"""Creates and returns a new synchronous Fetcher Session"""
|
1026
|
+
if self._client is None:
|
1027
|
+
# Use **vars(self) to avoid repeating all parameters
|
1028
|
+
config = {k.replace("_default_", ""): v for k, v in vars(self).items() if k.startswith("_default")}
|
1029
|
+
config["stealthy_headers"] = self._stealth
|
1030
|
+
config["selector_config"] = self.selector_config
|
1031
|
+
self._client = _SyncSessionLogic(**config)
|
1032
|
+
return self._client.__enter__()
|
1033
|
+
raise RuntimeError("This FetcherSession instance already has an active synchronous session.")
|
1034
|
+
|
1035
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
1036
|
+
if self._client is not None and isinstance(self._client, _SyncSessionLogic):
|
1037
|
+
self._client.__exit__(exc_type, exc_val, exc_tb)
|
1038
|
+
self._client = None
|
1039
|
+
return
|
1040
|
+
raise RuntimeError("Cannot exit invalid session")
|
1041
|
+
|
1042
|
+
async def __aenter__(self) -> _ASyncSessionLogic:
|
1043
|
+
"""Creates and returns a new asynchronous Session."""
|
1044
|
+
if self._client is None:
|
1045
|
+
# Use **vars(self) to avoid repeating all parameters
|
1046
|
+
config = {k.replace("_default_", ""): v for k, v in vars(self).items() if k.startswith("_default")}
|
1047
|
+
config["stealthy_headers"] = self._stealth
|
1048
|
+
config["selector_config"] = self.selector_config
|
1049
|
+
self._client = _ASyncSessionLogic(**config)
|
1050
|
+
return await self._client.__aenter__()
|
1051
|
+
raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")
|
1052
|
+
|
1053
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
1054
|
+
if self._client is not None and isinstance(self._client, _ASyncSessionLogic):
|
1055
|
+
await self._client.__aexit__(exc_type, exc_val, exc_tb)
|
1056
|
+
self._client = None
|
1057
|
+
return
|
1058
|
+
raise RuntimeError("Cannot exit invalid session")
|
642
1059
|
|
643
1060
|
|
644
|
-
class FetcherClient(
|
1061
|
+
class FetcherClient(_SyncSessionLogic):
|
645
1062
|
def __init__(self, *args, **kwargs):
|
646
1063
|
super().__init__(*args, **kwargs)
|
647
|
-
self.__enter__ = None
|
648
|
-
self.__exit__ = None
|
649
|
-
self.
|
650
|
-
self.__aexit__ = None
|
651
|
-
self._curl_session = True
|
1064
|
+
self.__enter__: Any = None
|
1065
|
+
self.__exit__: Any = None
|
1066
|
+
self._curl_session: Any = _NO_SESSION
|
652
1067
|
|
653
1068
|
|
654
|
-
class AsyncFetcherClient(
|
1069
|
+
class AsyncFetcherClient(_ASyncSessionLogic):
|
655
1070
|
def __init__(self, *args, **kwargs):
|
656
1071
|
super().__init__(*args, **kwargs)
|
657
|
-
self.
|
658
|
-
self.
|
659
|
-
self.
|
660
|
-
self.__aexit__ = None
|
661
|
-
self._async_curl_session = True
|
1072
|
+
self.__aenter__: Any = None
|
1073
|
+
self.__aexit__: Any = None
|
1074
|
+
self._async_curl_session: Any = _NO_SESSION
|