scrapling 0.2.98__py3-none-any.whl → 0.3__py3-none-any.whl

This diff compares the contents of two publicly released package versions. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +745 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +630 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +150 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +205 -186
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +255 -260
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.dist-info/METADATA +409 -0
  30. scrapling-0.3.dist-info/RECORD +41 -0
  31. {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -19
  34. scrapling/engines/camo.py +0 -299
  35. scrapling/engines/pw.py +0 -428
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.98.dist-info/METADATA +0 -867
  38. scrapling-0.2.98.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -95
  43. tests/fetchers/async/test_httpx.py +0 -83
  44. tests/fetchers/async/test_playwright.py +0 -99
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -68
  47. tests/fetchers/sync/test_httpx.py +0 -82
  48. tests/fetchers/sync/test_playwright.py +0 -87
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info/licenses}/LICENSE +0 -0
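
The hunk below is the largest single rewrite in this release; judging by the import changes and the +667/-110 counts in the list above, it appears to be `scrapling/engines/static.py`. The httpx-based `StaticEngine` singleton is removed and replaced by a curl_cffi-based `FetcherSession` context manager with session-level defaults, per-request overrides, retries, and browser impersonation, plus `FetcherClient`/`AsyncFetcherClient` variants for one-off requests. A short usage sketch follows the hunk.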
@@ -1,156 +1,713 @@
- import httpx
- from httpx._models import Response as httpxResponse
+ from time import sleep as time_sleep
+ from asyncio import sleep as asyncio_sleep

- from scrapling.core._types import Dict, Optional, Tuple, Union
- from scrapling.core.utils import log, lru_cache
+ from curl_cffi.requests.session import CurlError
+ from curl_cffi import CurlHttpVersion
+ from curl_cffi.requests.impersonate import DEFAULT_CHROME
+ from curl_cffi.requests import (
+ ProxySpec,
+ CookieTypes,
+ BrowserTypeLiteral,
+ Session as CurlSession,
+ AsyncSession as AsyncCurlSession,
+ )

- from .toolbelt import Response, generate_convincing_referer, generate_headers
+ from scrapling.core.utils import log
+ from scrapling.core._types import (
+ Dict,
+ Optional,
+ Tuple,
+ Mapping,
+ SUPPORTED_HTTP_METHODS,
+ Awaitable,
+ List,
+ Any,
+ )

+ from .toolbelt import (
+ Response,
+ generate_convincing_referer,
+ generate_headers,
+ ResponseFactory,
+ __default_useragent__,
+ )
+
+ _UNSET = object()
+
+
+ class FetcherSession:
+ """
+ A context manager that provides configured Fetcher sessions.
+
+ When this manager is used in a 'with' or 'async with' block,
+ it yields a new session configured with the manager's defaults.
+ A single instance of this manager should ideally be used for one active
+ session at a time (or sequentially). Re-entering a context with the
+ same manager instance while a session is already active is disallowed.
+ """

- @lru_cache(2, typed=True) # Singleton easily
- class StaticEngine:
  def __init__(
- self, url: str, proxy: Optional[str] = None, stealthy_headers: bool = True, follow_redirects: bool = True,
- timeout: Optional[Union[int, float]] = None, retries: Optional[int] = 3, adaptor_arguments: Tuple = None
+ self,
+ impersonate: Optional[BrowserTypeLiteral] = DEFAULT_CHROME,
+ http3: Optional[bool] = False,
+ stealthy_headers: Optional[bool] = True,
+ proxies: Optional[Dict[str, str]] = None,
+ proxy: Optional[str] = None,
+ proxy_auth: Optional[Tuple[str, str]] = None,
+ timeout: Optional[int | float] = 30,
+ headers: Optional[Dict[str, str]] = None,
+ retries: Optional[int] = 3,
+ retry_delay: Optional[int] = 1,
+ follow_redirects: bool = True,
+ max_redirects: int = 30,
+ verify: bool = True,
+ cert: Optional[str | Tuple[str, str]] = None,
+ selector_config: Optional[Dict] = None,
  ):
- """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
-
- :param url: Target url.
- :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
- create a referer header as if this request had came from Google's search of this URL's domain.
- :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
- :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
- :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
- :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
- """
- self.url = url
- self.proxy = proxy
+ """
+ :param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
+ :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+ :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
+ :param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
+ :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+ Cannot be used together with the `proxies` parameter.
+ :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
+ :param timeout: Number of seconds to wait before timing out.
+ :param headers: Headers to include in the session with every request.
+ :param retries: Number of retry attempts. Defaults to 3.
+ :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+ :param follow_redirects: Whether to follow redirects. Defaults to True.
+ :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+ :param verify: Whether to verify HTTPS certificates. Defaults to True.
+ :param cert: Tuple of (cert, key) filenames for the client certificate.
+ :param selector_config: Arguments passed when creating the final Selector class.
+ """
+ self.default_impersonate = impersonate
  self.stealth = stealthy_headers
- self.timeout = timeout
- self.follow_redirects = bool(follow_redirects)
- self.retries = retries
- self._extra_headers = generate_headers(browser_mode=False)
- # Because we are using `lru_cache` for a slight optimization but both dict/dict_items are not hashable so they can't be cached
- # So my solution here was to convert it to tuple then convert it back to dictionary again here as tuples are hashable, ofc `tuple().__hash__()`
- self.adaptor_arguments = dict(adaptor_arguments) if adaptor_arguments else {}
-
- def _headers_job(self, headers: Optional[Dict]) -> Dict:
+ self.default_proxies = proxies or {}
+ self.default_proxy = proxy or None
+ self.default_proxy_auth = proxy_auth or None
+ self.default_timeout = timeout
+ self.default_headers = headers or {}
+ self.default_retries = retries
+ self.default_retry_delay = retry_delay
+ self.default_follow_redirects = follow_redirects
+ self.default_max_redirects = max_redirects
+ self.default_verify = verify
+ self.default_cert = cert
+ self.default_http3 = http3
+ self.selector_config = selector_config or {}
+
+ self._curl_session: Optional[CurlSession] = None
+ self._async_curl_session: Optional[AsyncCurlSession] = None
+
+ def _merge_request_args(self, **kwargs) -> Dict[str, Any]:
+ """Merge request-specific arguments with default session arguments."""
+ url = kwargs.pop("url")
+ request_args = {}
+
+ headers = self.get_with_precedence(kwargs, "headers", self.default_headers)
+ stealth = self.get_with_precedence(kwargs, "stealth", self.stealth)
+ impersonate = self.get_with_precedence(
+ kwargs, "impersonate", self.default_impersonate
+ )
+
+ if self.get_with_precedence(
+ kwargs, "http3", self.default_http3
+ ): # pragma: no cover
+ request_args["http_version"] = CurlHttpVersion.V3ONLY
+ if impersonate:
+ log.warning(
+ "The argument `http3` might cause errors if used with `impersonate` argument, try switching it off if you encounter any curl errors."
+ )
+
+ request_args.update(
+ {
+ "url": url,
+ # Curl automatically generates the suitable browser headers when you use `impersonate`
+ "headers": self._headers_job(url, headers, stealth, bool(impersonate)),
+ "proxies": self.get_with_precedence(
+ kwargs, "proxies", self.default_proxies
+ ),
+ "proxy": self.get_with_precedence(kwargs, "proxy", self.default_proxy),
+ "proxy_auth": self.get_with_precedence(
+ kwargs, "proxy_auth", self.default_proxy_auth
+ ),
+ "timeout": self.get_with_precedence(
+ kwargs, "timeout", self.default_timeout
+ ),
+ "allow_redirects": self.get_with_precedence(
+ kwargs, "allow_redirects", self.default_follow_redirects
+ ),
+ "max_redirects": self.get_with_precedence(
+ kwargs, "max_redirects", self.default_max_redirects
+ ),
+ "verify": self.get_with_precedence(
+ kwargs, "verify", self.default_verify
+ ),
+ "cert": self.get_with_precedence(kwargs, "cert", self.default_cert),
+ "impersonate": impersonate,
+ **{
+ k: v
+ for k, v in kwargs.items()
+ if v
+ not in (
+ _UNSET,
+ None,
+ )
+ }, # Add any remaining parameters (after all known ones are popped)
+ }
+ )
+ return request_args
+
+ def _headers_job(
+ self,
+ url,
+ headers: Optional[Dict],
+ stealth: Optional[bool],
+ impersonate_enabled: bool,
+ ) -> Dict:
  """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
  finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.

  :param headers: Current headers in the request if the user passed any
+ :param stealth: Whether to enable the `stealthy_headers` argument to this request or not. If `None`, it defaults to the session default value.
+ :param impersonate_enabled: Whether the browser impersonation is enabled or not.
  :return: A dictionary of the new headers.
  """
- headers = headers or {}
+ # Handle headers - if it was _UNSET, use default_headers
+ if headers is _UNSET:
+ headers = self.default_headers.copy()
+ else:
+ # Merge session headers with request headers, request takes precedence
+ headers = {**self.default_headers, **(headers or {})}
+
  headers_keys = set(map(str.lower, headers.keys()))
+ if stealth:
+ if "referer" not in headers_keys:
+ headers.update({"referer": generate_convincing_referer(url)})
+
+ if impersonate_enabled: # Curl will generate the suitable headers
+ return headers

- if self.stealth:
  extra_headers = generate_headers(browser_mode=False)
- # Don't overwrite user supplied headers
- extra_headers = {key: value for key, value in extra_headers.items() if key.lower() not in headers_keys}
+ # Don't overwrite user-supplied headers
+ extra_headers = {
+ key: value
+ for key, value in extra_headers.items()
+ if key.lower() not in headers_keys
+ }
  headers.update(extra_headers)
- if 'referer' not in headers_keys:
- headers.update({'referer': generate_convincing_referer(self.url)})

- elif 'user-agent' not in headers_keys:
- headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
- log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
+ elif "user-agent" not in headers_keys and not impersonate_enabled:
+ headers["User-Agent"] = __default_useragent__
+ log.debug(
+ f"Can't find useragent in headers so '{headers['User-Agent']}' was used."
+ )

  return headers

- def _prepare_response(self, response: httpxResponse) -> Response:
- """Takes httpx response and generates `Response` object from it.
-
- :param response: httpx response object
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
- """
- return Response(
- url=str(response.url),
- text=response.text,
- body=response.content,
- status=response.status_code,
- reason=response.reason_phrase,
- encoding=response.encoding or 'utf-8',
- cookies=dict(response.cookies),
- headers=dict(response.headers),
- request_headers=dict(response.request.headers),
- method=response.request.method,
- history=[self._prepare_response(redirection) for redirection in response.history],
- **self.adaptor_arguments
- )
+ def __enter__(self):
+ """Creates and returns a new synchronous Fetcher Session"""
+ if self._curl_session:
+ raise RuntimeError(
+ "This FetcherSession instance already has an active synchronous session. "
+ "Create a new FetcherSession instance for a new independent session, "
+ "or use the current instance sequentially after the previous context has exited."
+ )
+ if (
+ self._async_curl_session
+ ): # Prevent mixing if async is active from this instance
+ raise RuntimeError(
+ "This FetcherSession instance has an active asynchronous session. "
+ "Cannot enter a synchronous context simultaneously with the same manager instance."
+ )

- def _make_request(self, method: str, **kwargs) -> Response:
- headers = self._headers_job(kwargs.pop('headers', {}))
- with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
- request = getattr(client, method)(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
- return self._prepare_response(request)
+ self._curl_session = CurlSession()
+ return self

- async def _async_make_request(self, method: str, **kwargs) -> Response:
- headers = self._headers_job(kwargs.pop('headers', {}))
- async with httpx.AsyncClient(proxy=self.proxy, transport=httpx.AsyncHTTPTransport(retries=self.retries)) as client:
- request = await getattr(client, method)(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
- return self._prepare_response(request)
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ """Closes the active synchronous session managed by this instance, if any."""
+ if self._curl_session:
+ self._curl_session.close()
+ self._curl_session = None

- def get(self, **kwargs: Dict) -> Response:
- """Make basic HTTP GET request for you but with some added flavors.
+ async def __aenter__(self):
+ """Creates and returns a new asynchronous Session."""
+ if self._async_curl_session:
+ raise RuntimeError(
+ "This FetcherSession instance already has an active asynchronous session. "
+ "Create a new FetcherSession instance for a new independent session, "
+ "or use the current instance sequentially after the previous context has exited."
+ )
+ if self._curl_session: # Prevent mixing if sync is active from this instance
+ raise RuntimeError(
+ "This FetcherSession instance has an active synchronous session. "
+ "Cannot enter an asynchronous context simultaneously with the same manager instance."
+ )

- :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
- """
- return self._make_request('get', **kwargs)
+ self._async_curl_session = AsyncCurlSession()
+ return self

- async def async_get(self, **kwargs: Dict) -> Response:
- """Make basic async HTTP GET request for you but with some added flavors.
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
+ """Closes the active asynchronous session managed by this instance, if any."""
+ if self._async_curl_session:
+ await self._async_curl_session.close()
+ self._async_curl_session = None

- :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+ def __make_request(
+ self,
+ method: SUPPORTED_HTTP_METHODS,
+ request_args: Dict[str, Any],
+ max_retries: int,
+ retry_delay: int,
+ selector_config: Optional[Dict] = None,
+ ) -> Response:
  """
- return await self._async_make_request('get', **kwargs)
+ Perform an HTTP request using the configured session.
+
+ :param method: HTTP method to be used, supported methods are ["GET", "POST", "PUT", "DELETE"]
+ :param url: Target URL for the request.
+ :param request_args: Arguments to be passed to the session's `request()` method.
+ :param max_retries: Maximum number of retries for the request.
+ :param retry_delay: Number of seconds to wait between retries.
+ :param selector_config: Arguments passed when creating the final Selector class.
+ :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
+ """
+ session = self._curl_session
+ if session is True and not any(
+ (self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)
+ ):
+ # For usage inside FetcherClient
+ # It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
+ session = CurlSession()
+
+ if session:
+ for attempt in range(max_retries):
+ try:
+ response = session.request(method, **request_args)
+ # response.raise_for_status() # Retry responses with a status code between 200-400
+ return ResponseFactory.from_http_request(response, selector_config)
+ except CurlError as e: # pragma: no cover
+ if attempt < max_retries - 1:
+ log.error(
+ f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds..."
+ )
+ time_sleep(retry_delay)
+ else:
+ log.error(f"Failed after {max_retries} attempts: {e}")
+ raise # Raise the exception if all retries fail

- def post(self, **kwargs: Dict) -> Response:
- """Make basic HTTP POST request for you but with some added flavors.
+ raise RuntimeError("No active session available.") # pragma: no cover

- :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+ async def __make_async_request(
+ self,
+ method: SUPPORTED_HTTP_METHODS,
+ request_args: Dict[str, Any],
+ max_retries: int,
+ retry_delay: int,
+ selector_config: Optional[Dict] = None,
+ ) -> Response:
  """
- return self._make_request('post', **kwargs)
+ Perform an HTTP request using the configured session.

- async def async_post(self, **kwargs: Dict) -> Response:
- """Make basic async HTTP POST request for you but with some added flavors.
+ :param method: HTTP method to be used, supported methods are ["GET", "POST", "PUT", "DELETE"]
+ :param url: Target URL for the request.
+ :param request_args: Arguments to be passed to the session's `request()` method.
+ :param max_retries: Maximum number of retries for the request.
+ :param retry_delay: Number of seconds to wait between retries.
+ :param selector_config: Arguments passed when creating the final Selector class.
+ :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
+ """
+ session = self._async_curl_session
+ if session is True and not any(
+ (self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)
+ ):
+ # For usage inside the ` AsyncFetcherClient ` class, and that's for several reasons
+ # 1. It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
+ # 2. `curl_cffi` doesn't support making async requests without sessions
+ # 3. Using a single session for many requests at the same time in async doesn't sit well with curl_cffi.
+ session = AsyncCurlSession()
+
+ if session:
+ for attempt in range(max_retries):
+ try:
+ response = await session.request(method, **request_args)
+ # response.raise_for_status() # Retry responses with a status code between 200-400
+ return ResponseFactory.from_http_request(response, selector_config)
+ except CurlError as e: # pragma: no cover
+ if attempt < max_retries - 1:
+ log.error(
+ f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds..."
+ )
+ await asyncio_sleep(retry_delay)
+ else:
+ log.error(f"Failed after {max_retries} attempts: {e}")
+ raise # Raise the exception if all retries fail
+
+ raise RuntimeError("No active session available.") # pragma: no cover

- :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+ @staticmethod
+ def get_with_precedence(kwargs, key, default_value):
+ """Get value with request-level priority over session-level"""
+ request_value = kwargs.pop(key, _UNSET)
+ return request_value if request_value is not _UNSET else default_value
+
+ def __prepare_and_dispatch(
+ self,
+ method: SUPPORTED_HTTP_METHODS,
+ stealth: Optional[bool] = None,
+ **kwargs,
+ ) -> Response | Awaitable[Response]:
  """
- return await self._async_make_request('post', **kwargs)
+ Internal dispatcher. Prepares arguments and calls sync or async request helper.

- def delete(self, **kwargs: Dict) -> Response:
- """Make basic HTTP DELETE request for you but with some added flavors.
+ :param method: HTTP method to be used, supported methods are ["GET", "POST", "PUT", "DELETE"]
+ :param stealth: Whether to enable the `stealthy_headers` argument to this request or not. If `None`, it defaults to the session default value.
+ :param url: Target URL for the request.
+ :param kwargs: Additional request-specific arguments.
+ :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
+ """
+ stealth = self.stealth if stealth is None else stealth

- :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+ selector_config = kwargs.pop("selector_config", {}) or self.selector_config
+ max_retries = self.get_with_precedence(kwargs, "retries", self.default_retries)
+ retry_delay = self.get_with_precedence(
+ kwargs, "retry_delay", self.default_retry_delay
+ )
+ request_args = self._merge_request_args(stealth=stealth, **kwargs)
+ if self._curl_session:
+ return self.__make_request(
+ method, request_args, max_retries, retry_delay, selector_config
+ )
+ elif self._async_curl_session:
+ # The returned value is a Coroutine
+ return self.__make_async_request(
+ method, request_args, max_retries, retry_delay, selector_config
+ )
+
+ raise RuntimeError("No active session available.")
+
+ def get(
+ self,
+ url: str,
+ params: Optional[Dict | List | Tuple] = None,
+ headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
+ cookies: Optional[CookieTypes] = None,
+ timeout: Optional[int | float] = _UNSET,
+ follow_redirects: Optional[bool] = _UNSET,
+ max_redirects: Optional[int] = _UNSET,
+ retries: Optional[int] = _UNSET,
+ retry_delay: Optional[int] = _UNSET,
+ proxies: Optional[ProxySpec] = _UNSET,
+ proxy: Optional[str] = _UNSET,
+ proxy_auth: Optional[Tuple[str, str]] = _UNSET,
+ auth: Optional[Tuple[str, str]] = None,
+ verify: Optional[bool] = _UNSET,
+ cert: Optional[str | Tuple[str, str]] = _UNSET,
+ impersonate: Optional[BrowserTypeLiteral] = _UNSET,
+ http3: Optional[bool] = _UNSET,
+ stealthy_headers: Optional[bool] = _UNSET,
+ **kwargs,
+ ) -> Response | Awaitable[Response]:
  """
- return self._make_request('delete', **kwargs)
+ Perform a GET request.

- async def async_delete(self, **kwargs: Dict) -> Response:
- """Make basic async HTTP DELETE request for you but with some added flavors.
+ :param url: Target URL for the request.
+ :param params: Query string parameters for the request.
+ :param headers: Headers to include in the request.
+ :param cookies: Cookies to use in the request.
+ :param timeout: Number of seconds to wait before timing out.
+ :param follow_redirects: Whether to follow redirects. Defaults to True.
+ :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+ :param retries: Number of retry attempts. Defaults to 3.
+ :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+ :param proxies: Dict of proxies to use.
+ :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+ Cannot be used together with the `proxies` parameter.
+ :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
+ :param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
+ :param verify: Whether to verify HTTPS certificates.
+ :param cert: Tuple of (cert, key) filenames for the client certificate.
+ :param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
+ :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+ :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
+ :param kwargs: Additional keyword arguments to pass to the [`curl_cffi.requests.Session().request()`, `curl_cffi.requests.AsyncSession().request()`] method.
+ :return: A `Response` object or an awaitable for async.
+ """
+ request_args = {
+ "url": url,
+ "params": params,
+ "headers": headers,
+ "cookies": cookies,
+ "timeout": timeout,
+ "retry_delay": retry_delay,
+ "allow_redirects": follow_redirects,
+ "max_redirects": max_redirects,
+ "retries": retries,
+ "proxies": proxies,
+ "proxy": proxy,
+ "proxy_auth": proxy_auth,
+ "auth": auth,
+ "verify": verify,
+ "cert": cert,
+ "impersonate": impersonate,
+ "http3": http3,
+ **kwargs,
+ }
+ return self.__prepare_and_dispatch(
+ "GET", stealth=stealthy_headers, **request_args
+ )

- :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+ def post(
+ self,
+ url: str,
+ data: Optional[Dict | str] = None,
+ json: Optional[Dict | List] = None,
+ headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
+ params: Optional[Dict | List | Tuple] = None,
+ cookies: Optional[CookieTypes] = None,
+ timeout: Optional[int | float] = _UNSET,
+ follow_redirects: Optional[bool] = _UNSET,
+ max_redirects: Optional[int] = _UNSET,
+ retries: Optional[int] = _UNSET,
+ retry_delay: Optional[int] = _UNSET,
+ proxies: Optional[ProxySpec] = _UNSET,
+ proxy: Optional[str] = _UNSET,
+ proxy_auth: Optional[Tuple[str, str]] = _UNSET,
+ auth: Optional[Tuple[str, str]] = None,
+ verify: Optional[bool] = _UNSET,
+ cert: Optional[str | Tuple[str, str]] = _UNSET,
+ impersonate: Optional[BrowserTypeLiteral] = _UNSET,
+ http3: Optional[bool] = _UNSET,
+ stealthy_headers: Optional[bool] = _UNSET,
+ **kwargs,
+ ) -> Response | Awaitable[Response]:
  """
- return await self._async_make_request('delete', **kwargs)
+ Perform a POST request.

- def put(self, **kwargs: Dict) -> Response:
- """Make basic HTTP PUT request for you but with some added flavors.
+ :param url: Target URL for the request.
+ :param data: Form data to include in the request body.
+ :param json: A JSON serializable object to include in the body of the request.
+ :param headers: Headers to include in the request.
+ :param params: Query string parameters for the request.
+ :param cookies: Cookies to use in the request.
+ :param timeout: Number of seconds to wait before timing out.
+ :param follow_redirects: Whether to follow redirects. Defaults to True.
+ :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+ :param retries: Number of retry attempts. Defaults to 3.
+ :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+ :param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
+ :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+ Cannot be used together with the `proxies` parameter.
+ :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
+ :param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
+ :param verify: Whether to verify HTTPS certificates. Defaults to True.
+ :param cert: Tuple of (cert, key) filenames for the client certificate.
+ :param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
+ :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+ :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
+ :param kwargs: Additional keyword arguments to pass to the [`curl_cffi.requests.Session().request()`, `curl_cffi.requests.AsyncSession().request()`] method.
+ :return: A `Response` object or an awaitable for async.
+ """
+ request_args = {
+ "url": url,
+ "data": data,
+ "json": json,
+ "headers": headers,
+ "params": params,
+ "cookies": cookies,
+ "timeout": timeout,
+ "retry_delay": retry_delay,
+ "proxy": proxy,
+ "impersonate": impersonate,
+ "allow_redirects": follow_redirects,
+ "max_redirects": max_redirects,
+ "retries": retries,
+ "proxies": proxies,
+ "proxy_auth": proxy_auth,
+ "auth": auth,
+ "verify": verify,
+ "cert": cert,
+ "http3": http3,
+ **kwargs,
+ }
+ return self.__prepare_and_dispatch(
+ "POST", stealth=stealthy_headers, **request_args
+ )

- :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+ def put(
+ self,
+ url: str,
+ data: Optional[Dict | str] = None,
+ json: Optional[Dict | List] = None,
+ headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
+ params: Optional[Dict | List | Tuple] = None,
+ cookies: Optional[CookieTypes] = None,
+ timeout: Optional[int | float] = _UNSET,
+ follow_redirects: Optional[bool] = _UNSET,
+ max_redirects: Optional[int] = _UNSET,
+ retries: Optional[int] = _UNSET,
+ retry_delay: Optional[int] = _UNSET,
+ proxies: Optional[ProxySpec] = _UNSET,
+ proxy: Optional[str] = _UNSET,
+ proxy_auth: Optional[Tuple[str, str]] = _UNSET,
+ auth: Optional[Tuple[str, str]] = None,
+ verify: Optional[bool] = _UNSET,
+ cert: Optional[str | Tuple[str, str]] = _UNSET,
+ impersonate: Optional[BrowserTypeLiteral] = _UNSET,
+ http3: Optional[bool] = _UNSET,
+ stealthy_headers: Optional[bool] = _UNSET,
+ **kwargs,
+ ) -> Response | Awaitable[Response]:
  """
- return self._make_request('put', **kwargs)
+ Perform a PUT request.

- async def async_put(self, **kwargs: Dict) -> Response:
- """Make basic async HTTP PUT request for you but with some added flavors.
+ :param url: Target URL for the request.
+ :param data: Form data to include in the request body.
+ :param json: A JSON serializable object to include in the body of the request.
+ :param headers: Headers to include in the request.
+ :param params: Query string parameters for the request.
+ :param cookies: Cookies to use in the request.
+ :param timeout: Number of seconds to wait before timing out.
+ :param follow_redirects: Whether to follow redirects. Defaults to True.
+ :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+ :param retries: Number of retry attempts. Defaults to 3.
+ :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+ :param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
+ :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+ Cannot be used together with the `proxies` parameter.
+ :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
+ :param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
+ :param verify: Whether to verify HTTPS certificates. Defaults to True.
+ :param cert: Tuple of (cert, key) filenames for the client certificate.
+ :param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
+ :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+ :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
+ :param kwargs: Additional keyword arguments to pass to the [`curl_cffi.requests.Session().request()`, `curl_cffi.requests.AsyncSession().request()`] method.
+ :return: A `Response` object or an awaitable for async.
+ """
+ request_args = {
+ "url": url,
+ "data": data,
+ "json": json,
+ "headers": headers,
+ "params": params,
+ "cookies": cookies,
+ "timeout": timeout,
+ "retry_delay": retry_delay,
+ "proxy": proxy,
+ "impersonate": impersonate,
+ "allow_redirects": follow_redirects,
+ "max_redirects": max_redirects,
+ "retries": retries,
+ "proxies": proxies,
+ "proxy_auth": proxy_auth,
+ "auth": auth,
+ "verify": verify,
+ "cert": cert,
+ "http3": http3,
+ **kwargs,
+ }
+ return self.__prepare_and_dispatch(
+ "PUT", stealth=stealthy_headers, **request_args
+ )
+
+ def delete(
+ self,
+ url: str,
+ data: Optional[Dict | str] = None,
+ json: Optional[Dict | List] = None,
+ headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
+ params: Optional[Dict | List | Tuple] = None,
+ cookies: Optional[CookieTypes] = None,
+ timeout: Optional[int | float] = _UNSET,
+ follow_redirects: Optional[bool] = _UNSET,
+ max_redirects: Optional[int] = _UNSET,
+ retries: Optional[int] = _UNSET,
+ retry_delay: Optional[int] = _UNSET,
+ proxies: Optional[ProxySpec] = _UNSET,
+ proxy: Optional[str] = _UNSET,
+ proxy_auth: Optional[Tuple[str, str]] = _UNSET,
+ auth: Optional[Tuple[str, str]] = None,
+ verify: Optional[bool] = _UNSET,
+ cert: Optional[str | Tuple[str, str]] = _UNSET,
+ impersonate: Optional[BrowserTypeLiteral] = _UNSET,
+ http3: Optional[bool] = _UNSET,
+ stealthy_headers: Optional[bool] = _UNSET,
+ **kwargs,
+ ) -> Response | Awaitable[Response]:
+ """
+ Perform a DELETE request.

- :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+ :param url: Target URL for the request.
+ :param data: Form data to include in the request body.
+ :param json: A JSON serializable object to include in the body of the request.
+ :param headers: Headers to include in the request.
+ :param params: Query string parameters for the request.
+ :param cookies: Cookies to use in the request.
+ :param timeout: Number of seconds to wait before timing out.
+ :param follow_redirects: Whether to follow redirects. Defaults to True.
+ :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+ :param retries: Number of retry attempts. Defaults to 3.
+ :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+ :param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
+ :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+ Cannot be used together with the `proxies` parameter.
+ :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
+ :param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
+ :param verify: Whether to verify HTTPS certificates. Defaults to True.
+ :param cert: Tuple of (cert, key) filenames for the client certificate.
+ :param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
+ :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+ :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
+ :param kwargs: Additional keyword arguments to pass to the [`curl_cffi.requests.Session().request()`, `curl_cffi.requests.AsyncSession().request()`] method.
+ :return: A `Response` object or an awaitable for async.
  """
- return await self._async_make_request('put', **kwargs)
+ request_args = {
+ "url": url,
+ # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
+ # But some websites accept it, it depends on the implementation used.
+ "data": data,
+ "json": json,
+ "headers": headers,
+ "params": params,
+ "cookies": cookies,
+ "timeout": timeout,
+ "retry_delay": retry_delay,
+ "proxy": proxy,
+ "impersonate": impersonate,
+ "allow_redirects": follow_redirects,
+ "max_redirects": max_redirects,
+ "retries": retries,
+ "proxies": proxies,
+ "proxy_auth": proxy_auth,
+ "auth": auth,
+ "verify": verify,
+ "cert": cert,
+ "http3": http3,
+ **kwargs,
+ }
+ return self.__prepare_and_dispatch(
+ "DELETE", stealth=stealthy_headers, **request_args
+ )
+
+
+ class FetcherClient(FetcherSession):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.__enter__ = None
+ self.__exit__ = None
+ self.__aenter__ = None
+ self.__aexit__ = None
+ self._curl_session = True
+
+
+ class AsyncFetcherClient(FetcherSession):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.__enter__ = None
+ self.__exit__ = None
+ self.__aenter__ = None
+ self.__aexit__ = None
+ self._async_curl_session = True
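
To make the new surface concrete, here is a minimal usage sketch based only on the signatures visible in this hunk. The import path is an assumption inferred from the file list above (the classes may also be re-exported elsewhere in the package), and the URLs are placeholders.

# Sketch only: inferred from the diff above, not from official docs.
from scrapling.engines.static import FetcherSession, FetcherClient  # assumed path

# One-off client: the context-manager hooks are set to None and
# `_curl_session = True`, so __make_request builds a fresh CurlSession per call.
fetcher = FetcherClient(timeout=30, retries=3)
page = fetcher.get("https://example.com", stealthy_headers=True)

# Reusable session: one CurlSession shared across requests, closed on exit.
with FetcherSession(follow_redirects=True) as session:
    page = session.get("https://example.com")
    page = session.post("https://example.com/api", json={"q": "demo"})

# Async flavor: inside `async with`, the same methods return awaitables
# (dispatched to __make_async_request), e.g.:
# async with FetcherSession() as session:
#     page = await session.get("https://example.com")

Note the design choice visible in the diff: request methods use the `_UNSET` sentinel rather than `None` for defaults, so `get_with_precedence` can distinguish "argument omitted, use the session default" from an explicitly passed falsy value.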