scrapling 0.2.99__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +759 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +644 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +170 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +239 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.1.dist-info/METADATA +411 -0
  30. scrapling-0.3.1.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
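The file list shows the engine layer being reorganized in 0.3.1: `scrapling/engines/camo.py` and `scrapling/engines/pw.py` are removed in favor of the new `scrapling/engines/_browsers/` package, `storage_adaptors.py` becomes `storage.py`, and `scrapling/fetchers.py` is rewritten to delegate to session classes. As a rough orientation before the diff below, here is a minimal, hedged sketch of that session-based layer. The class names, the context-manager usage, and the keyword arguments are taken directly from the fetchers.py diff that follows; any defaults or options beyond those shown there are assumptions to verify against the released package.

```python
# Hedged sketch of the session classes that 0.3.1's fetchers.py delegates to.
# Names and arguments come from the diff below; everything else is assumed.
from scrapling.engines import StealthySession, DynamicSession

# Camoufox-backed stealthy session (replaces the old CamoufoxEngine)
with StealthySession(headless=True, solve_cloudflare=True, max_pages=1) as session:
    page = session.fetch("https://example.com")
    print(page.status)  # Response exposes status/reason/cookies/headers per the docstrings

# Playwright-backed session (replaces the old PlaywrightEngine)
with DynamicSession(headless=True, stealth=True) as session:
    page = session.fetch("https://example.com")
```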
scrapling/fetchers.py CHANGED
@@ -1,289 +1,135 @@
1
- from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
2
- SelectorWaitStates, Union)
3
- from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
4
- check_if_engine_usable)
1
+ from scrapling.core._types import (
2
+ Callable,
3
+ Dict,
4
+ List,
5
+ Optional,
6
+ SelectorWaitStates,
7
+ Iterable,
8
+ )
9
+ from scrapling.engines import (
10
+ FetcherSession,
11
+ StealthySession,
12
+ AsyncStealthySession,
13
+ DynamicSession,
14
+ AsyncDynamicSession,
15
+ FetcherClient as _FetcherClient,
16
+ AsyncFetcherClient as _AsyncFetcherClient,
17
+ )
5
18
  from scrapling.engines.toolbelt import BaseFetcher, Response
6
19
 
20
+ __FetcherClientInstance__ = _FetcherClient()
21
+ __AsyncFetcherClientInstance__ = _AsyncFetcherClient()
7
22
 
8
- class Fetcher(BaseFetcher):
9
- """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on httpx.
10
-
11
- Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
12
- """
13
- @classmethod
14
- def get(
15
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
16
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
17
- """Make basic HTTP GET request for you but with some added flavors.
18
-
19
- :param url: Target url.
20
- :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
21
- :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
22
- :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
23
- create a referer header as if this request had came from Google's search of this URL's domain.
24
- :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
25
- :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
26
- :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
27
- :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
28
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
29
- """
30
- if not custom_config:
31
- custom_config = {}
32
- elif not isinstance(custom_config, dict):
33
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
34
-
35
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
36
- response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).get(**kwargs)
37
- return response_object
38
-
39
- @classmethod
40
- def post(
41
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
42
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
43
- """Make basic HTTP POST request for you but with some added flavors.
44
-
45
- :param url: Target url.
46
- :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
47
- :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
48
- :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
49
- create a referer header as if this request came from Google's search of this URL's domain.
50
- :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
51
- :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
52
- :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
53
- :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
54
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
55
- """
56
- if not custom_config:
57
- custom_config = {}
58
- elif not isinstance(custom_config, dict):
59
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
60
-
61
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
62
- response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).post(**kwargs)
63
- return response_object
64
-
65
- @classmethod
66
- def put(
67
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
68
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
69
- """Make basic HTTP PUT request for you but with some added flavors.
70
23
 
71
- :param url: Target url
72
- :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
73
- :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
74
- :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
75
- create a referer header as if this request came from Google's search of this URL's domain.
76
- :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
77
- :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
78
- :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
79
- :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
80
-
81
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
82
- """
83
- if not custom_config:
84
- custom_config = {}
85
- elif not isinstance(custom_config, dict):
86
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
87
-
88
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
89
- response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).put(**kwargs)
90
- return response_object
91
-
92
- @classmethod
93
- def delete(
94
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
95
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
96
- """Make basic HTTP DELETE request for you but with some added flavors.
97
-
98
- :param url: Target url
99
- :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
100
- :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
101
- :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
102
- create a referer header as if this request came from Google's search of this URL's domain.
103
- :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
104
- :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
105
- :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
106
- :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
107
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
108
- """
109
- if not custom_config:
110
- custom_config = {}
111
- elif not isinstance(custom_config, dict):
112
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
113
-
114
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
115
- response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).delete(**kwargs)
116
- return response_object
117
-
118
-
119
- class AsyncFetcher(Fetcher):
120
- @classmethod
121
- async def get(
122
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
123
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
124
- """Make basic HTTP GET request for you but with some added flavors.
125
-
126
- :param url: Target url.
127
- :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
128
- :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
129
- :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
130
- create a referer header as if this request had came from Google's search of this URL's domain.
131
- :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
132
- :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
133
- :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
134
- :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
135
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
136
- """
137
- if not custom_config:
138
- custom_config = {}
139
- elif not isinstance(custom_config, dict):
140
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
141
-
142
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
143
- response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_get(**kwargs)
144
- return response_object
145
-
146
- @classmethod
147
- async def post(
148
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
149
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
150
- """Make basic HTTP POST request for you but with some added flavors.
151
-
152
- :param url: Target url.
153
- :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
154
- :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
155
- :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
156
- create a referer header as if this request came from Google's search of this URL's domain.
157
- :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
158
- :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
159
- :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
160
- :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
161
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
162
- """
163
- if not custom_config:
164
- custom_config = {}
165
- elif not isinstance(custom_config, dict):
166
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
167
-
168
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
169
- response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_post(**kwargs)
170
- return response_object
171
-
172
- @classmethod
173
- async def put(
174
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
175
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
176
- """Make basic HTTP PUT request for you but with some added flavors.
177
-
178
- :param url: Target url
179
- :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
180
- :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
181
- :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
182
- create a referer header as if this request came from Google's search of this URL's domain.
183
- :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
184
- :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
185
- :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
186
- :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
187
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
188
- """
189
- if not custom_config:
190
- custom_config = {}
191
- elif not isinstance(custom_config, dict):
192
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
24
+ class Fetcher(BaseFetcher):
25
+ """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""
193
26
 
194
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
195
- response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_put(**kwargs)
196
- return response_object
27
+ get = __FetcherClientInstance__.get
28
+ post = __FetcherClientInstance__.post
29
+ put = __FetcherClientInstance__.put
30
+ delete = __FetcherClientInstance__.delete
197
31
 
198
- @classmethod
199
- async def delete(
200
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
201
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
202
- """Make basic HTTP DELETE request for you but with some added flavors.
203
32
 
204
- :param url: Target url
205
- :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
206
- :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
207
- :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
208
- create a referer header as if this request came from Google's search of this URL's domain.
209
- :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
210
- :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
211
- :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
212
- :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
213
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
214
- """
215
- if not custom_config:
216
- custom_config = {}
217
- elif not isinstance(custom_config, dict):
218
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
33
+ class AsyncFetcher(BaseFetcher):
34
+ """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""
219
35
 
220
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
221
- response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_delete(**kwargs)
222
- return response_object
36
+ get = __AsyncFetcherClientInstance__.get
37
+ post = __AsyncFetcherClientInstance__.post
38
+ put = __AsyncFetcherClientInstance__.put
39
+ delete = __AsyncFetcherClientInstance__.delete
223
40
 
224
41
 
225
42
  class StealthyFetcher(BaseFetcher):
226
- """A `Fetcher` class type that is completely stealthy fetcher that uses a modified version of Firefox.
43
+ """A `Fetcher` class type that is a completely stealthy fetcher that uses a modified version of Firefox.
227
44
 
228
- It works as real browsers passing almost all online tests/protections based on Camoufox.
229
- Other added flavors include setting the faked OS fingerprints to match the user's OS and the referer of every request is set as if this request came from Google's search of this URL's domain.
45
+ It works as real browsers passing almost all online tests/protections based on Camoufox.
46
+ Other added flavors include setting the faked OS fingerprints to match the user's OS, and the referer of every request is set as if this request came from Google's search of this URL's domain.
230
47
  """
48
+
231
49
  @classmethod
232
50
  def fetch(
233
- cls, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
234
- block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None, wait: Optional[int] = 0,
235
- timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
236
- wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
237
- proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
238
- custom_config: Dict = None, additional_arguments: Dict = None
51
+ cls,
52
+ url: str,
53
+ headless: bool = True, # noqa: F821
54
+ block_images: bool = False,
55
+ disable_resources: bool = False,
56
+ block_webrtc: bool = False,
57
+ allow_webgl: bool = True,
58
+ network_idle: bool = False,
59
+ humanize: bool | float = True,
60
+ solve_cloudflare: bool = False,
61
+ wait: int | float = 0,
62
+ timeout: int | float = 30000,
63
+ page_action: Optional[Callable] = None,
64
+ wait_selector: Optional[str] = None,
65
+ init_script: Optional[str] = None,
66
+ addons: Optional[List[str]] = None,
67
+ wait_selector_state: SelectorWaitStates = "attached",
68
+ cookies: Optional[List[Dict]] = None,
69
+ google_search: bool = True,
70
+ extra_headers: Optional[Dict[str, str]] = None,
71
+ proxy: Optional[str | Dict[str, str]] = None,
72
+ os_randomize: bool = False,
73
+ disable_ads: bool = False,
74
+ geoip: bool = False,
75
+ custom_config: Optional[Dict] = None,
76
+ additional_args: Optional[Dict] = None,
239
77
  ) -> Response:
240
78
  """
241
79
  Opens up a browser and do your request based on your chosen options below.
242
80
 
243
81
  :param url: Target url.
244
- :param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
82
+ :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
245
83
  :param block_images: Prevent the loading of images through Firefox preferences.
246
84
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
247
- :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
85
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
248
86
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
249
87
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
250
88
  :param block_webrtc: Blocks WebRTC entirely.
89
+ :param cookies: Set cookies for the next request.
251
90
  :param addons: List of Firefox addons to use. Must be paths to extracted addons.
252
- :param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
253
91
  :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
254
- :param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
255
- :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
256
- It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
92
+ :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
93
+ :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
257
94
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
95
+ :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
258
96
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
259
- :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
260
- :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning `Response` object.
97
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
98
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
261
99
  :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
262
- :param wait_selector: Wait for a specific css selector to be in a specific state.
263
- :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
264
- :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
100
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
101
+ :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
102
+ :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
103
+ It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
104
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
105
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
265
106
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
266
107
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
267
108
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
268
- :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings and it takes higher priority than Scrapling's settings.
269
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
109
+ :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
110
+ :return: A `Response` object.
270
111
  """
271
112
  if not custom_config:
272
113
  custom_config = {}
273
114
  elif not isinstance(custom_config, dict):
274
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
115
+ ValueError(
116
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
117
+ )
275
118
 
276
- engine = CamoufoxEngine(
119
+ with StealthySession(
277
120
  wait=wait,
121
+ max_pages=1,
278
122
  proxy=proxy,
279
123
  geoip=geoip,
280
124
  addons=addons,
281
125
  timeout=timeout,
126
+ cookies=cookies,
282
127
  headless=headless,
283
128
  humanize=humanize,
284
129
  disable_ads=disable_ads,
285
130
  allow_webgl=allow_webgl,
286
131
  page_action=page_action,
132
+ init_script=init_script,
287
133
  network_idle=network_idle,
288
134
  block_images=block_images,
289
135
  block_webrtc=block_webrtc,
@@ -291,69 +137,98 @@ class StealthyFetcher(BaseFetcher):
291
137
  wait_selector=wait_selector,
292
138
  google_search=google_search,
293
139
  extra_headers=extra_headers,
140
+ solve_cloudflare=solve_cloudflare,
294
141
  disable_resources=disable_resources,
295
142
  wait_selector_state=wait_selector_state,
296
- adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
297
- additional_arguments=additional_arguments or {}
298
- )
299
- return engine.fetch(url)
143
+ selector_config={**cls._generate_parser_arguments(), **custom_config},
144
+ additional_args=additional_args or {},
145
+ ) as engine:
146
+ return engine.fetch(url)
300
147
 
301
148
  @classmethod
302
149
  async def async_fetch(
303
- cls, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
304
- block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None, wait: Optional[int] = 0,
305
- timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
306
- wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
307
- proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
308
- custom_config: Dict = None, additional_arguments: Dict = None
150
+ cls,
151
+ url: str,
152
+ headless: bool = True, # noqa: F821
153
+ block_images: bool = False,
154
+ disable_resources: bool = False,
155
+ block_webrtc: bool = False,
156
+ allow_webgl: bool = True,
157
+ network_idle: bool = False,
158
+ humanize: bool | float = True,
159
+ solve_cloudflare: bool = False,
160
+ wait: int | float = 0,
161
+ timeout: int | float = 30000,
162
+ page_action: Optional[Callable] = None,
163
+ wait_selector: Optional[str] = None,
164
+ init_script: Optional[str] = None,
165
+ addons: Optional[List[str]] = None,
166
+ wait_selector_state: SelectorWaitStates = "attached",
167
+ cookies: Optional[List[Dict]] = None,
168
+ google_search: bool = True,
169
+ extra_headers: Optional[Dict[str, str]] = None,
170
+ proxy: Optional[str | Dict[str, str]] = None,
171
+ os_randomize: bool = False,
172
+ disable_ads: bool = False,
173
+ geoip: bool = False,
174
+ custom_config: Optional[Dict] = None,
175
+ additional_args: Optional[Dict] = None,
309
176
  ) -> Response:
310
177
  """
311
178
  Opens up a browser and do your request based on your chosen options below.
312
179
 
313
180
  :param url: Target url.
314
- :param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
181
+ :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
315
182
  :param block_images: Prevent the loading of images through Firefox preferences.
316
183
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
317
- :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
184
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
318
185
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
319
186
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
320
187
  :param block_webrtc: Blocks WebRTC entirely.
188
+ :param cookies: Set cookies for the next request.
321
189
  :param addons: List of Firefox addons to use. Must be paths to extracted addons.
322
- :param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
323
190
  :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
324
- :param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
325
- :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
326
- It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
191
+ :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
192
+ :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
327
193
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
194
+ :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
328
195
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
329
- :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
330
- :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning `Response` object.
196
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
197
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
331
198
  :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
332
- :param wait_selector: Wait for a specific css selector to be in a specific state.
333
- :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
334
- :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
199
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
200
+ :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
201
+ :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
202
+ It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
203
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
204
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
335
205
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
336
206
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
337
207
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
338
- :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings and it takes higher priority than Scrapling's settings.
339
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
208
+ :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
209
+ :return: A `Response` object.
340
210
  """
341
211
  if not custom_config:
342
212
  custom_config = {}
343
213
  elif not isinstance(custom_config, dict):
344
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
214
+ ValueError(
215
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
216
+ )
345
217
 
346
- engine = CamoufoxEngine(
218
+ async with AsyncStealthySession(
347
219
  wait=wait,
220
+ max_pages=1,
348
221
  proxy=proxy,
349
222
  geoip=geoip,
350
223
  addons=addons,
351
224
  timeout=timeout,
225
+ cookies=cookies,
352
226
  headless=headless,
353
227
  humanize=humanize,
354
228
  disable_ads=disable_ads,
355
229
  allow_webgl=allow_webgl,
356
230
  page_action=page_action,
231
+ init_script=init_script,
357
232
  network_idle=network_idle,
358
233
  block_images=block_images,
359
234
  block_webrtc=block_webrtc,
@@ -361,173 +236,204 @@ class StealthyFetcher(BaseFetcher):
361
236
  wait_selector=wait_selector,
362
237
  google_search=google_search,
363
238
  extra_headers=extra_headers,
239
+ solve_cloudflare=solve_cloudflare,
364
240
  disable_resources=disable_resources,
365
241
  wait_selector_state=wait_selector_state,
366
- adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
367
- additional_arguments=additional_arguments or {}
368
- )
369
- return await engine.async_fetch(url)
242
+ selector_config={**cls._generate_parser_arguments(), **custom_config},
243
+ additional_args=additional_args or {},
244
+ ) as engine:
245
+ return await engine.fetch(url)
370
246
 
371
247
 
372
- class PlayWrightFetcher(BaseFetcher):
248
+ class DynamicFetcher(BaseFetcher):
373
249
  """A `Fetcher` class type that provide many options, all of them are based on PlayWright.
374
250
 
375
251
  Using this Fetcher class, you can do requests with:
376
252
  - Vanilla Playwright without any modifications other than the ones you chose.
377
- - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress but it bypasses many online tests like bot.sannysoft.com
253
+ - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress, but it bypasses many online tests like bot.sannysoft.com
378
254
  Some of the things stealth mode does include:
379
255
  1) Patches the CDP runtime fingerprint.
380
256
  2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
381
257
  3) Using custom flags on launch to hide Playwright even more and make it faster.
382
- 4) Generates real browser's headers of the same type and same user OS then append it to the request.
383
- - Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
258
+ 4) Generates real browser's headers of the same type and same user OS, then append it to the request.
259
+ - Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher, and most of the options can be enabled on it.
384
260
  - NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
385
261
 
386
- > Note that these are the main options with PlayWright but it can be mixed together.
262
+ > Note that these are the main options with PlayWright, but it can be mixed.
387
263
  """
264
+
388
265
  @classmethod
389
266
  def fetch(
390
- cls, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
391
- useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000, wait: Optional[int] = 0,
392
- page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
393
- hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
394
- proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
395
- stealth: bool = False, real_chrome: bool = False,
396
- cdp_url: Optional[str] = None,
397
- nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
398
- custom_config: Dict = None
267
+ cls,
268
+ url: str,
269
+ headless: bool = True,
270
+ google_search: bool = True,
271
+ hide_canvas: bool = False,
272
+ disable_webgl: bool = False,
273
+ real_chrome: bool = False,
274
+ stealth: bool = False,
275
+ wait: int | float = 0,
276
+ page_action: Optional[Callable] = None,
277
+ proxy: Optional[str | Dict[str, str]] = None,
278
+ locale: str = "en-US",
279
+ extra_headers: Optional[Dict[str, str]] = None,
280
+ useragent: Optional[str] = None,
281
+ cdp_url: Optional[str] = None,
282
+ timeout: int | float = 30000,
283
+ disable_resources: bool = False,
284
+ wait_selector: Optional[str] = None,
285
+ init_script: Optional[str] = None,
286
+ cookies: Optional[Iterable[Dict]] = None,
287
+ network_idle: bool = False,
288
+ wait_selector_state: SelectorWaitStates = "attached",
289
+ custom_config: Optional[Dict] = None,
399
290
  ) -> Response:
400
291
  """Opens up a browser and do your request based on your chosen options below.
401
292
 
402
293
  :param url: Target url.
403
294
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
404
- :param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
295
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
405
296
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
406
297
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
407
298
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
299
+ :param cookies: Set cookies for the next request.
408
300
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
409
- :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
410
- :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning `Response` object.
411
- :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
301
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
302
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
412
303
  :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
413
- :param wait_selector: Wait for a specific css selector to be in a specific state.
414
- :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
304
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
305
+ :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
306
+ :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
307
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
415
308
  :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
416
- :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
309
+ :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
417
310
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
418
311
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
419
- :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
312
+ :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
313
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
420
314
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
421
315
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
422
- :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
423
- :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
424
- :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
425
316
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
426
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
317
+ :return: A `Response` object.
427
318
  """
428
319
  if not custom_config:
429
320
  custom_config = {}
430
321
  elif not isinstance(custom_config, dict):
431
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
322
+ raise ValueError(
323
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
324
+ )
432
325
 
433
- engine = PlaywrightEngine(
326
+ with DynamicSession(
434
327
  wait=wait,
435
328
  proxy=proxy,
436
329
  locale=locale,
437
330
  timeout=timeout,
438
331
  stealth=stealth,
439
332
  cdp_url=cdp_url,
333
+ cookies=cookies,
440
334
  headless=headless,
441
335
  useragent=useragent,
442
336
  real_chrome=real_chrome,
443
337
  page_action=page_action,
444
338
  hide_canvas=hide_canvas,
339
+ init_script=init_script,
445
340
  network_idle=network_idle,
446
341
  google_search=google_search,
447
342
  extra_headers=extra_headers,
448
343
  wait_selector=wait_selector,
449
344
  disable_webgl=disable_webgl,
450
- nstbrowser_mode=nstbrowser_mode,
451
- nstbrowser_config=nstbrowser_config,
452
345
  disable_resources=disable_resources,
453
346
  wait_selector_state=wait_selector_state,
454
- adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
455
- )
456
- return engine.fetch(url)
347
+ selector_config={**cls._generate_parser_arguments(), **custom_config},
348
+ ) as session:
349
+ return session.fetch(url)
457
350
 
458
351
  @classmethod
459
352
  async def async_fetch(
460
- cls, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
461
- useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000, wait: Optional[int] = 0,
462
- page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
463
- hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
464
- proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
465
- stealth: bool = False, real_chrome: bool = False,
466
- cdp_url: Optional[str] = None,
467
- nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
468
- custom_config: Dict = None
353
+ cls,
354
+ url: str,
355
+ headless: bool = True,
356
+ google_search: bool = True,
357
+ hide_canvas: bool = False,
358
+ disable_webgl: bool = False,
359
+ real_chrome: bool = False,
360
+ stealth: bool = False,
361
+ wait: int | float = 0,
362
+ page_action: Optional[Callable] = None,
363
+ proxy: Optional[str | Dict[str, str]] = None,
364
+ locale: str = "en-US",
365
+ extra_headers: Optional[Dict[str, str]] = None,
366
+ useragent: Optional[str] = None,
367
+ cdp_url: Optional[str] = None,
368
+ timeout: int | float = 30000,
369
+ disable_resources: bool = False,
370
+ wait_selector: Optional[str] = None,
371
+ init_script: Optional[str] = None,
372
+ cookies: Optional[Iterable[Dict]] = None,
373
+ network_idle: bool = False,
374
+ wait_selector_state: SelectorWaitStates = "attached",
375
+ custom_config: Optional[Dict] = None,
469
376
  ) -> Response:
470
377
  """Opens up a browser and do your request based on your chosen options below.
471
378
 
472
379
  :param url: Target url.
473
380
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
474
- :param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
381
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
475
382
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
476
383
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
477
384
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
385
+ :param cookies: Set cookies for the next request.
478
386
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
479
- :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
480
- :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning `Response` object.
481
- :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
387
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
388
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
482
389
  :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
483
- :param wait_selector: Wait for a specific css selector to be in a specific state.
484
- :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
390
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
391
+ :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
392
+ :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
393
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
485
394
  :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
486
- :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
395
+ :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
487
396
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
488
397
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
489
- :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
398
+ :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
399
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
490
400
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
491
401
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
492
- :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
493
- :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
494
- :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
495
402
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
496
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
403
+ :return: A `Response` object.
497
404
  """
498
405
  if not custom_config:
499
406
  custom_config = {}
500
407
  elif not isinstance(custom_config, dict):
501
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
408
+ raise ValueError(
409
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
410
+ )
502
411
 
503
- engine = PlaywrightEngine(
412
+ async with AsyncDynamicSession(
504
413
  wait=wait,
505
414
  proxy=proxy,
506
415
  locale=locale,
507
416
  timeout=timeout,
508
417
  stealth=stealth,
509
418
  cdp_url=cdp_url,
419
+ cookies=cookies,
510
420
  headless=headless,
511
421
  useragent=useragent,
422
+ max_pages=1,
512
423
  real_chrome=real_chrome,
513
424
  page_action=page_action,
514
425
  hide_canvas=hide_canvas,
426
+ init_script=init_script,
515
427
  network_idle=network_idle,
516
428
  google_search=google_search,
517
429
  extra_headers=extra_headers,
518
430
  wait_selector=wait_selector,
519
431
  disable_webgl=disable_webgl,
520
- nstbrowser_mode=nstbrowser_mode,
521
- nstbrowser_config=nstbrowser_config,
522
432
  disable_resources=disable_resources,
523
433
  wait_selector_state=wait_selector_state,
524
- adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
525
- )
526
- return await engine.async_fetch(url)
434
+ selector_config={**cls._generate_parser_arguments(), **custom_config},
435
+ ) as session:
436
+ return await session.fetch(url)
527
437
 
528
438
 
529
- class CustomFetcher(BaseFetcher):
530
- @classmethod
531
- def fetch(cls, url: str, browser_engine, **kwargs) -> Response:
532
- engine = check_if_engine_usable(browser_engine)(adaptor_arguments=cls._generate_parser_arguments(), **kwargs)
533
- return engine.fetch(url)
439
+ PlayWrightFetcher = DynamicFetcher # For backward-compatibility
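
For context, a hedged usage sketch of the rewritten fetcher classes above. Everything here is derived from the 0.3.1 signatures shown in this diff: the import path mirrors the file being diffed (`scrapling/fetchers.py`), `Fetcher`/`AsyncFetcher` are now thin wrappers over `curl_cffi`-based client instances (their exact keyword arguments are not visible in this diff, so only the URL is passed), `StealthyFetcher` and `DynamicFetcher` gain options such as `solve_cloudflare`, `init_script`, and `cookies`, and `PlayWrightFetcher` survives only as a backward-compatibility alias.

```python
# Hedged sketch based on the 0.3.1 signatures in this diff; verify against the release.
import asyncio
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher

# Basic HTTP request, now backed by curl_cffi instead of httpx
page = Fetcher.get("https://example.com")

# Camoufox-based fetch with options new in 0.3.1
page = StealthyFetcher.fetch(
    "https://example.com",
    headless=True,
    solve_cloudflare=True,   # new in 0.3.1
    wait_selector="h1",
)

# DynamicFetcher replaces PlayWrightFetcher (the old name remains as an alias)
page = DynamicFetcher.fetch("https://example.com", stealth=True)

async def main():
    # Async counterparts follow the same signatures
    await AsyncFetcher.get("https://example.com")
    await StealthyFetcher.async_fetch("https://example.com")

asyncio.run(main())
```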