scrapling 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +29 -19
- scrapling/cli.py +21 -4
- scrapling/core/_types.py +3 -2
- scrapling/core/ai.py +24 -15
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +6 -4
- scrapling/core/storage.py +7 -6
- scrapling/core/translator.py +13 -8
- scrapling/core/utils/__init__.py +0 -1
- scrapling/engines/_browsers/__init__.py +0 -2
- scrapling/engines/_browsers/_base.py +45 -21
- scrapling/engines/_browsers/_camoufox.py +98 -43
- scrapling/engines/_browsers/_config_tools.py +1 -1
- scrapling/engines/_browsers/_controllers.py +34 -13
- scrapling/engines/_browsers/_validators.py +31 -10
- scrapling/engines/constants.py +0 -15
- scrapling/engines/static.py +749 -336
- scrapling/engines/toolbelt/convertor.py +13 -15
- scrapling/engines/toolbelt/custom.py +6 -9
- scrapling/engines/toolbelt/fingerprints.py +17 -10
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +46 -0
- scrapling/fetchers/chrome.py +210 -0
- scrapling/fetchers/firefox.py +212 -0
- scrapling/fetchers/requests.py +28 -0
- scrapling/parser.py +109 -84
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/METADATA +17 -16
- scrapling-0.3.7.dist-info/RECORD +47 -0
- scrapling/fetchers.py +0 -444
- scrapling-0.3.5.dist-info/RECORD +0 -44
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
scrapling/fetchers.py
DELETED
@@ -1,444 +0,0 @@
-from scrapling.core._types import (
-    Callable,
-    Dict,
-    List,
-    Optional,
-    SelectorWaitStates,
-    Iterable,
-)
-from scrapling.engines.static import (
-    FetcherSession,
-    FetcherClient as _FetcherClient,
-    AsyncFetcherClient as _AsyncFetcherClient,
-)
-from scrapling.engines._browsers import (
-    DynamicSession,
-    StealthySession,
-    AsyncDynamicSession,
-    AsyncStealthySession,
-)
-from scrapling.engines.toolbelt.custom import BaseFetcher, Response
-
-__FetcherClientInstance__ = _FetcherClient()
-__AsyncFetcherClientInstance__ = _AsyncFetcherClient()
-
-
-class Fetcher(BaseFetcher):
-    """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""
-
-    get = __FetcherClientInstance__.get
-    post = __FetcherClientInstance__.post
-    put = __FetcherClientInstance__.put
-    delete = __FetcherClientInstance__.delete
-
-
-class AsyncFetcher(BaseFetcher):
-    """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""
-
-    get = __AsyncFetcherClientInstance__.get
-    post = __AsyncFetcherClientInstance__.post
-    put = __AsyncFetcherClientInstance__.put
-    delete = __AsyncFetcherClientInstance__.delete
-
-
-class StealthyFetcher(BaseFetcher):
-    """A `Fetcher` class type that is a completely stealthy fetcher that uses a modified version of Firefox.
-
-    It works as real browsers passing almost all online tests/protections based on Camoufox.
-    Other added flavors include setting the faked OS fingerprints to match the user's OS, and the referer of every request is set as if this request came from Google's search of this URL's domain.
-    """
-
-    @classmethod
-    def fetch(
-        cls,
-        url: str,
-        headless: bool = True,  # noqa: F821
-        block_images: bool = False,
-        disable_resources: bool = False,
-        block_webrtc: bool = False,
-        allow_webgl: bool = True,
-        network_idle: bool = False,
-        load_dom: bool = True,
-        humanize: bool | float = True,
-        solve_cloudflare: bool = False,
-        wait: int | float = 0,
-        timeout: int | float = 30000,
-        page_action: Optional[Callable] = None,
-        wait_selector: Optional[str] = None,
-        init_script: Optional[str] = None,
-        addons: Optional[List[str]] = None,
-        wait_selector_state: SelectorWaitStates = "attached",
-        cookies: Optional[List[Dict]] = None,
-        google_search: bool = True,
-        extra_headers: Optional[Dict[str, str]] = None,
-        proxy: Optional[str | Dict[str, str]] = None,
-        os_randomize: bool = False,
-        disable_ads: bool = False,
-        geoip: bool = False,
-        custom_config: Optional[Dict] = None,
-        additional_args: Optional[Dict] = None,
-    ) -> Response:
-        """
-        Opens up a browser and do your request based on your chosen options below.
-
-        :param url: Target url.
-        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
-        :param block_images: Prevent the loading of images through Firefox preferences.
-            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
-        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
-            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
-            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
-        :param block_webrtc: Blocks WebRTC entirely.
-        :param cookies: Set cookies for the next request.
-        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
-        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
-        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
-        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
-        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
-        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
-        :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
-        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
-        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
-        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
-        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
-        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
-        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
-        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
-            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
-        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
-        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
-        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
-        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
-        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
-        :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
-        :return: A `Response` object.
-        """
-        if not custom_config:
-            custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
-
-        with StealthySession(
-            wait=wait,
-            proxy=proxy,
-            geoip=geoip,
-            addons=addons,
-            timeout=timeout,
-            cookies=cookies,
-            headless=headless,
-            humanize=humanize,
-            load_dom=load_dom,
-            disable_ads=disable_ads,
-            allow_webgl=allow_webgl,
-            page_action=page_action,
-            init_script=init_script,
-            network_idle=network_idle,
-            block_images=block_images,
-            block_webrtc=block_webrtc,
-            os_randomize=os_randomize,
-            wait_selector=wait_selector,
-            google_search=google_search,
-            extra_headers=extra_headers,
-            solve_cloudflare=solve_cloudflare,
-            disable_resources=disable_resources,
-            wait_selector_state=wait_selector_state,
-            selector_config={**cls._generate_parser_arguments(), **custom_config},
-            additional_args=additional_args or {},
-        ) as engine:
-            return engine.fetch(url)
-
-    @classmethod
-    async def async_fetch(
-        cls,
-        url: str,
-        headless: bool = True,  # noqa: F821
-        block_images: bool = False,
-        disable_resources: bool = False,
-        block_webrtc: bool = False,
-        allow_webgl: bool = True,
-        network_idle: bool = False,
-        load_dom: bool = True,
-        humanize: bool | float = True,
-        solve_cloudflare: bool = False,
-        wait: int | float = 0,
-        timeout: int | float = 30000,
-        page_action: Optional[Callable] = None,
-        wait_selector: Optional[str] = None,
-        init_script: Optional[str] = None,
-        addons: Optional[List[str]] = None,
-        wait_selector_state: SelectorWaitStates = "attached",
-        cookies: Optional[List[Dict]] = None,
-        google_search: bool = True,
-        extra_headers: Optional[Dict[str, str]] = None,
-        proxy: Optional[str | Dict[str, str]] = None,
-        os_randomize: bool = False,
-        disable_ads: bool = False,
-        geoip: bool = False,
-        custom_config: Optional[Dict] = None,
-        additional_args: Optional[Dict] = None,
-    ) -> Response:
-        """
-        Opens up a browser and do your request based on your chosen options below.
-
-        :param url: Target url.
-        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
-        :param block_images: Prevent the loading of images through Firefox preferences.
-            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
-        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
-            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
-            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
-        :param block_webrtc: Blocks WebRTC entirely.
-        :param cookies: Set cookies for the next request.
-        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
-        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
-        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
-        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
-        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
-        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
-        :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
-        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
-        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
-        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
-        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
-        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
-        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
-        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
-            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
-        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
-        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
-        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
-        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
-        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
-        :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
-        :return: A `Response` object.
-        """
-        if not custom_config:
-            custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
-
-        async with AsyncStealthySession(
-            wait=wait,
-            max_pages=1,
-            proxy=proxy,
-            geoip=geoip,
-            addons=addons,
-            timeout=timeout,
-            cookies=cookies,
-            headless=headless,
-            humanize=humanize,
-            load_dom=load_dom,
-            disable_ads=disable_ads,
-            allow_webgl=allow_webgl,
-            page_action=page_action,
-            init_script=init_script,
-            network_idle=network_idle,
-            block_images=block_images,
-            block_webrtc=block_webrtc,
-            os_randomize=os_randomize,
-            wait_selector=wait_selector,
-            google_search=google_search,
-            extra_headers=extra_headers,
-            solve_cloudflare=solve_cloudflare,
-            disable_resources=disable_resources,
-            wait_selector_state=wait_selector_state,
-            selector_config={**cls._generate_parser_arguments(), **custom_config},
-            additional_args=additional_args or {},
-        ) as engine:
-            return await engine.fetch(url)
-
-
-class DynamicFetcher(BaseFetcher):
-    """A `Fetcher` class type that provide many options, all of them are based on PlayWright.
-
-    Using this Fetcher class, you can do requests with:
-        - Vanilla Playwright without any modifications other than the ones you chose.
-        - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress, but it bypasses many online tests like bot.sannysoft.com
-            Some of the things stealth mode does include:
-                1) Patches the CDP runtime fingerprint.
-                2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
-                3) Using custom flags on launch to hide Playwright even more and make it faster.
-                4) Generates real browser's headers of the same type and same user OS, then append it to the request.
-        - Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher, and most of the options can be enabled on it.
-        - NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
-
-    > Note that these are the main options with PlayWright, but it can be mixed.
-    """
-
-    @classmethod
-    def fetch(
-        cls,
-        url: str,
-        headless: bool = True,
-        google_search: bool = True,
-        hide_canvas: bool = False,
-        disable_webgl: bool = False,
-        real_chrome: bool = False,
-        stealth: bool = False,
-        wait: int | float = 0,
-        page_action: Optional[Callable] = None,
-        proxy: Optional[str | Dict[str, str]] = None,
-        locale: str = "en-US",
-        extra_headers: Optional[Dict[str, str]] = None,
-        useragent: Optional[str] = None,
-        cdp_url: Optional[str] = None,
-        timeout: int | float = 30000,
-        disable_resources: bool = False,
-        wait_selector: Optional[str] = None,
-        init_script: Optional[str] = None,
-        cookies: Optional[Iterable[Dict]] = None,
-        network_idle: bool = False,
-        load_dom: bool = True,
-        wait_selector_state: SelectorWaitStates = "attached",
-        custom_config: Optional[Dict] = None,
-    ) -> Response:
-        """Opens up a browser and do your request based on your chosen options below.
-
-        :param url: Target url.
-        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
-        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
-            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
-            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
-        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
-        :param cookies: Set cookies for the next request.
-        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
-        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
-        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
-        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
-        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
-        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
-        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
-        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
-        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
-        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
-        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
-        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
-        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
-        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
-        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
-        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
-        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
-        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
-        :return: A `Response` object.
-        """
-        if not custom_config:
-            custom_config = {}
-        elif not isinstance(custom_config, dict):
-            raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
-
-        with DynamicSession(
-            wait=wait,
-            proxy=proxy,
-            locale=locale,
-            timeout=timeout,
-            stealth=stealth,
-            cdp_url=cdp_url,
-            cookies=cookies,
-            headless=headless,
-            load_dom=load_dom,
-            useragent=useragent,
-            real_chrome=real_chrome,
-            page_action=page_action,
-            hide_canvas=hide_canvas,
-            init_script=init_script,
-            network_idle=network_idle,
-            google_search=google_search,
-            extra_headers=extra_headers,
-            wait_selector=wait_selector,
-            disable_webgl=disable_webgl,
-            disable_resources=disable_resources,
-            wait_selector_state=wait_selector_state,
-            selector_config={**cls._generate_parser_arguments(), **custom_config},
-        ) as session:
-            return session.fetch(url)
-
-    @classmethod
-    async def async_fetch(
-        cls,
-        url: str,
-        headless: bool = True,
-        google_search: bool = True,
-        hide_canvas: bool = False,
-        disable_webgl: bool = False,
-        real_chrome: bool = False,
-        stealth: bool = False,
-        wait: int | float = 0,
-        page_action: Optional[Callable] = None,
-        proxy: Optional[str | Dict[str, str]] = None,
-        locale: str = "en-US",
-        extra_headers: Optional[Dict[str, str]] = None,
-        useragent: Optional[str] = None,
-        cdp_url: Optional[str] = None,
-        timeout: int | float = 30000,
-        disable_resources: bool = False,
-        wait_selector: Optional[str] = None,
-        init_script: Optional[str] = None,
-        cookies: Optional[Iterable[Dict]] = None,
-        network_idle: bool = False,
-        load_dom: bool = True,
-        wait_selector_state: SelectorWaitStates = "attached",
-        custom_config: Optional[Dict] = None,
-    ) -> Response:
-        """Opens up a browser and do your request based on your chosen options below.
-
-        :param url: Target url.
-        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
-        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
-            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
-            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
-        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
-        :param cookies: Set cookies for the next request.
-        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
-        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
-        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
-        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
-        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
-        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
-        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
-        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
-        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
-        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
-        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
-        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
-        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
-        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
-        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
-        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
-        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
-        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
-        :return: A `Response` object.
-        """
-        if not custom_config:
-            custom_config = {}
-        elif not isinstance(custom_config, dict):
-            raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
-
-        async with AsyncDynamicSession(
-            wait=wait,
-            max_pages=1,
-            proxy=proxy,
-            locale=locale,
-            timeout=timeout,
-            stealth=stealth,
-            cdp_url=cdp_url,
-            cookies=cookies,
-            headless=headless,
-            load_dom=load_dom,
-            useragent=useragent,
-            real_chrome=real_chrome,
-            page_action=page_action,
-            hide_canvas=hide_canvas,
-            init_script=init_script,
-            network_idle=network_idle,
-            google_search=google_search,
-            extra_headers=extra_headers,
-            wait_selector=wait_selector,
-            disable_webgl=disable_webgl,
-            disable_resources=disable_resources,
-            wait_selector_state=wait_selector_state,
-            selector_config={**cls._generate_parser_arguments(), **custom_config},
-        ) as session:
-            return await session.fetch(url)
-
-
-PlayWrightFetcher = DynamicFetcher  # For backward-compatibility
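For orientation: the deleted module above exposed `Fetcher`/`AsyncFetcher` (curl_cffi HTTP), `StealthyFetcher` (Camoufox), `DynamicFetcher` (Playwright), and the `PlayWrightFetcher` alias, and per the file list these appear to move into the new `scrapling/fetchers/` package (`__init__.py`, `requests.py`, `firefox.py`, `chrome.py`) in 0.3.7. A minimal usage sketch of that API, assuming the `scrapling.fetchers` import path and the `Response.status` attribute are unchanged by the refactor (not confirmed by this diff):

```python
# Hedged sketch only: assumes scrapling/fetchers/__init__.py re-exports the
# same class names that the deleted scrapling/fetchers.py defined, and that
# the returned Response object still exposes `.status`.
from scrapling.fetchers import Fetcher, StealthyFetcher

# Plain HTTP via curl_cffi; get/post/put/delete are bound client methods.
page = Fetcher.get("https://example.com")
print(page.status)

# Full Camoufox (modified Firefox) fetch; the keyword defaults mirror the
# deleted signature above (headless=True, timeout=30000 ms, google_search=True).
page = StealthyFetcher.fetch("https://example.com", headless=True, timeout=30000)
print(page.status)
```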
scrapling-0.3.5.dist-info/RECORD
DELETED
@@ -1,44 +0,0 @@
-scrapling/__init__.py,sha256=3-wjeMR5IQVhHoPcl5KYMo3cgA00q1mWn38q02xTWck,1236
-scrapling/cli.py,sha256=tGQ3q4pHJZf1XJ8UIqPdT2JR9bjOhlXydmY1cNLkbZc,26363
-scrapling/fetchers.py,sha256=aYQUxp-0i-OBucdpdG6zjWCafTCgpXJdnJ0GIrm5GfA,26523
-scrapling/parser.py,sha256=Fh15nediLLSfYQOb_vr76YFUA_fNJFU7klYCkp_XXts,57517
-scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
-scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scrapling/core/_html_utils.py,sha256=ki47v54SsTL5-khi1jcLkJqAHqEq19cuex-dqzXdbEI,20328
-scrapling/core/_types.py,sha256=iXhi8LFkU4wjkGOjITdY1IDBEbn5rOxsl7xwEKT1L3I,895
-scrapling/core/ai.py,sha256=v3wjtXJgBRUtImE6Q_Bf_FruOArJyraQk4kqsqhlU8k,35474
-scrapling/core/custom_types.py,sha256=GlQZiVIMCyv8vOdDUlASPn85r_4nw0P9ggID9q1VkRA,13608
-scrapling/core/mixins.py,sha256=2iUVcN2XSAKGEvNmAM2Rr9axpZoxu0M2gIFEaFTO_Dg,3206
-scrapling/core/shell.py,sha256=dCD8c_k1skXrKSIc_Qe_KgsiMOAS_1eCzgWjvSO74-I,22893
-scrapling/core/storage.py,sha256=8lWMPut6lPpvn9iOkgy9ao11_g8FNkXq67wHKtU4uuM,6290
-scrapling/core/translator.py,sha256=HLJngeRRw2M0eNe_f8AfQD64a49OECIEm5Df_WELVG4,5135
-scrapling/core/utils/__init__.py,sha256=7B14TcrDVwSaH6BQrMnzb1NtFa4Om237dJcF9oe-lM0,204
-scrapling/core/utils/_shell.py,sha256=zes71MmFTs7V9f0JFstaWcjQhKNZN6xvspu29YVQtRc,1707
-scrapling/core/utils/_utils.py,sha256=ATy-wwz00U-alOGH-NGK-VoPNr1qYmUwEoWuqAHjDkg,3143
-scrapling/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scrapling/engines/constants.py,sha256=DP7yVRK1w1W4B1eXGeeKKQNDSo163FFXdPaxTy4adqw,4088
-scrapling/engines/static.py,sha256=Tuwl6cEDP5_OQKPFRmemz7ozdeZnWm0vir4J7JYdQCs,33122
-scrapling/engines/_browsers/__init__.py,sha256=lu5RgcV4zYacRaKm28ph5TzjqAovTaQNNfXSgQGwDOU,123
-scrapling/engines/_browsers/_base.py,sha256=29rPeXyrRnFIPLLMbvq3CUxGw4sMEJ3nKki9CC1iH2g,11049
-scrapling/engines/_browsers/_camoufox.py,sha256=BvxsTLcDpTMVoqsHIy7Smwls1zo6fpCtGMDW4v5Kim8,35356
-scrapling/engines/_browsers/_config_tools.py,sha256=mEPA5SGrWq0dl15cDOT6sOsm5NHMD0vI0fuPttGpw-U,4610
-scrapling/engines/_browsers/_controllers.py,sha256=YuiO8uw8pyv8hQLBvZCJcTGrNbKZSsYzkPKK9X6bq6U,27232
-scrapling/engines/_browsers/_page.py,sha256=1z-P6c97cTkULE-FVrsMY589e6eL_20Ae8pUe6vjggE,2206
-scrapling/engines/_browsers/_validators.py,sha256=jvJjXURN79aeR-ZFc_k5zf_3ClP18gM1qZA7dMXd_YI,7491
-scrapling/engines/toolbelt/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-scrapling/engines/toolbelt/convertor.py,sha256=e_rMcW8ScdfxKO-V5Mk61blVzwuDgd82CpRds0Z2tMQ,13102
-scrapling/engines/toolbelt/custom.py,sha256=uhMXa_LNcvvG3wZXBRKHXvqLqShMR9SHwc3bBv4UaQs,7664
-scrapling/engines/toolbelt/fingerprints.py,sha256=hCxKUTwo8sy7iN9wk8OA5vGo9XOn6E365zvC1C6zWDE,2212
-scrapling/engines/toolbelt/navigation.py,sha256=Ej23I1n9AjCwxva_yRXUQeefmYJgi7lgb2Wr_b8RNFs,3550
-scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
-scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
-scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=clzuf7KYcvDWYaKKxT_bkAoCT2fGsOcUw47948CHjAc,267
-scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
-scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
-scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
-scrapling-0.3.5.dist-info/licenses/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
-scrapling-0.3.5.dist-info/METADATA,sha256=a-ZKBr0yH6jKb88l5BpbwMhWEbP-mQG3_NoI4Rogv9M,22513
-scrapling-0.3.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-scrapling-0.3.5.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
-scrapling-0.3.5.dist-info/top_level.txt,sha256=Ud-yF-PC2U5HQ3nc5QwT7HSPdIpF1RuwQ_mYgBzHHIM,10
-scrapling-0.3.5.dist-info/RECORD,,
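The RECORD entries above follow the standard wheel format `path,sha256=<digest>,size`, where the digest is an unpadded urlsafe-base64 SHA-256 of the file's bytes. A small, hedged sketch (standard wheel convention, not Scrapling-specific) of how such an entry can be recomputed for verification:

```python
import base64
import hashlib
from pathlib import Path


def record_entry(path: str) -> str:
    """Build a wheel RECORD line: path,sha256=<urlsafe base64, no padding>,size."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode('ascii')},{len(data)}"


# Example (path is illustrative): compare the output against the RECORD line above.
print(record_entry("scrapling/parser.py"))
```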
{scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/WHEEL
File without changes
{scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt
File without changes
{scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE
File without changes
{scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/top_level.txt
File without changes