scrapling 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/cli.py +38 -51
- scrapling/core/_html_utils.py +3 -9
- scrapling/core/ai.py +5 -13
- scrapling/core/custom_types.py +19 -61
- scrapling/core/mixins.py +6 -28
- scrapling/core/shell.py +49 -127
- scrapling/core/storage.py +2 -8
- scrapling/core/translator.py +8 -20
- scrapling/core/utils/__init__.py +10 -0
- scrapling/core/utils/_shell.py +48 -0
- scrapling/core/{utils.py → utils/_utils.py} +5 -21
- scrapling/engines/__init__.py +0 -16
- scrapling/engines/_browsers/_base.py +297 -0
- scrapling/engines/_browsers/_camoufox.py +219 -296
- scrapling/engines/_browsers/_config_tools.py +2 -1
- scrapling/engines/_browsers/_controllers.py +201 -281
- scrapling/engines/_browsers/_page.py +37 -15
- scrapling/engines/_browsers/_validators.py +9 -15
- scrapling/engines/constants.py +3 -6
- scrapling/engines/static.py +25 -75
- scrapling/engines/toolbelt/__init__.py +1 -20
- scrapling/engines/toolbelt/convertor.py +95 -86
- scrapling/engines/toolbelt/custom.py +7 -99
- scrapling/engines/toolbelt/fingerprints.py +1 -3
- scrapling/engines/toolbelt/navigation.py +4 -58
- scrapling/fetchers.py +29 -24
- scrapling/parser.py +45 -122
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/METADATA +54 -46
- scrapling-0.3.3.dist-info/RECORD +44 -0
- scrapling-0.3.1.dist-info/RECORD +0 -41
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/WHEEL +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/top_level.txt +0 -0
scrapling/engines/_browsers/_camoufox.py

```diff
@@ -1,14 +1,8 @@
-from time import time, sleep
 from re import compile as re_compile
-from asyncio import sleep as asyncio_sleep, Lock
 
-from camoufox import DefaultAddons
-from camoufox.utils import launch_options as generate_launch_options
 from playwright.sync_api import (
     Response as SyncPlaywrightResponse,
     sync_playwright,
-    BrowserContext,
-    Playwright,
     Locator,
     Page,
 )
@@ -21,9 +15,9 @@ from playwright.async_api import (
     Page as async_Page,
 )
 
-from scrapling.core.utils import log
-from ._page import PageInfo, PagePool
 from ._validators import validate, CamoufoxConfig
+from ._base import SyncSession, AsyncSession, StealthySessionMixin
+from scrapling.core.utils import log
 from scrapling.core._types import (
     Dict,
     List,
@@ -31,19 +25,17 @@ from scrapling.core._types import (
     Callable,
     SelectorWaitStates,
 )
-from scrapling.engines.toolbelt import (
+from scrapling.engines.toolbelt.convertor import (
     Response,
     ResponseFactory,
-    async_intercept_route,
-    generate_convincing_referer,
-    get_os_name,
-    intercept_route,
 )
+from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
 
 __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
+_UNSET = object()
 
 
-class StealthySession:
+class StealthySession(StealthySessionMixin, SyncSession):
     """A Stealthy session manager with page pooling."""
 
     __slots__ = (
```
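The `_UNSET = object()` sentinel added here is what lets the reworked `fetch()` methods further down tell "argument not passed" apart from legitimate falsy values such as `False` or `0`; each call-time argument is then resolved against the session default via `self._get_with_precedence(...)`, a helper that lives in the new `scrapling/engines/_browsers/_base.py` (added in this release but not shown in this diff). A minimal sketch of that precedence pattern follows; the helper body and the `DemoSession` class are assumptions for illustration, not the library's actual implementation:

```python
# Sketch of the unset-sentinel precedence pattern used by the new fetch() overrides.
# The body of _get_with_precedence is assumed; the real helper is defined in the new
# scrapling/engines/_browsers/_base.py, which this diff does not display.
_UNSET = object()


def _get_with_precedence(value, session_default, sentinel=_UNSET):
    """Prefer the per-call value unless it was left at the sentinel, else fall back."""
    return session_default if value is sentinel else value


class DemoSession:
    def __init__(self, network_idle: bool = False):
        self.network_idle = network_idle  # session-wide default

    def fetch(self, url: str, network_idle: bool = _UNSET) -> dict:
        # An explicit network_idle=False is honored, which a None default could not express
        return {"url": url, "network_idle": _get_with_precedence(network_idle, self.network_idle)}


session = DemoSession(network_idle=True)
print(session.fetch("https://example.com"))                      # falls back to the session default (True)
print(session.fetch("https://example.com", network_idle=False))  # per-call override wins (False)
```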
```diff
@@ -54,6 +46,7 @@ class StealthySession:
         "block_webrtc",
         "allow_webgl",
         "network_idle",
+        "load_dom",
         "humanize",
         "solve_cloudflare",
         "wait",
@@ -83,13 +76,14 @@ class StealthySession:
 
     def __init__(
         self,
-
+        __max_pages: int = 1,
         headless: bool = True, # noqa: F821
         block_images: bool = False,
         disable_resources: bool = False,
         block_webrtc: bool = False,
         allow_webgl: bool = True,
         network_idle: bool = False,
+        load_dom: bool = True,
         humanize: bool | float = True,
         solve_cloudflare: bool = False,
         wait: int | float = 0,
@@ -124,11 +118,12 @@ class StealthySession:
         :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
         :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
         :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
-        :param page_action: Added for automation. A function that takes the `page` object
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
         :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
@@ -137,108 +132,47 @@ class StealthySession:
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
-        :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
         """
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        config = validate(params, CamoufoxConfig)
-
-        self.max_pages = config.max_pages
-        self.headless = config.headless
-        self.block_images = config.block_images
-        self.disable_resources = config.disable_resources
-        self.block_webrtc = config.block_webrtc
-        self.allow_webgl = config.allow_webgl
-        self.network_idle = config.network_idle
-        self.humanize = config.humanize
-        self.solve_cloudflare = config.solve_cloudflare
-        self.wait = config.wait
-        self.timeout = config.timeout
-        self.page_action = config.page_action
-        self.wait_selector = config.wait_selector
-        self.init_script = config.init_script
-        self.addons = config.addons
-        self.wait_selector_state = config.wait_selector_state
-        self.cookies = config.cookies
-        self.google_search = config.google_search
-        self.extra_headers = config.extra_headers
-        self.proxy = config.proxy
-        self.os_randomize = config.os_randomize
-        self.disable_ads = config.disable_ads
-        self.geoip = config.geoip
-        self.selector_config = config.selector_config
-        self.additional_args = config.additional_args
-
-        self.playwright: Optional[Playwright] = None
-        self.context: Optional[BrowserContext] = None
-        self.page_pool = PagePool(self.max_pages)
-        self._closed = False
-        self.selector_config = config.selector_config
-        self.page_action = config.page_action
-        self._headers_keys = (
-            set(map(str.lower, self.extra_headers.keys()))
-            if self.extra_headers
-            else set()
-        )
-        self.__initiate_browser_options__()
-
-    def __initiate_browser_options__(self):
-        """Initiate browser options."""
-        self.launch_options = generate_launch_options(
-            **{
-                "geoip": self.geoip,
-                "proxy": dict(self.proxy) if self.proxy else self.proxy,
-                "enable_cache": True,
-                "addons": self.addons,
-                "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
-                "headless": self.headless,
-                "humanize": True if self.solve_cloudflare else self.humanize,
-                "i_know_what_im_doing": True, # To turn warnings off with the user configurations
-                "allow_webgl": self.allow_webgl,
-                "block_webrtc": self.block_webrtc,
-                "block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
-                "os": None if self.os_randomize else get_os_name(),
-                "user_data_dir": "",
-                **self.additional_args,
-            }
+        self.__validate__(
+            wait=wait,
+            proxy=proxy,
+            geoip=geoip,
+            addons=addons,
+            timeout=timeout,
+            cookies=cookies,
+            headless=headless,
+            humanize=humanize,
+            load_dom=load_dom,
+            max_pages=__max_pages,
+            disable_ads=disable_ads,
+            allow_webgl=allow_webgl,
+            page_action=page_action,
+            init_script=init_script,
+            network_idle=network_idle,
+            block_images=block_images,
+            block_webrtc=block_webrtc,
+            os_randomize=os_randomize,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            additional_args=additional_args,
+            selector_config=selector_config,
+            solve_cloudflare=solve_cloudflare,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
         )
+        super().__init__(max_pages=self.max_pages)
 
     def __create__(self):
         """Create a browser for this instance and context."""
         self.playwright = sync_playwright().start()
-        self.context = (
-            self.
-            **self.launch_options
-        )
+        self.context = self.playwright.firefox.launch_persistent_context( # pragma: no cover
+            **self.launch_options
         )
+
         if self.init_script: # pragma: no cover
             self.context.add_init_script(path=self.init_script)
 
```
```diff
@@ -267,68 +201,6 @@ class StealthySession:
 
         self._closed = True
 
-    def _get_or_create_page(self) -> PageInfo: # pragma: no cover
-        """Get an available page or create a new one"""
-        # Try to get a ready page first
-        page_info = self.page_pool.get_ready_page()
-        if page_info:
-            return page_info
-
-        # Create a new page if under limit
-        if self.page_pool.pages_count < self.max_pages:
-            page = self.context.new_page()
-            page.set_default_navigation_timeout(self.timeout)
-            page.set_default_timeout(self.timeout)
-            if self.extra_headers:
-                page.set_extra_http_headers(self.extra_headers)
-
-            if self.disable_resources:
-                page.route("**/*", intercept_route)
-
-            return self.page_pool.add_page(page)
-
-        # Wait for a page to become available
-        max_wait = 30
-        start_time = time()
-
-        while time() - start_time < max_wait:
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-            sleep(0.05)
-
-        raise TimeoutError("No pages available within timeout period")
-
-    @staticmethod
-    def _detect_cloudflare(page_content):
-        """
-        Detect the type of Cloudflare challenge present in the provided page content.
-
-        This function analyzes the given page content to identify whether a specific
-        type of Cloudflare challenge is present. It checks for three predefined
-        challenge types: non-interactive, managed, and interactive. If a challenge
-        type is detected, it returns the corresponding type as a string. If no
-        challenge type is detected, it returns None.
-
-        Args:
-            page_content (str): The content of the page to analyze for Cloudflare
-                challenge types.
-
-        Returns:
-            str: A string representing the detected Cloudflare challenge type, if
-                found. Returns None if no challenge matches.
-        """
-        challenge_types = (
-            "non-interactive",
-            "managed",
-            "interactive",
-        )
-        for ctype in challenge_types:
-            if f"cType: '{ctype}'" in page_content:
-                return ctype
-
-        return None
-
     def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
         """Solve the cloudflare challenge displayed on the playwright page passed
 
@@ -375,20 +247,66 @@ class StealthySession:
             log.info("Cloudflare captcha is solved")
             return
 
-    def fetch(
+    def fetch(
+        self,
+        url: str,
+        google_search: bool = _UNSET,
+        timeout: int | float = _UNSET,
+        wait: int | float = _UNSET,
+        page_action: Optional[Callable] = _UNSET,
+        extra_headers: Optional[Dict[str, str]] = _UNSET,
+        disable_resources: bool = _UNSET,
+        wait_selector: Optional[str] = _UNSET,
+        wait_selector_state: SelectorWaitStates = _UNSET,
+        network_idle: bool = _UNSET,
+        load_dom: bool = _UNSET,
+        solve_cloudflare: bool = _UNSET,
+        selector_config: Optional[Dict] = _UNSET,
+    ) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
+        # Validate all resolved parameters
+        params = validate(
+            dict(
+                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
+                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
+                wait=self._get_with_precedence(wait, self.wait, _UNSET),
+                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
+                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
+                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
+                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
+                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
+                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
+                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
+                solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
+                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
+            ),
+            CamoufoxConfig,
+        )
+
         if self._closed: # pragma: no cover
             raise RuntimeError("Context manager has been closed")
 
         final_response = None
         referer = (
-            generate_convincing_referer(url)
-            if (self.google_search and "referer" not in self._headers_keys)
-            else None
+            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )
 
         def handle_response(finished_response: SyncPlaywrightResponse):
@@ -399,54 +317,57 @@ class StealthySession:
             ):
                 final_response = finished_response
 
-        page_info = self.
+        page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)
 
         try: # pragma: no cover
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = page_info.page.goto(url, referer=referer)
-
+            if params.load_dom:
+                page_info.page.wait_for_load_state(state="domcontentloaded")
 
-            if
+            if params.network_idle:
                 page_info.page.wait_for_load_state("networkidle")
 
             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
 
-            if
+            if params.solve_cloudflare:
                 self._solve_cloudflare(page_info.page)
                 # Make sure the page is fully loaded after the captcha
                 page_info.page.wait_for_load_state(state="load")
-
-
+                if params.load_dom:
+                    page_info.page.wait_for_load_state(state="domcontentloaded")
+                if params.network_idle:
                    page_info.page.wait_for_load_state("networkidle")
 
-            if
+            if params.page_action:
                 try:
-
+                    _ = params.page_action(page_info.page)
                 except Exception as e:
                     log.error(f"Error executing page_action: {e}")
 
-            if
+            if params.wait_selector:
                 try:
-                    waiter: Locator = page_info.page.locator(
-                    waiter.first.wait_for(state=
+                    waiter: Locator = page_info.page.locator(params.wait_selector)
+                    waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
                     page_info.page.wait_for_load_state(state="load")
-
-
+                    if params.load_dom:
+                        page_info.page.wait_for_load_state(state="domcontentloaded")
+                    if params.network_idle:
                        page_info.page.wait_for_load_state("networkidle")
                 except Exception as e:
-                    log.error(f"Error waiting for selector {
+                    log.error(f"Error waiting for selector {params.wait_selector}: {e}")
 
-            page_info.page.wait_for_timeout(
+            page_info.page.wait_for_timeout(params.wait)
             response = ResponseFactory.from_playwright_response(
-                page_info.page, first_response, final_response,
+                page_info.page, first_response, final_response, params.selector_config
             )
 
-            # Mark the page as
-            page_info.
+            # Mark the page as finished for next use
+            page_info.mark_finished()
 
             return response
 
```
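Taken together, the hunks above turn `StealthySession.fetch()` from a method that only honored session-level settings into one that accepts per-request overrides, resolved against the session defaults and re-validated through `CamoufoxConfig`. A usage sketch based only on the signatures shown in this diff; the URLs, selector, and timeout values are illustrative placeholders, and the `with` usage is inferred from the `"Context manager has been closed"` guard rather than shown directly here:

```python
# Hypothetical usage of the 0.3.3 sync session; argument values are placeholders.
from scrapling.engines._browsers._camoufox import StealthySession

with StealthySession(headless=True, solve_cloudflare=True, load_dom=True) as session:
    # Session defaults apply to this request
    home = session.fetch("https://example.com")

    # Per-request overrides added in 0.3.3: only the keywords passed here replace the
    # session defaults; everything left at _UNSET resolves back to the session values.
    article = session.fetch(
        "https://example.com/articles/1",
        network_idle=True,
        wait_selector="main#content",
        timeout=60_000,
    )
```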
```diff
@@ -454,17 +375,8 @@ class StealthySession:
             page_info.mark_error()
             raise e
 
-    def get_pool_stats(self) -> Dict[str, int]:
-        """Get statistics about the current page pool"""
-        return {
-            "total_pages": self.page_pool.pages_count,
-            "ready_pages": self.page_pool.ready_count,
-            "busy_pages": self.page_pool.busy_count,
-            "max_pages": self.max_pages,
-        }
-
 
-class AsyncStealthySession(
+class AsyncStealthySession(StealthySessionMixin, AsyncSession):
     """A Stealthy session manager with page pooling."""
 
     def __init__(
@@ -476,6 +388,7 @@ class AsyncStealthySession(StealthySession):
         block_webrtc: bool = False,
         allow_webgl: bool = True,
         network_idle: bool = False,
+        load_dom: bool = True,
         humanize: bool | float = True,
         solve_cloudflare: bool = False,
         wait: int | float = 0,
@@ -510,11 +423,12 @@ class AsyncStealthySession(StealthySession):
         :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
         :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
         :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
-        :param page_action: Added for automation. A function that takes the `page` object
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
         :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
@@ -527,47 +441,43 @@ class AsyncStealthySession(StealthySession):
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            init_script,
-
-
-
-
-
-
-
-
-
-
-
+        self.__validate__(
+            wait=wait,
+            proxy=proxy,
+            geoip=geoip,
+            addons=addons,
+            timeout=timeout,
+            cookies=cookies,
+            headless=headless,
+            load_dom=load_dom,
+            humanize=humanize,
+            max_pages=max_pages,
+            disable_ads=disable_ads,
+            allow_webgl=allow_webgl,
+            page_action=page_action,
+            init_script=init_script,
+            network_idle=network_idle,
+            block_images=block_images,
+            block_webrtc=block_webrtc,
+            os_randomize=os_randomize,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            additional_args=additional_args,
+            selector_config=selector_config,
+            solve_cloudflare=solve_cloudflare,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
         )
-        self.
-        self.context: Optional[AsyncBrowserContext] = None
-        self._lock = Lock()
-        self.__enter__ = None
-        self.__exit__ = None
+        super().__init__(max_pages=self.max_pages)
 
     async def __create__(self):
         """Create a browser for this instance and context."""
         self.playwright: AsyncPlaywright = await async_playwright().start()
-        self.context: AsyncBrowserContext = (
-
-            **self.launch_options
-        )
+        self.context: AsyncBrowserContext = await self.playwright.firefox.launch_persistent_context(
+            **self.launch_options
         )
+
         if self.init_script: # pragma: no cover
             await self.context.add_init_script(path=self.init_script)
 
```
```diff
@@ -596,39 +506,6 @@ class AsyncStealthySession(StealthySession):
 
         self._closed = True
 
-    async def _get_or_create_page(self) -> PageInfo:
-        """Get an available page or create a new one"""
-        async with self._lock:
-            # Try to get a ready page first
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-
-            # Create a new page if under limit
-            if self.page_pool.pages_count < self.max_pages:
-                page = await self.context.new_page()
-                page.set_default_navigation_timeout(self.timeout)
-                page.set_default_timeout(self.timeout)
-                if self.extra_headers:
-                    await page.set_extra_http_headers(self.extra_headers)
-
-                if self.disable_resources:
-                    await page.route("**/*", async_intercept_route)
-
-                return self.page_pool.add_page(page)
-
-            # Wait for a page to become available
-            max_wait = 30
-            start_time = time()
-
-            while time() - start_time < max_wait: # pragma: no cover
-                page_info = self.page_pool.get_ready_page()
-                if page_info:
-                    return page_info
-                await asyncio_sleep(0.05)
-
-            raise TimeoutError("No pages available within timeout period")
-
     async def _solve_cloudflare(self, page: async_Page):
         """Solve the cloudflare challenge displayed on the playwright page passed. The async version
 
@@ -664,9 +541,7 @@ class AsyncStealthySession(StealthySession):
                await page.wait_for_timeout(500)
 
            # Calculate the Captcha coordinates for any viewport
-           outer_box = await page.locator(
-               ".main-content p+div>div>div"
-           ).bounding_box()
+           outer_box = await page.locator(".main-content p+div>div>div").bounding_box()
            captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
 
            # Move the mouse to the center of the window, then press and hold the left mouse button
@@ -677,20 +552,65 @@ class AsyncStealthySession(StealthySession):
            log.info("Cloudflare captcha is solved")
            return
 
-    async def fetch(
+    async def fetch(
+        self,
+        url: str,
+        google_search: bool = _UNSET,
+        timeout: int | float = _UNSET,
+        wait: int | float = _UNSET,
+        page_action: Optional[Callable] = _UNSET,
+        extra_headers: Optional[Dict[str, str]] = _UNSET,
+        disable_resources: bool = _UNSET,
+        wait_selector: Optional[str] = _UNSET,
+        wait_selector_state: SelectorWaitStates = _UNSET,
+        network_idle: bool = _UNSET,
+        load_dom: bool = _UNSET,
+        solve_cloudflare: bool = _UNSET,
+        selector_config: Optional[Dict] = _UNSET,
+    ) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
+        params = validate(
+            dict(
+                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
+                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
+                wait=self._get_with_precedence(wait, self.wait, _UNSET),
+                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
+                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
+                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
+                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
+                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
+                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
+                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
+                solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
+                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
+            ),
+            CamoufoxConfig,
+        )
+
         if self._closed: # pragma: no cover
             raise RuntimeError("Context manager has been closed")
 
         final_response = None
         referer = (
-            generate_convincing_referer(url)
-            if (self.google_search and "referer" not in self._headers_keys)
-            else None
+            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )
 
         async def handle_response(finished_response: AsyncPlaywrightResponse):
@@ -701,56 +621,59 @@ class AsyncStealthySession(StealthySession):
             ):
                 final_response = finished_response
 
-        page_info = await self.
+        page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)
 
         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = await page_info.page.goto(url, referer=referer)
-
+            if params.load_dom:
+                await page_info.page.wait_for_load_state(state="domcontentloaded")
 
-            if
+            if params.network_idle:
                 await page_info.page.wait_for_load_state("networkidle")
 
             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
 
-            if
+            if params.solve_cloudflare:
                 await self._solve_cloudflare(page_info.page)
                 # Make sure the page is fully loaded after the captcha
                 await page_info.page.wait_for_load_state(state="load")
-
-
+                if params.load_dom:
+                    await page_info.page.wait_for_load_state(state="domcontentloaded")
+                if params.network_idle:
                    await page_info.page.wait_for_load_state("networkidle")
 
-            if
+            if params.page_action:
                 try:
-
+                    _ = await params.page_action(page_info.page)
                 except Exception as e:
                     log.error(f"Error executing page_action: {e}")
 
-            if
+            if params.wait_selector:
                 try:
-                    waiter: AsyncLocator = page_info.page.locator(
-                    await waiter.first.wait_for(state=
+                    waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
+                    await waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
                     await page_info.page.wait_for_load_state(state="load")
-
-
+                    if params.load_dom:
+                        await page_info.page.wait_for_load_state(state="domcontentloaded")
+                    if params.network_idle:
                        await page_info.page.wait_for_load_state("networkidle")
                 except Exception as e:
-                    log.error(f"Error waiting for selector {
+                    log.error(f"Error waiting for selector {params.wait_selector}: {e}")
 
-            await page_info.page.wait_for_timeout(
+            await page_info.page.wait_for_timeout(params.wait)
 
             # Create response object
             response = await ResponseFactory.from_async_playwright_response(
-                page_info.page, first_response, final_response,
+                page_info.page, first_response, final_response, params.selector_config
            )
 
-            # Mark the page as
-            page_info.
+            # Mark the page as finished for next use
+            page_info.mark_finished()
 
             return response
 
```
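The async class mirrors the sync changes: the same `_UNSET`-based per-request overrides, the new `load_dom` control, and page pooling delegated to the shared `_get_page()` from the new `_base.py`. A mirrored usage sketch; the `async with` form is assumed from the removed `__enter__ = None` guards and the same "Context manager has been closed" check, and the URLs are placeholders:

```python
# Hypothetical async usage of the 0.3.3 API; argument values are placeholders.
import asyncio

from scrapling.engines._browsers._camoufox import AsyncStealthySession


async def main():
    async with AsyncStealthySession(max_pages=2, load_dom=True) as session:
        # load_dom=False here overrides the session default for this request only
        fast = await session.fetch("https://example.com", load_dom=False)
        # solve_cloudflare can likewise be enabled for a single protected page
        guarded = await session.fetch("https://example.com/account", solve_cloudflare=True)
        return fast, guarded


asyncio.run(main())
```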