scrapling-0.3.1-py3-none-any.whl → scrapling-0.3.2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- scrapling/__init__.py +1 -1
- scrapling/cli.py +38 -51
- scrapling/core/_html_utils.py +3 -9
- scrapling/core/ai.py +5 -13
- scrapling/core/custom_types.py +19 -61
- scrapling/core/mixins.py +6 -28
- scrapling/core/shell.py +49 -127
- scrapling/core/storage.py +2 -8
- scrapling/core/translator.py +8 -20
- scrapling/core/utils/__init__.py +10 -0
- scrapling/core/utils/_shell.py +48 -0
- scrapling/core/{utils.py → utils/_utils.py} +5 -21
- scrapling/engines/__init__.py +0 -16
- scrapling/engines/_browsers/_base.py +297 -0
- scrapling/engines/_browsers/_camoufox.py +227 -296
- scrapling/engines/_browsers/_config_tools.py +2 -1
- scrapling/engines/_browsers/_controllers.py +209 -281
- scrapling/engines/_browsers/_page.py +37 -15
- scrapling/engines/_browsers/_validators.py +9 -15
- scrapling/engines/constants.py +3 -6
- scrapling/engines/static.py +25 -75
- scrapling/engines/toolbelt/__init__.py +1 -20
- scrapling/engines/toolbelt/convertor.py +95 -86
- scrapling/engines/toolbelt/custom.py +7 -99
- scrapling/engines/toolbelt/fingerprints.py +1 -3
- scrapling/engines/toolbelt/navigation.py +4 -58
- scrapling/fetchers.py +29 -24
- scrapling/parser.py +45 -122
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/METADATA +54 -46
- scrapling-0.3.2.dist-info/RECORD +44 -0
- scrapling-0.3.1.dist-info/RECORD +0 -41
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
scrapling/engines/_browsers/_camoufox.py

@@ -1,14 +1,8 @@
-from time import time, sleep
 from re import compile as re_compile
-from asyncio import sleep as asyncio_sleep, Lock
 
-from camoufox import DefaultAddons
-from camoufox.utils import launch_options as generate_launch_options
 from playwright.sync_api import (
     Response as SyncPlaywrightResponse,
     sync_playwright,
-    BrowserContext,
-    Playwright,
     Locator,
     Page,
 )
@@ -21,9 +15,9 @@ from playwright.async_api import (
     Page as async_Page,
 )
 
-from scrapling.core.utils import log
-from ._page import PageInfo, PagePool
 from ._validators import validate, CamoufoxConfig
+from ._base import SyncSession, AsyncSession, StealthySessionMixin
+from scrapling.core.utils import log
 from scrapling.core._types import (
     Dict,
     List,
@@ -31,19 +25,17 @@ from scrapling.core._types import (
     Callable,
     SelectorWaitStates,
 )
-from scrapling.engines.toolbelt import (
+from scrapling.engines.toolbelt.convertor import (
     Response,
     ResponseFactory,
-    async_intercept_route,
-    generate_convincing_referer,
-    get_os_name,
-    intercept_route,
 )
+from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
 
 __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
+_UNSET = object()
 
 
-class StealthySession:
+class StealthySession(StealthySessionMixin, SyncSession):
     """A Stealthy session manager with page pooling."""
 
     __slots__ = (
@@ -54,6 +46,7 @@ class StealthySession:
         "block_webrtc",
         "allow_webgl",
         "network_idle",
+        "load_dom",
         "humanize",
         "solve_cloudflare",
         "wait",
@@ -83,13 +76,14 @@ class StealthySession:
 
     def __init__(
         self,
-
+        __max_pages: int = 1,
         headless: bool = True,  # noqa: F821
         block_images: bool = False,
         disable_resources: bool = False,
         block_webrtc: bool = False,
         allow_webgl: bool = True,
         network_idle: bool = False,
+        load_dom: bool = True,
         humanize: bool | float = True,
         solve_cloudflare: bool = False,
         wait: int | float = 0,
@@ -124,11 +118,12 @@ class StealthySession:
         :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
         :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
         :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
-        :param page_action: Added for automation. A function that takes the `page` object
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
         :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
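The `page_action` parameter documented above receives the live Playwright `page` object and runs before the response is captured. A minimal sketch of such a callable, using the synchronous Playwright API; the selector below is a hypothetical placeholder, not something from this package:

    def scroll_and_click(page):
        # Example automation: scroll down, then click a hypothetical "load more" button
        page.mouse.wheel(0, 1000)
        page.click("#load-more", timeout=5_000)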
@@ -137,108 +132,51 @@ class StealthySession:
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
-        :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
         """
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        config = validate(params, CamoufoxConfig)
-
-        self.max_pages = config.max_pages
-        self.headless = config.headless
-        self.block_images = config.block_images
-        self.disable_resources = config.disable_resources
-        self.block_webrtc = config.block_webrtc
-        self.allow_webgl = config.allow_webgl
-        self.network_idle = config.network_idle
-        self.humanize = config.humanize
-        self.solve_cloudflare = config.solve_cloudflare
-        self.wait = config.wait
-        self.timeout = config.timeout
-        self.page_action = config.page_action
-        self.wait_selector = config.wait_selector
-        self.init_script = config.init_script
-        self.addons = config.addons
-        self.wait_selector_state = config.wait_selector_state
-        self.cookies = config.cookies
-        self.google_search = config.google_search
-        self.extra_headers = config.extra_headers
-        self.proxy = config.proxy
-        self.os_randomize = config.os_randomize
-        self.disable_ads = config.disable_ads
-        self.geoip = config.geoip
-        self.selector_config = config.selector_config
-        self.additional_args = config.additional_args
-
-        self.playwright: Optional[Playwright] = None
-        self.context: Optional[BrowserContext] = None
-        self.page_pool = PagePool(self.max_pages)
-        self._closed = False
-        self.selector_config = config.selector_config
-        self.page_action = config.page_action
-        self._headers_keys = (
-            set(map(str.lower, self.extra_headers.keys()))
-            if self.extra_headers
-            else set()
-        )
-        self.__initiate_browser_options__()
-
-    def __initiate_browser_options__(self):
-        """Initiate browser options."""
-        self.launch_options = generate_launch_options(
-            **{
-                "geoip": self.geoip,
-                "proxy": dict(self.proxy) if self.proxy else self.proxy,
-                "enable_cache": True,
-                "addons": self.addons,
-                "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
-                "headless": self.headless,
-                "humanize": True if self.solve_cloudflare else self.humanize,
-                "i_know_what_im_doing": True,  # To turn warnings off with the user configurations
-                "allow_webgl": self.allow_webgl,
-                "block_webrtc": self.block_webrtc,
-                "block_images": self.block_images,  # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
-                "os": None if self.os_randomize else get_os_name(),
-                "user_data_dir": "",
-                **self.additional_args,
-            }
+        self.__validate__(
+            wait=wait,
+            proxy=proxy,
+            geoip=geoip,
+            addons=addons,
+            timeout=timeout,
+            cookies=cookies,
+            headless=headless,
+            humanize=humanize,
+            load_dom=load_dom,
+            max_pages=__max_pages,
+            disable_ads=disable_ads,
+            allow_webgl=allow_webgl,
+            page_action=page_action,
+            init_script=init_script,
+            network_idle=network_idle,
+            block_images=block_images,
+            block_webrtc=block_webrtc,
+            os_randomize=os_randomize,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            additional_args=additional_args,
+            selector_config=selector_config,
+            solve_cloudflare=solve_cloudflare,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
         )
+        super().__init__(max_pages=self.max_pages)
 
     def __create__(self):
         """Create a browser for this instance and context."""
         self.playwright = sync_playwright().start()
-        self.context = (
-            self.
-            **self.launch_options
-        )
+        self.context = self.playwright.firefox.launch_persistent_context(  # pragma: no cover
+            **self.launch_options
         )
+
+        # Get the default page and close it
+        default_page = self.context.pages[0]
+        default_page.close()
 
         if self.init_script:  # pragma: no cover
             self.context.add_init_script(path=self.init_script)
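With this change the constructor no longer copies each argument by hand: everything goes through `__validate__` (provided by the new `_base.py`), `super().__init__` sets up the page pool, and `__create__` launches Camoufox's Firefox build via `launch_persistent_context`, closing the default page it opens. A hedged usage sketch, assuming `StealthySession` still behaves as a synchronous context manager; the import path is simply the module shown in this diff, not necessarily the intended public entry point:

    from scrapling.engines._browsers._camoufox import StealthySession

    with StealthySession(headless=True, solve_cloudflare=True, load_dom=True) as session:
        page = session.fetch("https://example.com")
        print(page.status)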
@@ -267,68 +205,6 @@ class StealthySession:
 
         self._closed = True
 
-    def _get_or_create_page(self) -> PageInfo:  # pragma: no cover
-        """Get an available page or create a new one"""
-        # Try to get a ready page first
-        page_info = self.page_pool.get_ready_page()
-        if page_info:
-            return page_info
-
-        # Create a new page if under limit
-        if self.page_pool.pages_count < self.max_pages:
-            page = self.context.new_page()
-            page.set_default_navigation_timeout(self.timeout)
-            page.set_default_timeout(self.timeout)
-            if self.extra_headers:
-                page.set_extra_http_headers(self.extra_headers)
-
-            if self.disable_resources:
-                page.route("**/*", intercept_route)
-
-            return self.page_pool.add_page(page)
-
-        # Wait for a page to become available
-        max_wait = 30
-        start_time = time()
-
-        while time() - start_time < max_wait:
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-            sleep(0.05)
-
-        raise TimeoutError("No pages available within timeout period")
-
-    @staticmethod
-    def _detect_cloudflare(page_content):
-        """
-        Detect the type of Cloudflare challenge present in the provided page content.
-
-        This function analyzes the given page content to identify whether a specific
-        type of Cloudflare challenge is present. It checks for three predefined
-        challenge types: non-interactive, managed, and interactive. If a challenge
-        type is detected, it returns the corresponding type as a string. If no
-        challenge type is detected, it returns None.
-
-        Args:
-            page_content (str): The content of the page to analyze for Cloudflare
-                challenge types.
-
-        Returns:
-            str: A string representing the detected Cloudflare challenge type, if
-                found. Returns None if no challenge matches.
-        """
-        challenge_types = (
-            "non-interactive",
-            "managed",
-            "interactive",
-        )
-        for ctype in challenge_types:
-            if f"cType: '{ctype}'" in page_content:
-                return ctype
-
-        return None
-
     def _solve_cloudflare(self, page: Page) -> None:  # pragma: no cover
         """Solve the cloudflare challenge displayed on the playwright page passed
 
@@ -375,20 +251,66 @@ class StealthySession:
         log.info("Cloudflare captcha is solved")
         return
 
-    def fetch(
+    def fetch(
+        self,
+        url: str,
+        google_search: bool = _UNSET,
+        timeout: int | float = _UNSET,
+        wait: int | float = _UNSET,
+        page_action: Optional[Callable] = _UNSET,
+        extra_headers: Optional[Dict[str, str]] = _UNSET,
+        disable_resources: bool = _UNSET,
+        wait_selector: Optional[str] = _UNSET,
+        wait_selector_state: SelectorWaitStates = _UNSET,
+        network_idle: bool = _UNSET,
+        load_dom: bool = _UNSET,
+        solve_cloudflare: bool = _UNSET,
+        selector_config: Optional[Dict] = _UNSET,
+    ) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
+        # Validate all resolved parameters
+        params = validate(
+            dict(
+                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
+                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
+                wait=self._get_with_precedence(wait, self.wait, _UNSET),
+                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
+                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
+                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
+                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
+                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
+                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
+                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
+                solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
+                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
+            ),
+            CamoufoxConfig,
+        )
+
         if self._closed:  # pragma: no cover
             raise RuntimeError("Context manager has been closed")
 
         final_response = None
         referer = (
-            generate_convincing_referer(url)
-            if (self.google_search and "referer" not in self._headers_keys)
-            else None
+            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )
 
         def handle_response(finished_response: SyncPlaywrightResponse):
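Every keyword of the new `fetch()` defaults to the module-level `_UNSET` sentinel, and `_get_with_precedence` (inherited from `_base.py`, whose body is not part of this diff) resolves each value before it is re-validated against `CamoufoxConfig`. A minimal sketch of the pattern; the helper body here is an assumption, only its call signature is visible above:

    _UNSET = object()  # sentinel: distinguishes "argument not passed" from falsy values such as None, 0, or False

    def _get_with_precedence(value, session_value, sentinel):
        # Assumed behavior: prefer the per-request argument when the caller supplied one
        return session_value if value is sentinel else value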
@@ -399,54 +321,57 @@ class StealthySession:
             ):
                 final_response = finished_response
 
-        page_info = self.
+        page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)
 
         try:  # pragma: no cover
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = page_info.page.goto(url, referer=referer)
-
+            if params.load_dom:
+                page_info.page.wait_for_load_state(state="domcontentloaded")
 
-            if
+            if params.network_idle:
                 page_info.page.wait_for_load_state("networkidle")
 
             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
 
-            if
+            if params.solve_cloudflare:
                 self._solve_cloudflare(page_info.page)
                 # Make sure the page is fully loaded after the captcha
                 page_info.page.wait_for_load_state(state="load")
-
-
+                if params.load_dom:
+                    page_info.page.wait_for_load_state(state="domcontentloaded")
+                if params.network_idle:
                     page_info.page.wait_for_load_state("networkidle")
 
-            if
+            if params.page_action:
                 try:
-
+                    _ = params.page_action(page_info.page)
                 except Exception as e:
                     log.error(f"Error executing page_action: {e}")
 
-            if
+            if params.wait_selector:
                 try:
-                    waiter: Locator = page_info.page.locator(
-                    waiter.first.wait_for(state=
+                    waiter: Locator = page_info.page.locator(params.wait_selector)
+                    waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
                     page_info.page.wait_for_load_state(state="load")
-
-
+                    if params.load_dom:
+                        page_info.page.wait_for_load_state(state="domcontentloaded")
+                    if params.network_idle:
                         page_info.page.wait_for_load_state("networkidle")
                 except Exception as e:
-                    log.error(f"Error waiting for selector {
+                    log.error(f"Error waiting for selector {params.wait_selector}: {e}")
 
-            page_info.page.wait_for_timeout(
+            page_info.page.wait_for_timeout(params.wait)
             response = ResponseFactory.from_playwright_response(
-                page_info.page, first_response, final_response,
+                page_info.page, first_response, final_response, params.selector_config
             )
 
-            # Mark the page as
-            page_info.
+            # Mark the page as finished for next use
+            page_info.mark_finished()
 
             return response
 
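Because each request-level keyword falls back to the session-level value, a single session can mix its defaults with per-call overrides. Continuing the earlier sketch (the URLs are placeholders):

    with StealthySession(timeout=30_000, network_idle=False) as session:
        home = session.fetch("https://example.com")  # session defaults apply
        app = session.fetch("https://example.com/app", network_idle=True, timeout=60_000)  # per-call overrides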
@@ -454,17 +379,8 @@ class StealthySession:
             page_info.mark_error()
             raise e
 
-    def get_pool_stats(self) -> Dict[str, int]:
-        """Get statistics about the current page pool"""
-        return {
-            "total_pages": self.page_pool.pages_count,
-            "ready_pages": self.page_pool.ready_count,
-            "busy_pages": self.page_pool.busy_count,
-            "max_pages": self.max_pages,
-        }
-
 
-class AsyncStealthySession(
+class AsyncStealthySession(StealthySessionMixin, AsyncSession):
     """A Stealthy session manager with page pooling."""
 
     def __init__(
@@ -476,6 +392,7 @@ class AsyncStealthySession(StealthySession):
         block_webrtc: bool = False,
         allow_webgl: bool = True,
         network_idle: bool = False,
+        load_dom: bool = True,
         humanize: bool | float = True,
         solve_cloudflare: bool = False,
         wait: int | float = 0,
@@ -510,11 +427,12 @@ class AsyncStealthySession(StealthySession):
         :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
         :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
         :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
-        :param page_action: Added for automation. A function that takes the `page` object
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
         :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
@@ -527,47 +445,47 @@ class AsyncStealthySession(StealthySession):
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            init_script,
-
-
-
-
-
-
-
-
-
-
-
+        self.__validate__(
+            wait=wait,
+            proxy=proxy,
+            geoip=geoip,
+            addons=addons,
+            timeout=timeout,
+            cookies=cookies,
+            headless=headless,
+            load_dom=load_dom,
+            humanize=humanize,
+            max_pages=max_pages,
+            disable_ads=disable_ads,
+            allow_webgl=allow_webgl,
+            page_action=page_action,
+            init_script=init_script,
+            network_idle=network_idle,
+            block_images=block_images,
+            block_webrtc=block_webrtc,
+            os_randomize=os_randomize,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            additional_args=additional_args,
+            selector_config=selector_config,
+            solve_cloudflare=solve_cloudflare,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
         )
-        self.
-        self.context: Optional[AsyncBrowserContext] = None
-        self._lock = Lock()
-        self.__enter__ = None
-        self.__exit__ = None
+        super().__init__(max_pages=self.max_pages)
 
     async def __create__(self):
         """Create a browser for this instance and context."""
         self.playwright: AsyncPlaywright = await async_playwright().start()
-        self.context: AsyncBrowserContext = (
-
-            **self.launch_options
-        )
+        self.context: AsyncBrowserContext = await self.playwright.firefox.launch_persistent_context(
+            **self.launch_options
         )
+
+        # Get the default page and close it
+        default_page = self.context.pages[0]
+        await default_page.close()
+
         if self.init_script:  # pragma: no cover
             await self.context.add_init_script(path=self.init_script)
 
@@ -596,39 +514,6 @@ class AsyncStealthySession(StealthySession):
 
         self._closed = True
 
-    async def _get_or_create_page(self) -> PageInfo:
-        """Get an available page or create a new one"""
-        async with self._lock:
-            # Try to get a ready page first
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-
-            # Create a new page if under limit
-            if self.page_pool.pages_count < self.max_pages:
-                page = await self.context.new_page()
-                page.set_default_navigation_timeout(self.timeout)
-                page.set_default_timeout(self.timeout)
-                if self.extra_headers:
-                    await page.set_extra_http_headers(self.extra_headers)
-
-                if self.disable_resources:
-                    await page.route("**/*", async_intercept_route)
-
-                return self.page_pool.add_page(page)
-
-            # Wait for a page to become available
-            max_wait = 30
-            start_time = time()
-
-            while time() - start_time < max_wait:  # pragma: no cover
-                page_info = self.page_pool.get_ready_page()
-                if page_info:
-                    return page_info
-                await asyncio_sleep(0.05)
-
-            raise TimeoutError("No pages available within timeout period")
-
     async def _solve_cloudflare(self, page: async_Page):
         """Solve the cloudflare challenge displayed on the playwright page passed. The async version
 
@@ -664,9 +549,7 @@ class AsyncStealthySession(StealthySession):
             await page.wait_for_timeout(500)
 
             # Calculate the Captcha coordinates for any viewport
-            outer_box = await page.locator(
-                ".main-content p+div>div>div"
-            ).bounding_box()
+            outer_box = await page.locator(".main-content p+div>div>div").bounding_box()
             captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
 
             # Move the mouse to the center of the window, then press and hold the left mouse button
@@ -677,20 +560,65 @@ class AsyncStealthySession(StealthySession):
         log.info("Cloudflare captcha is solved")
         return
 
-    async def fetch(
+    async def fetch(
+        self,
+        url: str,
+        google_search: bool = _UNSET,
+        timeout: int | float = _UNSET,
+        wait: int | float = _UNSET,
+        page_action: Optional[Callable] = _UNSET,
+        extra_headers: Optional[Dict[str, str]] = _UNSET,
+        disable_resources: bool = _UNSET,
+        wait_selector: Optional[str] = _UNSET,
+        wait_selector_state: SelectorWaitStates = _UNSET,
+        network_idle: bool = _UNSET,
+        load_dom: bool = _UNSET,
+        solve_cloudflare: bool = _UNSET,
+        selector_config: Optional[Dict] = _UNSET,
+    ) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
+        params = validate(
+            dict(
+                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
+                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
+                wait=self._get_with_precedence(wait, self.wait, _UNSET),
+                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
+                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
+                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
+                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
+                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
+                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
+                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
+                solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
+                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
+            ),
+            CamoufoxConfig,
+        )
+
         if self._closed:  # pragma: no cover
             raise RuntimeError("Context manager has been closed")
 
         final_response = None
         referer = (
-            generate_convincing_referer(url)
-            if (self.google_search and "referer" not in self._headers_keys)
-            else None
+            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )
 
         async def handle_response(finished_response: AsyncPlaywrightResponse):
@@ -701,56 +629,59 @@ class AsyncStealthySession(StealthySession):
             ):
                 final_response = finished_response
 
-        page_info = await self.
+        page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)
 
         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = await page_info.page.goto(url, referer=referer)
-
+            if params.load_dom:
+                await page_info.page.wait_for_load_state(state="domcontentloaded")
 
-            if
+            if params.network_idle:
                 await page_info.page.wait_for_load_state("networkidle")
 
             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
 
-            if
+            if params.solve_cloudflare:
                 await self._solve_cloudflare(page_info.page)
                 # Make sure the page is fully loaded after the captcha
                 await page_info.page.wait_for_load_state(state="load")
-
-
+                if params.load_dom:
+                    await page_info.page.wait_for_load_state(state="domcontentloaded")
+                if params.network_idle:
                     await page_info.page.wait_for_load_state("networkidle")
 
-            if
+            if params.page_action:
                 try:
-
+                    _ = await params.page_action(page_info.page)
                 except Exception as e:
                     log.error(f"Error executing page_action: {e}")
 
-            if
+            if params.wait_selector:
                 try:
-                    waiter: AsyncLocator = page_info.page.locator(
-                    await waiter.first.wait_for(state=
+                    waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
+                    await waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
                     await page_info.page.wait_for_load_state(state="load")
-
-
+                    if params.load_dom:
+                        await page_info.page.wait_for_load_state(state="domcontentloaded")
+                    if params.network_idle:
                         await page_info.page.wait_for_load_state("networkidle")
                 except Exception as e:
-                    log.error(f"Error waiting for selector {
+                    log.error(f"Error waiting for selector {params.wait_selector}: {e}")
 
-            await page_info.page.wait_for_timeout(
+            await page_info.page.wait_for_timeout(params.wait)
 
             # Create response object
             response = await ResponseFactory.from_async_playwright_response(
-                page_info.page, first_response, final_response,
+                page_info.page, first_response, final_response, params.selector_config
             )
 
-            # Mark the page as
-            page_info.
+            # Mark the page as finished for next use
+            page_info.mark_finished()
 
             return response
 
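The async class mirrors the same flow on top of `AsyncSession`, awaiting every Playwright call. A hedged sketch of driving it, assuming it is used as an async context manager like its sync counterpart; note that, unlike the sync class, its constructor exposes `max_pages` directly:

    import asyncio
    from scrapling.engines._browsers._camoufox import AsyncStealthySession

    async def main():
        async with AsyncStealthySession(max_pages=2, load_dom=True) as session:
            page = await session.fetch("https://example.com", solve_cloudflare=False)
            print(page.status)

    asyncio.run(main())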