scrapling-0.3-py3-none-any.whl → scrapling-0.3.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/cli.py +38 -51
- scrapling/core/_html_utils.py +3 -9
- scrapling/core/ai.py +5 -13
- scrapling/core/custom_types.py +19 -61
- scrapling/core/mixins.py +6 -28
- scrapling/core/shell.py +51 -129
- scrapling/core/storage.py +2 -8
- scrapling/core/translator.py +8 -20
- scrapling/core/utils/__init__.py +10 -0
- scrapling/core/utils/_shell.py +48 -0
- scrapling/core/{utils.py → utils/_utils.py} +5 -21
- scrapling/engines/__init__.py +0 -16
- scrapling/engines/_browsers/_base.py +297 -0
- scrapling/engines/_browsers/_camoufox.py +238 -293
- scrapling/engines/_browsers/_config_tools.py +2 -1
- scrapling/engines/_browsers/_controllers.py +220 -278
- scrapling/engines/_browsers/_page.py +37 -15
- scrapling/engines/_browsers/_validators.py +29 -15
- scrapling/engines/constants.py +3 -6
- scrapling/engines/static.py +25 -75
- scrapling/engines/toolbelt/__init__.py +1 -20
- scrapling/engines/toolbelt/convertor.py +95 -86
- scrapling/engines/toolbelt/custom.py +7 -99
- scrapling/engines/toolbelt/fingerprints.py +1 -3
- scrapling/engines/toolbelt/navigation.py +4 -58
- scrapling/fetchers.py +41 -24
- scrapling/parser.py +45 -122
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/METADATA +57 -47
- scrapling-0.3.2.dist-info/RECORD +44 -0
- scrapling-0.3.dist-info/RECORD +0 -41
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,8 @@
|
|
1
|
-
from time import time, sleep
|
2
1
|
from re import compile as re_compile
|
3
|
-
from asyncio import sleep as asyncio_sleep, Lock
|
4
2
|
|
5
|
-
from camoufox import DefaultAddons
|
6
|
-
from camoufox.utils import launch_options as generate_launch_options
|
7
3
|
from playwright.sync_api import (
|
8
4
|
Response as SyncPlaywrightResponse,
|
9
5
|
sync_playwright,
|
10
|
-
BrowserContext,
|
11
|
-
Playwright,
|
12
6
|
Locator,
|
13
7
|
Page,
|
14
8
|
)
|
@@ -21,9 +15,9 @@ from playwright.async_api import (
|
|
21
15
|
Page as async_Page,
|
22
16
|
)
|
23
17
|
|
24
|
-
from scrapling.core.utils import log
|
25
|
-
from ._page import PageInfo, PagePool
|
26
18
|
from ._validators import validate, CamoufoxConfig
|
19
|
+
from ._base import SyncSession, AsyncSession, StealthySessionMixin
|
20
|
+
from scrapling.core.utils import log
|
27
21
|
from scrapling.core._types import (
|
28
22
|
Dict,
|
29
23
|
List,
|
@@ -31,19 +25,17 @@ from scrapling.core._types import (
|
|
31
25
|
Callable,
|
32
26
|
SelectorWaitStates,
|
33
27
|
)
|
34
|
-
from scrapling.engines.toolbelt import (
|
28
|
+
from scrapling.engines.toolbelt.convertor import (
|
35
29
|
Response,
|
36
30
|
ResponseFactory,
|
37
|
-
async_intercept_route,
|
38
|
-
generate_convincing_referer,
|
39
|
-
get_os_name,
|
40
|
-
intercept_route,
|
41
31
|
)
|
32
|
+
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
42
33
|
|
43
34
|
__CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
|
35
|
+
_UNSET = object()
|
44
36
|
|
45
37
|
|
46
|
-
class StealthySession:
|
38
|
+
class StealthySession(StealthySessionMixin, SyncSession):
|
47
39
|
"""A Stealthy session manager with page pooling."""
|
48
40
|
|
49
41
|
__slots__ = (
|
@@ -54,12 +46,14 @@ class StealthySession:
|
|
54
46
|
"block_webrtc",
|
55
47
|
"allow_webgl",
|
56
48
|
"network_idle",
|
49
|
+
"load_dom",
|
57
50
|
"humanize",
|
58
51
|
"solve_cloudflare",
|
59
52
|
"wait",
|
60
53
|
"timeout",
|
61
54
|
"page_action",
|
62
55
|
"wait_selector",
|
56
|
+
"init_script",
|
63
57
|
"addons",
|
64
58
|
"wait_selector_state",
|
65
59
|
"cookies",
|
@@ -82,19 +76,21 @@ class StealthySession:
|
|
82
76
|
|
83
77
|
def __init__(
|
84
78
|
self,
|
85
|
-
|
79
|
+
__max_pages: int = 1,
|
86
80
|
headless: bool = True, # noqa: F821
|
87
81
|
block_images: bool = False,
|
88
82
|
disable_resources: bool = False,
|
89
83
|
block_webrtc: bool = False,
|
90
84
|
allow_webgl: bool = True,
|
91
85
|
network_idle: bool = False,
|
86
|
+
load_dom: bool = True,
|
92
87
|
humanize: bool | float = True,
|
93
88
|
solve_cloudflare: bool = False,
|
94
89
|
wait: int | float = 0,
|
95
90
|
timeout: int | float = 30000,
|
96
91
|
page_action: Optional[Callable] = None,
|
97
92
|
wait_selector: Optional[str] = None,
|
93
|
+
init_script: Optional[str] = None,
|
98
94
|
addons: Optional[List[str]] = None,
|
99
95
|
wait_selector_state: SelectorWaitStates = "attached",
|
100
96
|
cookies: Optional[List[Dict]] = None,
|
@@ -122,118 +118,68 @@ class StealthySession:
|
|
122
118
|
:param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
|
123
119
|
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
124
120
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
121
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
125
122
|
:param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
|
126
123
|
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
|
127
124
|
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
128
125
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
129
|
-
:param page_action: Added for automation. A function that takes the `page` object
|
126
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
130
127
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
128
|
+
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
131
129
|
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
|
132
130
|
It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
|
133
131
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
134
132
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
135
133
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
136
134
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
137
|
-
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
138
135
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
139
136
|
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
140
137
|
"""
|
141
138
|
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
self.max_pages = config.max_pages
|
171
|
-
self.headless = config.headless
|
172
|
-
self.block_images = config.block_images
|
173
|
-
self.disable_resources = config.disable_resources
|
174
|
-
self.block_webrtc = config.block_webrtc
|
175
|
-
self.allow_webgl = config.allow_webgl
|
176
|
-
self.network_idle = config.network_idle
|
177
|
-
self.humanize = config.humanize
|
178
|
-
self.solve_cloudflare = config.solve_cloudflare
|
179
|
-
self.wait = config.wait
|
180
|
-
self.timeout = config.timeout
|
181
|
-
self.page_action = config.page_action
|
182
|
-
self.wait_selector = config.wait_selector
|
183
|
-
self.addons = config.addons
|
184
|
-
self.wait_selector_state = config.wait_selector_state
|
185
|
-
self.cookies = config.cookies
|
186
|
-
self.google_search = config.google_search
|
187
|
-
self.extra_headers = config.extra_headers
|
188
|
-
self.proxy = config.proxy
|
189
|
-
self.os_randomize = config.os_randomize
|
190
|
-
self.disable_ads = config.disable_ads
|
191
|
-
self.geoip = config.geoip
|
192
|
-
self.selector_config = config.selector_config
|
193
|
-
self.additional_args = config.additional_args
|
194
|
-
|
195
|
-
self.playwright: Optional[Playwright] = None
|
196
|
-
self.context: Optional[BrowserContext] = None
|
197
|
-
self.page_pool = PagePool(self.max_pages)
|
198
|
-
self._closed = False
|
199
|
-
self.selector_config = config.selector_config
|
200
|
-
self.page_action = config.page_action
|
201
|
-
self._headers_keys = (
|
202
|
-
set(map(str.lower, self.extra_headers.keys()))
|
203
|
-
if self.extra_headers
|
204
|
-
else set()
|
205
|
-
)
|
206
|
-
self.__initiate_browser_options__()
|
207
|
-
|
208
|
-
def __initiate_browser_options__(self):
|
209
|
-
"""Initiate browser options."""
|
210
|
-
self.launch_options = generate_launch_options(
|
211
|
-
**{
|
212
|
-
"geoip": self.geoip,
|
213
|
-
"proxy": dict(self.proxy) if self.proxy else self.proxy,
|
214
|
-
"enable_cache": True,
|
215
|
-
"addons": self.addons,
|
216
|
-
"exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
|
217
|
-
"headless": self.headless,
|
218
|
-
"humanize": True if self.solve_cloudflare else self.humanize,
|
219
|
-
"i_know_what_im_doing": True, # To turn warnings off with the user configurations
|
220
|
-
"allow_webgl": self.allow_webgl,
|
221
|
-
"block_webrtc": self.block_webrtc,
|
222
|
-
"block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
|
223
|
-
"os": None if self.os_randomize else get_os_name(),
|
224
|
-
"user_data_dir": "",
|
225
|
-
**self.additional_args,
|
226
|
-
}
|
139
|
+
self.__validate__(
|
140
|
+
wait=wait,
|
141
|
+
proxy=proxy,
|
142
|
+
geoip=geoip,
|
143
|
+
addons=addons,
|
144
|
+
timeout=timeout,
|
145
|
+
cookies=cookies,
|
146
|
+
headless=headless,
|
147
|
+
humanize=humanize,
|
148
|
+
load_dom=load_dom,
|
149
|
+
max_pages=__max_pages,
|
150
|
+
disable_ads=disable_ads,
|
151
|
+
allow_webgl=allow_webgl,
|
152
|
+
page_action=page_action,
|
153
|
+
init_script=init_script,
|
154
|
+
network_idle=network_idle,
|
155
|
+
block_images=block_images,
|
156
|
+
block_webrtc=block_webrtc,
|
157
|
+
os_randomize=os_randomize,
|
158
|
+
wait_selector=wait_selector,
|
159
|
+
google_search=google_search,
|
160
|
+
extra_headers=extra_headers,
|
161
|
+
additional_args=additional_args,
|
162
|
+
selector_config=selector_config,
|
163
|
+
solve_cloudflare=solve_cloudflare,
|
164
|
+
disable_resources=disable_resources,
|
165
|
+
wait_selector_state=wait_selector_state,
|
227
166
|
)
|
167
|
+
super().__init__(max_pages=self.max_pages)
|
228
168
|
|
229
169
|
def __create__(self):
|
230
170
|
"""Create a browser for this instance and context."""
|
231
171
|
self.playwright = sync_playwright().start()
|
232
|
-
self.context = (
|
233
|
-
self.playwright.firefox.launch_persistent_context(
|
234
|
-
**self.launch_options
|
235
|
-
)
|
172
|
+
self.context = self.playwright.firefox.launch_persistent_context( # pragma: no cover
|
173
|
+
**self.launch_options
|
236
174
|
)
|
175
|
+
|
176
|
+
# Get the default page and close it
|
177
|
+
default_page = self.context.pages[0]
|
178
|
+
default_page.close()
|
179
|
+
|
180
|
+
if self.init_script: # pragma: no cover
|
181
|
+
self.context.add_init_script(path=self.init_script)
|
182
|
+
|
237
183
|
if self.cookies: # pragma: no cover
|
238
184
|
self.context.add_cookies(self.cookies)
|
239
185
|
|
@@ -259,68 +205,6 @@ class StealthySession:
|
|
259
205
|
|
260
206
|
self._closed = True
|
261
207
|
|
262
|
-
def _get_or_create_page(self) -> PageInfo: # pragma: no cover
|
263
|
-
"""Get an available page or create a new one"""
|
264
|
-
# Try to get a ready page first
|
265
|
-
page_info = self.page_pool.get_ready_page()
|
266
|
-
if page_info:
|
267
|
-
return page_info
|
268
|
-
|
269
|
-
# Create a new page if under limit
|
270
|
-
if self.page_pool.pages_count < self.max_pages:
|
271
|
-
page = self.context.new_page()
|
272
|
-
page.set_default_navigation_timeout(self.timeout)
|
273
|
-
page.set_default_timeout(self.timeout)
|
274
|
-
if self.extra_headers:
|
275
|
-
page.set_extra_http_headers(self.extra_headers)
|
276
|
-
|
277
|
-
if self.disable_resources:
|
278
|
-
page.route("**/*", intercept_route)
|
279
|
-
|
280
|
-
return self.page_pool.add_page(page)
|
281
|
-
|
282
|
-
# Wait for a page to become available
|
283
|
-
max_wait = 30
|
284
|
-
start_time = time()
|
285
|
-
|
286
|
-
while time() - start_time < max_wait:
|
287
|
-
page_info = self.page_pool.get_ready_page()
|
288
|
-
if page_info:
|
289
|
-
return page_info
|
290
|
-
sleep(0.05)
|
291
|
-
|
292
|
-
raise TimeoutError("No pages available within timeout period")
|
293
|
-
|
294
|
-
@staticmethod
|
295
|
-
def _detect_cloudflare(page_content):
|
296
|
-
"""
|
297
|
-
Detect the type of Cloudflare challenge present in the provided page content.
|
298
|
-
|
299
|
-
This function analyzes the given page content to identify whether a specific
|
300
|
-
type of Cloudflare challenge is present. It checks for three predefined
|
301
|
-
challenge types: non-interactive, managed, and interactive. If a challenge
|
302
|
-
type is detected, it returns the corresponding type as a string. If no
|
303
|
-
challenge type is detected, it returns None.
|
304
|
-
|
305
|
-
Args:
|
306
|
-
page_content (str): The content of the page to analyze for Cloudflare
|
307
|
-
challenge types.
|
308
|
-
|
309
|
-
Returns:
|
310
|
-
str: A string representing the detected Cloudflare challenge type, if
|
311
|
-
found. Returns None if no challenge matches.
|
312
|
-
"""
|
313
|
-
challenge_types = (
|
314
|
-
"non-interactive",
|
315
|
-
"managed",
|
316
|
-
"interactive",
|
317
|
-
)
|
318
|
-
for ctype in challenge_types:
|
319
|
-
if f"cType: '{ctype}'" in page_content:
|
320
|
-
return ctype
|
321
|
-
|
322
|
-
return None
|
323
|
-
|
324
208
|
def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
|
325
209
|
"""Solve the cloudflare challenge displayed on the playwright page passed
|
326
210
|
|
@@ -367,20 +251,66 @@ class StealthySession:
|
|
367
251
|
log.info("Cloudflare captcha is solved")
|
368
252
|
return
|
369
253
|
|
370
|
-
def fetch(self, url: str) -> Response:
|
254
|
+
def fetch(
|
255
|
+
self,
|
256
|
+
url: str,
|
257
|
+
google_search: bool = _UNSET,
|
258
|
+
timeout: int | float = _UNSET,
|
259
|
+
wait: int | float = _UNSET,
|
260
|
+
page_action: Optional[Callable] = _UNSET,
|
261
|
+
extra_headers: Optional[Dict[str, str]] = _UNSET,
|
262
|
+
disable_resources: bool = _UNSET,
|
263
|
+
wait_selector: Optional[str] = _UNSET,
|
264
|
+
wait_selector_state: SelectorWaitStates = _UNSET,
|
265
|
+
network_idle: bool = _UNSET,
|
266
|
+
load_dom: bool = _UNSET,
|
267
|
+
solve_cloudflare: bool = _UNSET,
|
268
|
+
selector_config: Optional[Dict] = _UNSET,
|
269
|
+
) -> Response:
|
371
270
|
"""Opens up the browser and do your request based on your chosen options.
|
372
271
|
|
373
272
|
:param url: The Target url.
|
273
|
+
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
274
|
+
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
275
|
+
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
276
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
277
|
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
278
|
+
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
279
|
+
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
280
|
+
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
281
|
+
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
282
|
+
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
283
|
+
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
284
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
285
|
+
:param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
|
286
|
+
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
374
287
|
:return: A `Response` object.
|
375
288
|
"""
|
289
|
+
# Validate all resolved parameters
|
290
|
+
params = validate(
|
291
|
+
dict(
|
292
|
+
google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
|
293
|
+
timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
|
294
|
+
wait=self._get_with_precedence(wait, self.wait, _UNSET),
|
295
|
+
page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
|
296
|
+
extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
|
297
|
+
disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
|
298
|
+
wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
|
299
|
+
wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
|
300
|
+
network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
|
301
|
+
load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
|
302
|
+
solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
|
303
|
+
selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
|
304
|
+
),
|
305
|
+
CamoufoxConfig,
|
306
|
+
)
|
307
|
+
|
376
308
|
if self._closed: # pragma: no cover
|
377
309
|
raise RuntimeError("Context manager has been closed")
|
378
310
|
|
379
311
|
final_response = None
|
380
312
|
referer = (
|
381
|
-
generate_convincing_referer(url)
|
382
|
-
if (self.google_search and "referer" not in self._headers_keys)
|
383
|
-
else None
|
313
|
+
generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
|
384
314
|
)
|
385
315
|
|
386
316
|
def handle_response(finished_response: SyncPlaywrightResponse):
|
@@ -391,54 +321,57 @@ class StealthySession:
|
|
391
321
|
):
|
392
322
|
final_response = finished_response
|
393
323
|
|
394
|
-
page_info = self._get_or_create_page()
|
324
|
+
page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
395
325
|
page_info.mark_busy(url=url)
|
396
326
|
|
397
327
|
try: # pragma: no cover
|
398
328
|
# Navigate to URL and wait for a specified state
|
399
329
|
page_info.page.on("response", handle_response)
|
400
330
|
first_response = page_info.page.goto(url, referer=referer)
|
401
|
-
|
331
|
+
if params.load_dom:
|
332
|
+
page_info.page.wait_for_load_state(state="domcontentloaded")
|
402
333
|
|
403
|
-
if self.network_idle:
|
334
|
+
if params.network_idle:
|
404
335
|
page_info.page.wait_for_load_state("networkidle")
|
405
336
|
|
406
337
|
if not first_response:
|
407
338
|
raise RuntimeError(f"Failed to get response for {url}")
|
408
339
|
|
409
|
-
if self.solve_cloudflare:
|
340
|
+
if params.solve_cloudflare:
|
410
341
|
self._solve_cloudflare(page_info.page)
|
411
342
|
# Make sure the page is fully loaded after the captcha
|
412
343
|
page_info.page.wait_for_load_state(state="load")
|
413
|
-
|
414
|
-
|
344
|
+
if params.load_dom:
|
345
|
+
page_info.page.wait_for_load_state(state="domcontentloaded")
|
346
|
+
if params.network_idle:
|
415
347
|
page_info.page.wait_for_load_state("networkidle")
|
416
348
|
|
417
|
-
if self.page_action:
|
349
|
+
if params.page_action:
|
418
350
|
try:
|
419
|
-
|
351
|
+
_ = params.page_action(page_info.page)
|
420
352
|
except Exception as e:
|
421
353
|
log.error(f"Error executing page_action: {e}")
|
422
354
|
|
423
|
-
if self.wait_selector:
|
355
|
+
if params.wait_selector:
|
424
356
|
try:
|
425
|
-
waiter: Locator = page_info.page.locator(self.wait_selector)
|
426
|
-
waiter.first.wait_for(state=self.wait_selector_state)
|
357
|
+
waiter: Locator = page_info.page.locator(params.wait_selector)
|
358
|
+
waiter.first.wait_for(state=params.wait_selector_state)
|
427
359
|
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
428
360
|
page_info.page.wait_for_load_state(state="load")
|
429
|
-
|
430
|
-
|
361
|
+
if params.load_dom:
|
362
|
+
page_info.page.wait_for_load_state(state="domcontentloaded")
|
363
|
+
if params.network_idle:
|
431
364
|
page_info.page.wait_for_load_state("networkidle")
|
432
365
|
except Exception as e:
|
433
|
-
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
366
|
+
log.error(f"Error waiting for selector {params.wait_selector}: {e}")
|
434
367
|
|
435
|
-
page_info.page.wait_for_timeout(self.wait)
|
368
|
+
page_info.page.wait_for_timeout(params.wait)
|
436
369
|
response = ResponseFactory.from_playwright_response(
|
437
|
-
page_info.page, first_response, final_response, self.selector_config
|
370
|
+
page_info.page, first_response, final_response, params.selector_config
|
438
371
|
)
|
439
372
|
|
440
|
-
# Mark the page as
|
441
|
-
page_info.
|
373
|
+
# Mark the page as finished for next use
|
374
|
+
page_info.mark_finished()
|
442
375
|
|
443
376
|
return response
|
444
377
|
|
@@ -446,17 +379,8 @@ class StealthySession:
|
|
446
379
|
page_info.mark_error()
|
447
380
|
raise e
|
448
381
|
|
449
|
-
def get_pool_stats(self) -> Dict[str, int]:
|
450
|
-
"""Get statistics about the current page pool"""
|
451
|
-
return {
|
452
|
-
"total_pages": self.page_pool.pages_count,
|
453
|
-
"ready_pages": self.page_pool.ready_count,
|
454
|
-
"busy_pages": self.page_pool.busy_count,
|
455
|
-
"max_pages": self.max_pages,
|
456
|
-
}
|
457
|
-
|
458
382
|
|
459
|
-
class AsyncStealthySession(StealthySession):
|
383
|
+
class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
460
384
|
"""A Stealthy session manager with page pooling."""
|
461
385
|
|
462
386
|
def __init__(
|
@@ -468,12 +392,14 @@ class AsyncStealthySession(StealthySession):
|
|
468
392
|
block_webrtc: bool = False,
|
469
393
|
allow_webgl: bool = True,
|
470
394
|
network_idle: bool = False,
|
395
|
+
load_dom: bool = True,
|
471
396
|
humanize: bool | float = True,
|
472
397
|
solve_cloudflare: bool = False,
|
473
398
|
wait: int | float = 0,
|
474
399
|
timeout: int | float = 30000,
|
475
400
|
page_action: Optional[Callable] = None,
|
476
401
|
wait_selector: Optional[str] = None,
|
402
|
+
init_script: Optional[str] = None,
|
477
403
|
addons: Optional[List[str]] = None,
|
478
404
|
wait_selector_state: SelectorWaitStates = "attached",
|
479
405
|
cookies: Optional[List[Dict]] = None,
|
@@ -501,12 +427,14 @@ class AsyncStealthySession(StealthySession):
|
|
501
427
|
:param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
|
502
428
|
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
503
429
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
430
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
504
431
|
:param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
|
505
432
|
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
|
506
433
|
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
507
434
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
508
|
-
:param page_action: Added for automation. A function that takes the `page` object
|
435
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
509
436
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
437
|
+
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
510
438
|
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
|
511
439
|
It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
|
512
440
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
@@ -517,46 +445,50 @@ class AsyncStealthySession(StealthySession):
|
|
517
445
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
518
446
|
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
519
447
|
"""
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
selector_config,
|
544
|
-
|
448
|
+
self.__validate__(
|
449
|
+
wait=wait,
|
450
|
+
proxy=proxy,
|
451
|
+
geoip=geoip,
|
452
|
+
addons=addons,
|
453
|
+
timeout=timeout,
|
454
|
+
cookies=cookies,
|
455
|
+
headless=headless,
|
456
|
+
load_dom=load_dom,
|
457
|
+
humanize=humanize,
|
458
|
+
max_pages=max_pages,
|
459
|
+
disable_ads=disable_ads,
|
460
|
+
allow_webgl=allow_webgl,
|
461
|
+
page_action=page_action,
|
462
|
+
init_script=init_script,
|
463
|
+
network_idle=network_idle,
|
464
|
+
block_images=block_images,
|
465
|
+
block_webrtc=block_webrtc,
|
466
|
+
os_randomize=os_randomize,
|
467
|
+
wait_selector=wait_selector,
|
468
|
+
google_search=google_search,
|
469
|
+
extra_headers=extra_headers,
|
470
|
+
additional_args=additional_args,
|
471
|
+
selector_config=selector_config,
|
472
|
+
solve_cloudflare=solve_cloudflare,
|
473
|
+
disable_resources=disable_resources,
|
474
|
+
wait_selector_state=wait_selector_state,
|
545
475
|
)
|
546
|
-
self.
|
547
|
-
self.context: Optional[AsyncBrowserContext] = None
|
548
|
-
self._lock = Lock()
|
549
|
-
self.__enter__ = None
|
550
|
-
self.__exit__ = None
|
476
|
+
super().__init__(max_pages=self.max_pages)
|
551
477
|
|
552
478
|
async def __create__(self):
|
553
479
|
"""Create a browser for this instance and context."""
|
554
480
|
self.playwright: AsyncPlaywright = await async_playwright().start()
|
555
|
-
self.context: AsyncBrowserContext = (
|
556
|
-
|
557
|
-
**self.launch_options
|
558
|
-
)
|
481
|
+
self.context: AsyncBrowserContext = await self.playwright.firefox.launch_persistent_context(
|
482
|
+
**self.launch_options
|
559
483
|
)
|
484
|
+
|
485
|
+
# Get the default page and close it
|
486
|
+
default_page = self.context.pages[0]
|
487
|
+
await default_page.close()
|
488
|
+
|
489
|
+
if self.init_script: # pragma: no cover
|
490
|
+
await self.context.add_init_script(path=self.init_script)
|
491
|
+
|
560
492
|
if self.cookies:
|
561
493
|
await self.context.add_cookies(self.cookies)
|
562
494
|
|
@@ -582,39 +514,6 @@ class AsyncStealthySession(StealthySession):
|
|
582
514
|
|
583
515
|
self._closed = True
|
584
516
|
|
585
|
-
async def _get_or_create_page(self) -> PageInfo:
|
586
|
-
"""Get an available page or create a new one"""
|
587
|
-
async with self._lock:
|
588
|
-
# Try to get a ready page first
|
589
|
-
page_info = self.page_pool.get_ready_page()
|
590
|
-
if page_info:
|
591
|
-
return page_info
|
592
|
-
|
593
|
-
# Create a new page if under limit
|
594
|
-
if self.page_pool.pages_count < self.max_pages:
|
595
|
-
page = await self.context.new_page()
|
596
|
-
page.set_default_navigation_timeout(self.timeout)
|
597
|
-
page.set_default_timeout(self.timeout)
|
598
|
-
if self.extra_headers:
|
599
|
-
await page.set_extra_http_headers(self.extra_headers)
|
600
|
-
|
601
|
-
if self.disable_resources:
|
602
|
-
await page.route("**/*", async_intercept_route)
|
603
|
-
|
604
|
-
return self.page_pool.add_page(page)
|
605
|
-
|
606
|
-
# Wait for a page to become available
|
607
|
-
max_wait = 30
|
608
|
-
start_time = time()
|
609
|
-
|
610
|
-
while time() - start_time < max_wait: # pragma: no cover
|
611
|
-
page_info = self.page_pool.get_ready_page()
|
612
|
-
if page_info:
|
613
|
-
return page_info
|
614
|
-
await asyncio_sleep(0.05)
|
615
|
-
|
616
|
-
raise TimeoutError("No pages available within timeout period")
|
617
|
-
|
618
517
|
async def _solve_cloudflare(self, page: async_Page):
|
619
518
|
"""Solve the cloudflare challenge displayed on the playwright page passed. The async version
|
620
519
|
|
@@ -650,9 +549,7 @@ class AsyncStealthySession(StealthySession):
|
|
650
549
|
await page.wait_for_timeout(500)
|
651
550
|
|
652
551
|
# Calculate the Captcha coordinates for any viewport
|
653
|
-
outer_box = await page.locator(
|
654
|
-
".main-content p+div>div>div"
|
655
|
-
).bounding_box()
|
552
|
+
outer_box = await page.locator(".main-content p+div>div>div").bounding_box()
|
656
553
|
captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
|
657
554
|
|
658
555
|
# Move the mouse to the center of the window, then press and hold the left mouse button
|
@@ -663,20 +560,65 @@ class AsyncStealthySession(StealthySession):
|
|
663
560
|
log.info("Cloudflare captcha is solved")
|
664
561
|
return
|
665
562
|
|
666
|
-
async def fetch(self, url: str) -> Response:
|
563
|
+
async def fetch(
|
564
|
+
self,
|
565
|
+
url: str,
|
566
|
+
google_search: bool = _UNSET,
|
567
|
+
timeout: int | float = _UNSET,
|
568
|
+
wait: int | float = _UNSET,
|
569
|
+
page_action: Optional[Callable] = _UNSET,
|
570
|
+
extra_headers: Optional[Dict[str, str]] = _UNSET,
|
571
|
+
disable_resources: bool = _UNSET,
|
572
|
+
wait_selector: Optional[str] = _UNSET,
|
573
|
+
wait_selector_state: SelectorWaitStates = _UNSET,
|
574
|
+
network_idle: bool = _UNSET,
|
575
|
+
load_dom: bool = _UNSET,
|
576
|
+
solve_cloudflare: bool = _UNSET,
|
577
|
+
selector_config: Optional[Dict] = _UNSET,
|
578
|
+
) -> Response:
|
667
579
|
"""Opens up the browser and do your request based on your chosen options.
|
668
580
|
|
669
581
|
:param url: The Target url.
|
582
|
+
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
583
|
+
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
584
|
+
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
585
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
586
|
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
587
|
+
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
588
|
+
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
589
|
+
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
590
|
+
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
591
|
+
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
592
|
+
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
593
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
594
|
+
:param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
|
595
|
+
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
670
596
|
:return: A `Response` object.
|
671
597
|
"""
|
598
|
+
params = validate(
|
599
|
+
dict(
|
600
|
+
google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
|
601
|
+
timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
|
602
|
+
wait=self._get_with_precedence(wait, self.wait, _UNSET),
|
603
|
+
page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
|
604
|
+
extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
|
605
|
+
disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
|
606
|
+
wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
|
607
|
+
wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
|
608
|
+
network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
|
609
|
+
load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
|
610
|
+
solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
|
611
|
+
selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
|
612
|
+
),
|
613
|
+
CamoufoxConfig,
|
614
|
+
)
|
615
|
+
|
672
616
|
if self._closed: # pragma: no cover
|
673
617
|
raise RuntimeError("Context manager has been closed")
|
674
618
|
|
675
619
|
final_response = None
|
676
620
|
referer = (
|
677
|
-
generate_convincing_referer(url)
|
678
|
-
if (self.google_search and "referer" not in self._headers_keys)
|
679
|
-
else None
|
621
|
+
generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
|
680
622
|
)
|
681
623
|
|
682
624
|
async def handle_response(finished_response: AsyncPlaywrightResponse):
|
@@ -687,56 +629,59 @@ class AsyncStealthySession(StealthySession):
|
|
687
629
|
):
|
688
630
|
final_response = finished_response
|
689
631
|
|
690
|
-
page_info = await self.
|
632
|
+
page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
691
633
|
page_info.mark_busy(url=url)
|
692
634
|
|
693
635
|
try:
|
694
636
|
# Navigate to URL and wait for a specified state
|
695
637
|
page_info.page.on("response", handle_response)
|
696
638
|
first_response = await page_info.page.goto(url, referer=referer)
|
697
|
-
|
639
|
+
if params.load_dom:
|
640
|
+
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
698
641
|
|
699
|
-
if
|
642
|
+
if params.network_idle:
|
700
643
|
await page_info.page.wait_for_load_state("networkidle")
|
701
644
|
|
702
645
|
if not first_response:
|
703
646
|
raise RuntimeError(f"Failed to get response for {url}")
|
704
647
|
|
705
|
-
if
|
648
|
+
if params.solve_cloudflare:
|
706
649
|
await self._solve_cloudflare(page_info.page)
|
707
650
|
# Make sure the page is fully loaded after the captcha
|
708
651
|
await page_info.page.wait_for_load_state(state="load")
|
709
|
-
|
710
|
-
|
652
|
+
if params.load_dom:
|
653
|
+
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
654
|
+
if params.network_idle:
|
711
655
|
await page_info.page.wait_for_load_state("networkidle")
|
712
656
|
|
713
|
-
if
|
657
|
+
if params.page_action:
|
714
658
|
try:
|
715
|
-
|
659
|
+
_ = await params.page_action(page_info.page)
|
716
660
|
except Exception as e:
|
717
661
|
log.error(f"Error executing page_action: {e}")
|
718
662
|
|
719
|
-
if
|
663
|
+
if params.wait_selector:
|
720
664
|
try:
|
721
|
-
waiter: AsyncLocator = page_info.page.locator(
|
722
|
-
await waiter.first.wait_for(state=
|
665
|
+
waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
|
666
|
+
await waiter.first.wait_for(state=params.wait_selector_state)
|
723
667
|
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
724
668
|
await page_info.page.wait_for_load_state(state="load")
|
725
|
-
|
726
|
-
|
669
|
+
if params.load_dom:
|
670
|
+
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
671
|
+
if params.network_idle:
|
727
672
|
await page_info.page.wait_for_load_state("networkidle")
|
728
673
|
except Exception as e:
|
729
|
-
log.error(f"Error waiting for selector {
|
674
|
+
log.error(f"Error waiting for selector {params.wait_selector}: {e}")
|
730
675
|
|
731
|
-
await page_info.page.wait_for_timeout(
|
676
|
+
await page_info.page.wait_for_timeout(params.wait)
|
732
677
|
|
733
678
|
# Create response object
|
734
679
|
response = await ResponseFactory.from_async_playwright_response(
|
735
|
-
page_info.page, first_response, final_response,
|
680
|
+
page_info.page, first_response, final_response, params.selector_config
|
736
681
|
)
|
737
682
|
|
738
|
-
# Mark the page as
|
739
|
-
page_info.
|
683
|
+
# Mark the page as finished for next use
|
684
|
+
page_info.mark_finished()
|
740
685
|
|
741
686
|
return response
|
742
687
|
|