scrapling 0.2.99__py3-none-any.whl → 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +745 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +630 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +150 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +158 -175
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +227 -333
- scrapling/parser.py +781 -449
- scrapling-0.3.dist-info/METADATA +409 -0
- scrapling-0.3.dist-info/RECORD +41 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -25
- scrapling/engines/camo.py +0 -339
- scrapling/engines/pw.py +0 -465
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.99.dist-info/METADATA +0 -290
- scrapling-0.2.99.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -97
- tests/fetchers/async/test_httpx.py +0 -85
- tests/fetchers/async/test_playwright.py +0 -101
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -70
- tests/fetchers/sync/test_httpx.py +0 -84
- tests/fetchers/sync/test_playwright.py +0 -89
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,745 @@
|
|
1
|
+
from time import time, sleep
|
2
|
+
from re import compile as re_compile
|
3
|
+
from asyncio import sleep as asyncio_sleep, Lock
|
4
|
+
|
5
|
+
from camoufox import DefaultAddons
|
6
|
+
from camoufox.utils import launch_options as generate_launch_options
|
7
|
+
from playwright.sync_api import (
|
8
|
+
Response as SyncPlaywrightResponse,
|
9
|
+
sync_playwright,
|
10
|
+
BrowserContext,
|
11
|
+
Playwright,
|
12
|
+
Locator,
|
13
|
+
Page,
|
14
|
+
)
|
15
|
+
from playwright.async_api import (
|
16
|
+
async_playwright,
|
17
|
+
Response as AsyncPlaywrightResponse,
|
18
|
+
BrowserContext as AsyncBrowserContext,
|
19
|
+
Playwright as AsyncPlaywright,
|
20
|
+
Locator as AsyncLocator,
|
21
|
+
Page as async_Page,
|
22
|
+
)
|
23
|
+
|
24
|
+
from scrapling.core.utils import log
|
25
|
+
from ._page import PageInfo, PagePool
|
26
|
+
from ._validators import validate, CamoufoxConfig
|
27
|
+
from scrapling.core._types import (
|
28
|
+
Dict,
|
29
|
+
List,
|
30
|
+
Optional,
|
31
|
+
Callable,
|
32
|
+
SelectorWaitStates,
|
33
|
+
)
|
34
|
+
from scrapling.engines.toolbelt import (
|
35
|
+
Response,
|
36
|
+
ResponseFactory,
|
37
|
+
async_intercept_route,
|
38
|
+
generate_convincing_referer,
|
39
|
+
get_os_name,
|
40
|
+
intercept_route,
|
41
|
+
)
|
42
|
+
|
43
|
+
__CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
|
44
|
+
|
45
|
+
|
46
|
+
class StealthySession:
    """A Stealthy session manager with page pooling."""

    __slots__ = (
        "max_pages",
        "headless",
        "block_images",
        "disable_resources",
        "block_webrtc",
        "allow_webgl",
        "network_idle",
        "humanize",
        "solve_cloudflare",
        "wait",
        "timeout",
        "page_action",
        "wait_selector",
        "addons",
        "wait_selector_state",
        "cookies",
        "google_search",
        "extra_headers",
        "proxy",
        "os_randomize",
        "disable_ads",
        "geoip",
        "selector_config",
        "additional_args",
        "playwright",
        # NOTE(review): "browser" is reserved in __slots__ but never assigned in this
        # class — kept so subclasses/external code relying on the slot still work.
        "browser",
        "context",
        "page_pool",
        "_closed",
        "launch_options",
        "_headers_keys",
    )

    def __init__(
        self,
        max_pages: int = 1,
        headless: bool = True,  # noqa: F821
        block_images: bool = False,
        disable_resources: bool = False,
        block_webrtc: bool = False,
        allow_webgl: bool = True,
        network_idle: bool = False,
        humanize: bool | float = True,
        solve_cloudflare: bool = False,
        wait: int | float = 0,
        timeout: int | float = 30000,
        page_action: Optional[Callable] = None,
        wait_selector: Optional[str] = None,
        addons: Optional[List[str]] = None,
        wait_selector_state: SelectorWaitStates = "attached",
        cookies: Optional[List[Dict]] = None,
        google_search: bool = True,
        extra_headers: Optional[Dict[str, str]] = None,
        proxy: Optional[str | Dict[str, str]] = None,
        os_randomize: bool = False,
        disable_ads: bool = False,
        geoip: bool = False,
        selector_config: Optional[Dict] = None,
        additional_args: Optional[Dict] = None,
    ):
        """A Browser session manager with page pooling

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param block_images: Prevent the loading of images through Firefox preferences.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param block_webrtc: Blocks WebRTC entirely.
        :param cookies: Set cookies for the next request.
        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
        """
        # Run everything through the pydantic-style validator once, then copy the
        # validated values onto the instance (slots, so no __dict__).
        params = {
            "max_pages": max_pages,
            "headless": headless,
            "block_images": block_images,
            "disable_resources": disable_resources,
            "block_webrtc": block_webrtc,
            "allow_webgl": allow_webgl,
            "network_idle": network_idle,
            "humanize": humanize,
            "solve_cloudflare": solve_cloudflare,
            "wait": wait,
            "timeout": timeout,
            "page_action": page_action,
            "wait_selector": wait_selector,
            "addons": addons,
            "wait_selector_state": wait_selector_state,
            "cookies": cookies,
            "google_search": google_search,
            "extra_headers": extra_headers,
            "proxy": proxy,
            "os_randomize": os_randomize,
            "disable_ads": disable_ads,
            "geoip": geoip,
            "selector_config": selector_config,
            "additional_args": additional_args,
        }
        config = validate(params, CamoufoxConfig)

        self.max_pages = config.max_pages
        self.headless = config.headless
        self.block_images = config.block_images
        self.disable_resources = config.disable_resources
        self.block_webrtc = config.block_webrtc
        self.allow_webgl = config.allow_webgl
        self.network_idle = config.network_idle
        self.humanize = config.humanize
        self.solve_cloudflare = config.solve_cloudflare
        self.wait = config.wait
        self.timeout = config.timeout
        self.page_action = config.page_action
        self.wait_selector = config.wait_selector
        self.addons = config.addons
        self.wait_selector_state = config.wait_selector_state
        self.cookies = config.cookies
        self.google_search = config.google_search
        self.extra_headers = config.extra_headers
        self.proxy = config.proxy
        self.os_randomize = config.os_randomize
        self.disable_ads = config.disable_ads
        self.geoip = config.geoip
        self.selector_config = config.selector_config
        self.additional_args = config.additional_args

        # Runtime state — populated by __create__().
        self.playwright: Optional[Playwright] = None
        self.context: Optional[BrowserContext] = None
        self.page_pool = PagePool(self.max_pages)
        self._closed = False
        # (Fix: the original re-assigned selector_config/page_action a second
        #  time here — redundant duplicates removed.)
        # Lower-cased header names, used to detect a user-supplied "referer".
        self._headers_keys = (
            set(map(str.lower, self.extra_headers.keys()))
            if self.extra_headers
            else set()
        )
        self.__initiate_browser_options__()

    def __initiate_browser_options__(self):
        """Initiate browser options."""
        # User-supplied `additional_args` is unpacked last so it overrides
        # every Scrapling default.
        self.launch_options = generate_launch_options(
            **{
                "geoip": self.geoip,
                "proxy": dict(self.proxy) if self.proxy else self.proxy,
                "enable_cache": True,
                "addons": self.addons,
                "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
                "headless": self.headless,
                "humanize": True if self.solve_cloudflare else self.humanize,
                "i_know_what_im_doing": True,  # To turn warnings off with the user configurations
                "allow_webgl": self.allow_webgl,
                "block_webrtc": self.block_webrtc,
                "block_images": self.block_images,  # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
                "os": None if self.os_randomize else get_os_name(),
                "user_data_dir": "",
                **self.additional_args,
            }
        )

    def __create__(self):
        """Create a browser for this instance and context."""
        self.playwright = sync_playwright().start()
        self.context = (
            self.playwright.firefox.launch_persistent_context(  # pragma: no cover
                **self.launch_options
            )
        )
        if self.cookies:  # pragma: no cover
            self.context.add_cookies(self.cookies)

    def __enter__(self):  # pragma: no cover
        self.__create__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):  # pragma: no cover
        """Close all resources"""
        if self._closed:  # pragma: no cover
            return

        if self.context:
            self.context.close()
            self.context = None

        if self.playwright:
            self.playwright.stop()
            self.playwright = None

        self._closed = True

    def _get_or_create_page(self) -> PageInfo:  # pragma: no cover
        """Get an available page or create a new one"""
        # Try to get a ready page first
        page_info = self.page_pool.get_ready_page()
        if page_info:
            return page_info

        # Create a new page if under limit
        if self.page_pool.pages_count < self.max_pages:
            page = self.context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
            if self.extra_headers:
                page.set_extra_http_headers(self.extra_headers)

            if self.disable_resources:
                page.route("**/*", intercept_route)

            return self.page_pool.add_page(page)

        # Wait for a page to become available (pages are marked ready again
        # by fetch() once a request finishes).
        max_wait = 30
        start_time = time()

        while time() - start_time < max_wait:
            page_info = self.page_pool.get_ready_page()
            if page_info:
                return page_info
            sleep(0.05)

        raise TimeoutError("No pages available within timeout period")

    @staticmethod
    def _detect_cloudflare(page_content):
        """
        Detect the type of Cloudflare challenge present in the provided page content.

        This function analyzes the given page content to identify whether a specific
        type of Cloudflare challenge is present. It checks for three predefined
        challenge types: non-interactive, managed, and interactive. If a challenge
        type is detected, it returns the corresponding type as a string. If no
        challenge type is detected, it returns None.

        Args:
            page_content (str): The content of the page to analyze for Cloudflare
                challenge types.

        Returns:
            str: A string representing the detected Cloudflare challenge type, if
                found. Returns None if no challenge matches.
        """
        challenge_types = (
            "non-interactive",
            "managed",
            "interactive",
        )
        # Cloudflare embeds the challenge type in its inline JS config.
        for ctype in challenge_types:
            if f"cType: '{ctype}'" in page_content:
                return ctype

        return None

    def _solve_cloudflare(self, page: Page) -> None:  # pragma: no cover
        """Solve the cloudflare challenge displayed on the playwright page passed

        :param page: The targeted page
        :return:
        """
        challenge_type = self._detect_cloudflare(page.content())
        if not challenge_type:
            log.error("No Cloudflare challenge found.")
            return
        else:
            log.info(f'The turnstile version discovered is "{challenge_type}"')
            if challenge_type == "non-interactive":
                # Non-interactive resolves on its own; just poll the title.
                while "<title>Just a moment...</title>" in (page.content()):
                    log.info("Waiting for Cloudflare wait page to disappear.")
                    page.wait_for_timeout(1000)
                    page.wait_for_load_state()
                log.info("Cloudflare captcha is solved")
                return

            else:
                while "Verifying you are human." in page.content():
                    # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
                    page.wait_for_timeout(500)

                iframe = page.frame(url=__CF_PATTERN__)
                if iframe is None:
                    log.info("Didn't find Cloudflare iframe!")
                    return

                while not iframe.frame_element().is_visible():
                    # Double-checking that the iframe is loaded
                    page.wait_for_timeout(500)

                # Calculate the Captcha coordinates for any viewport
                outer_box = page.locator(".main-content p+div>div>div").bounding_box()
                captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25

                # Move the mouse to the center of the window, then press and hold the left mouse button
                page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
                page.locator(".zone-name-title").wait_for(state="hidden")
                page.wait_for_load_state(state="domcontentloaded")

                log.info("Cloudflare captcha is solved")
                return

    def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: The Target url.
        :return: A `Response` object.
        """
        if self._closed:  # pragma: no cover
            raise RuntimeError("Context manager has been closed")

        final_response = None
        referer = (
            generate_convincing_referer(url)
            if (self.google_search and "referer" not in self._headers_keys)
            else None
        )

        def handle_response(finished_response: SyncPlaywrightResponse):
            # Track the last top-frame document response; redirects mean the
            # first response isn't always the final one.
            nonlocal final_response
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
            ):
                final_response = finished_response

        page_info = self._get_or_create_page()
        page_info.mark_busy(url=url)

        try:  # pragma: no cover
            # Navigate to URL and wait for a specified state
            page_info.page.on("response", handle_response)
            first_response = page_info.page.goto(url, referer=referer)
            page_info.page.wait_for_load_state(state="domcontentloaded")

            if self.network_idle:
                page_info.page.wait_for_load_state("networkidle")

            if not first_response:
                raise RuntimeError(f"Failed to get response for {url}")

            if self.solve_cloudflare:
                self._solve_cloudflare(page_info.page)
                # Make sure the page is fully loaded after the captcha
                page_info.page.wait_for_load_state(state="load")
                page_info.page.wait_for_load_state(state="domcontentloaded")
                if self.network_idle:
                    page_info.page.wait_for_load_state("networkidle")

            if self.page_action is not None:
                try:
                    page_info.page = self.page_action(page_info.page)
                except Exception as e:
                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector:
                try:
                    waiter: Locator = page_info.page.locator(self.wait_selector)
                    waiter.first.wait_for(state=self.wait_selector_state)
                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
                    page_info.page.wait_for_load_state(state="load")
                    page_info.page.wait_for_load_state(state="domcontentloaded")
                    if self.network_idle:
                        page_info.page.wait_for_load_state("networkidle")
                except Exception as e:
                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            page_info.page.wait_for_timeout(self.wait)
            response = ResponseFactory.from_playwright_response(
                page_info.page, first_response, final_response, self.selector_config
            )

            # Mark the page as ready for next use
            page_info.mark_ready()

            return response

        except Exception:  # pragma: no cover
            page_info.mark_error()
            # Bare raise keeps the original traceback intact.
            raise
        finally:
            # Fix: detach the per-request listener so pooled pages reused by
            # later fetches don't accumulate stale handlers (and leak the
            # closures they capture).
            page_info.page.remove_listener("response", handle_response)

    def get_pool_stats(self) -> Dict[str, int]:
        """Get statistics about the current page pool"""
        return {
            "total_pages": self.page_pool.pages_count,
            "ready_pages": self.page_pool.ready_count,
            "busy_pages": self.page_pool.busy_count,
            "max_pages": self.max_pages,
        }
|
457
|
+
|
458
|
+
|
459
|
+
class AsyncStealthySession(StealthySession):
|
460
|
+
"""A Stealthy session manager with page pooling."""
|
461
|
+
|
462
|
+
    def __init__(
        self,
        max_pages: int = 1,
        headless: bool = True,  # noqa: F821
        block_images: bool = False,
        disable_resources: bool = False,
        block_webrtc: bool = False,
        allow_webgl: bool = True,
        network_idle: bool = False,
        humanize: bool | float = True,
        solve_cloudflare: bool = False,
        wait: int | float = 0,
        timeout: int | float = 30000,
        page_action: Optional[Callable] = None,
        wait_selector: Optional[str] = None,
        addons: Optional[List[str]] = None,
        wait_selector_state: SelectorWaitStates = "attached",
        cookies: Optional[List[Dict]] = None,
        google_search: bool = True,
        extra_headers: Optional[Dict[str, str]] = None,
        proxy: Optional[str | Dict[str, str]] = None,
        os_randomize: bool = False,
        disable_ads: bool = False,
        geoip: bool = False,
        selector_config: Optional[Dict] = None,
        additional_args: Optional[Dict] = None,
    ):
        """A Browser session manager with page pooling

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param block_images: Prevent the loading of images through Firefox preferences.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param block_webrtc: Blocks WebRTC entirely.
        :param cookies: Set cookies for the next request.
        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
        """
        # Delegate all validation/assignment to StealthySession.__init__
        # (positional pass-through — order must match the parent signature).
        super().__init__(
            max_pages,
            headless,
            block_images,
            disable_resources,
            block_webrtc,
            allow_webgl,
            network_idle,
            humanize,
            solve_cloudflare,
            wait,
            timeout,
            page_action,
            wait_selector,
            addons,
            wait_selector_state,
            cookies,
            google_search,
            extra_headers,
            proxy,
            os_randomize,
            disable_ads,
            geoip,
            selector_config,
            additional_args,
        )
        # Re-declare the runtime handles with their async playwright types;
        # they are populated by the async __create__().
        self.playwright: Optional[AsyncPlaywright] = None
        self.context: Optional[AsyncBrowserContext] = None
        # Serializes page creation/acquisition across concurrent fetches.
        self._lock = Lock()
        # NOTE(review): presumably intended to disable the parent's sync
        # context-manager protocol on this async subclass — but Python looks
        # up __enter__/__exit__ on the type, not the instance, so these
        # instance assignments do not actually prevent `with` usage. Confirm
        # intent before changing.
        self.__enter__ = None
        self.__exit__ = None
|
551
|
+
|
552
|
+
async def __create__(self):
|
553
|
+
"""Create a browser for this instance and context."""
|
554
|
+
self.playwright: AsyncPlaywright = await async_playwright().start()
|
555
|
+
self.context: AsyncBrowserContext = (
|
556
|
+
await self.playwright.firefox.launch_persistent_context(
|
557
|
+
**self.launch_options
|
558
|
+
)
|
559
|
+
)
|
560
|
+
if self.cookies:
|
561
|
+
await self.context.add_cookies(self.cookies)
|
562
|
+
|
563
|
+
async def __aenter__(self):
|
564
|
+
await self.__create__()
|
565
|
+
return self
|
566
|
+
|
567
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
568
|
+
await self.close()
|
569
|
+
|
570
|
+
async def close(self):
|
571
|
+
"""Close all resources"""
|
572
|
+
if self._closed: # pragma: no cover
|
573
|
+
return
|
574
|
+
|
575
|
+
if self.context:
|
576
|
+
await self.context.close()
|
577
|
+
self.context = None
|
578
|
+
|
579
|
+
if self.playwright:
|
580
|
+
await self.playwright.stop()
|
581
|
+
self.playwright = None
|
582
|
+
|
583
|
+
self._closed = True
|
584
|
+
|
585
|
+
    async def _get_or_create_page(self) -> PageInfo:
        """Get an available page or create a new one"""
        # The whole acquisition path runs under the session lock so two
        # concurrent fetches can't both create page max_pages+1.
        async with self._lock:
            # Try to get a ready page first
            page_info = self.page_pool.get_ready_page()
            if page_info:
                return page_info

            # Create a new page if under limit
            if self.page_pool.pages_count < self.max_pages:
                page = await self.context.new_page()
                page.set_default_navigation_timeout(self.timeout)
                page.set_default_timeout(self.timeout)
                if self.extra_headers:
                    await page.set_extra_http_headers(self.extra_headers)

                if self.disable_resources:
                    await page.route("**/*", async_intercept_route)

                return self.page_pool.add_page(page)

            # Wait for a page to become available
            # NOTE(review): this poll loop appears to run while still holding
            # self._lock — pages are marked ready by fetch() without the lock,
            # so progress is possible, but other acquirers are blocked for up
            # to max_wait. Confirm the intended lock scope.
            max_wait = 30
            start_time = time()

            while time() - start_time < max_wait:  # pragma: no cover
                page_info = self.page_pool.get_ready_page()
                if page_info:
                    return page_info
                await asyncio_sleep(0.05)

            raise TimeoutError("No pages available within timeout period")
|
617
|
+
|
618
|
+
    async def _solve_cloudflare(self, page: async_Page):
        """Solve the cloudflare challenge displayed on the playwright page passed. The async version

        :param page: The async targeted page
        :return:
        """
        # Identify which of the three Turnstile variants is on the page.
        challenge_type = self._detect_cloudflare(await page.content())
        if not challenge_type:
            log.error("No Cloudflare challenge found.")
            return
        else:
            log.info(f'The turnstile version discovered is "{challenge_type}"')
            if challenge_type == "non-interactive":  # pragma: no cover
                # Non-interactive challenges resolve on their own; poll the
                # title until the wait page is gone.
                while "<title>Just a moment...</title>" in (await page.content()):
                    log.info("Waiting for Cloudflare wait page to disappear.")
                    await page.wait_for_timeout(1000)
                    await page.wait_for_load_state()
                log.info("Cloudflare captcha is solved")
                return

            else:
                # Managed/interactive: wait out the spinner first.
                while "Verifying you are human." in (await page.content()):
                    # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
                    await page.wait_for_timeout(500)

                # Locate the Turnstile iframe by its challenge-platform URL.
                iframe = page.frame(url=__CF_PATTERN__)
                if iframe is None:
                    log.info("Didn't find Cloudflare iframe!")
                    return

                while not await (await iframe.frame_element()).is_visible():
                    # Double-checking that the iframe is loaded
                    await page.wait_for_timeout(500)

                # Calculate the Captcha coordinates for any viewport
                # (offsets 26/25 target the checkbox inside the widget box).
                outer_box = await page.locator(
                    ".main-content p+div>div>div"
                ).bounding_box()
                captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25

                # Move the mouse to the center of the window, then press and hold the left mouse button
                await page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
                await page.locator(".zone-name-title").wait_for(state="hidden")
                await page.wait_for_load_state(state="domcontentloaded")

                log.info("Cloudflare captcha is solved")
                return
|
665
|
+
|
666
|
+
async def fetch(self, url: str) -> Response:
    """Opens up the browser and do your request based on your chosen options.

    :param url: The Target url.
    :return: A `Response` object.
    :raises RuntimeError: If the context manager was already closed, or if
        navigation produced no response object for `url`.
    """
    if self._closed:  # pragma: no cover
        raise RuntimeError("Context manager has been closed")

    final_response = None
    # Spoof a convincing Google-search referer unless the caller already set one.
    referer = (
        generate_convincing_referer(url)
        if (self.google_search and "referer" not in self._headers_keys)
        else None
    )

    async def handle_response(finished_response: AsyncPlaywrightResponse):
        # Track the last top-frame document navigation response; this captures the
        # final response after redirects/challenges, which `goto()` alone would miss.
        nonlocal final_response
        if (
            finished_response.request.resource_type == "document"
            and finished_response.request.is_navigation_request()
        ):
            final_response = finished_response

    page_info = await self._get_or_create_page()
    page_info.mark_busy(url=url)

    # Bind the page object the handler is attached to, so cleanup removes the
    # listener from that same page even if `page_action` swaps `page_info.page`.
    page = page_info.page
    page.on("response", handle_response)

    try:
        # Navigate to URL and wait for a specified state
        first_response = await page_info.page.goto(url, referer=referer)
        await page_info.page.wait_for_load_state(state="domcontentloaded")

        if self.network_idle:
            await page_info.page.wait_for_load_state("networkidle")

        if not first_response:
            raise RuntimeError(f"Failed to get response for {url}")

        if self.solve_cloudflare:
            await self._solve_cloudflare(page_info.page)
            # Make sure the page is fully loaded after the captcha
            await page_info.page.wait_for_load_state(state="load")
            await page_info.page.wait_for_load_state(state="domcontentloaded")
            if self.network_idle:
                await page_info.page.wait_for_load_state("networkidle")

        if self.page_action is not None:
            # User callbacks are best-effort: a failing action shouldn't abort the fetch.
            try:
                page_info.page = await self.page_action(page_info.page)
            except Exception as e:
                log.error(f"Error executing page_action: {e}")

        if self.wait_selector:
            try:
                waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
                await waiter.first.wait_for(state=self.wait_selector_state)
                # Wait again after waiting for the selector, helpful with protections like Cloudflare
                await page_info.page.wait_for_load_state(state="load")
                await page_info.page.wait_for_load_state(state="domcontentloaded")
                if self.network_idle:
                    await page_info.page.wait_for_load_state("networkidle")
            except Exception as e:
                log.error(f"Error waiting for selector {self.wait_selector}: {e}")

        await page_info.page.wait_for_timeout(self.wait)

        # Create response object
        response = await ResponseFactory.from_async_playwright_response(
            page_info.page, first_response, final_response, self.selector_config
        )

        # Mark the page as ready for next use
        page_info.mark_ready()

        return response

    except Exception:
        page_info.mark_error()
        # Bare `raise` re-raises the active exception with its original traceback
        # (the original `raise e` added a redundant re-raise frame).
        raise
    finally:
        # Detach the handler: pages are pooled and reused after `mark_ready()`, so a
        # leftover listener would leak this fetch's closure and keep overwriting
        # `final_response` state during every later fetch on the same page.
        page.remove_listener("response", handle_response)
|