scrapling 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +29 -19
- scrapling/cli.py +21 -4
- scrapling/core/_types.py +0 -2
- scrapling/core/ai.py +22 -14
- scrapling/core/shell.py +2 -1
- scrapling/core/storage.py +2 -1
- scrapling/core/utils/__init__.py +0 -1
- scrapling/engines/_browsers/__init__.py +0 -2
- scrapling/engines/_browsers/_base.py +9 -8
- scrapling/engines/_browsers/_camoufox.py +36 -22
- scrapling/engines/_browsers/_controllers.py +2 -2
- scrapling/engines/constants.py +0 -15
- scrapling/engines/static.py +419 -16
- scrapling/fetchers/__init__.py +36 -0
- scrapling/fetchers/chrome.py +205 -0
- scrapling/fetchers/firefox.py +216 -0
- scrapling/fetchers/requests.py +28 -0
- scrapling/parser.py +4 -4
- {scrapling-0.3.5.dist-info → scrapling-0.3.6.dist-info}/METADATA +17 -15
- {scrapling-0.3.5.dist-info → scrapling-0.3.6.dist-info}/RECORD +24 -21
- scrapling/fetchers.py +0 -444
- {scrapling-0.3.5.dist-info → scrapling-0.3.6.dist-info}/WHEEL +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.6.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.6.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.6.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -1,28 +1,38 @@
|
|
1
1
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
2
|
-
__version__ = "0.3.
|
2
|
+
__version__ = "0.3.6"
|
3
3
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
4
4
|
|
5
|
+
from typing import Any, TYPE_CHECKING
|
5
6
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
7
|
+
if TYPE_CHECKING:
|
8
|
+
from scrapling.parser import Selector, Selectors
|
9
|
+
from scrapling.core.custom_types import AttributesHandler, TextHandler
|
10
|
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
11
|
+
|
12
|
+
|
13
|
+
# Lazy import mapping
|
14
|
+
_LAZY_IMPORTS = {
|
15
|
+
"Fetcher": ("scrapling.fetchers", "Fetcher"),
|
16
|
+
"Selector": ("scrapling.parser", "Selector"),
|
17
|
+
"Selectors": ("scrapling.parser", "Selectors"),
|
18
|
+
"AttributesHandler": ("scrapling.core.custom_types", "AttributesHandler"),
|
19
|
+
"TextHandler": ("scrapling.core.custom_types", "TextHandler"),
|
20
|
+
"AsyncFetcher": ("scrapling.fetchers", "AsyncFetcher"),
|
21
|
+
"StealthyFetcher": ("scrapling.fetchers", "StealthyFetcher"),
|
22
|
+
"DynamicFetcher": ("scrapling.fetchers", "DynamicFetcher"),
|
23
|
+
}
|
24
|
+
__all__ = ["Selector", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
|
25
|
+
|
26
|
+
|
27
|
+
def __getattr__(name: str) -> Any:
|
28
|
+
if name in _LAZY_IMPORTS:
|
29
|
+
module_path, class_name = _LAZY_IMPORTS[name]
|
22
30
|
module = __import__(module_path, fromlist=[class_name])
|
23
31
|
return getattr(module, class_name)
|
24
32
|
else:
|
25
|
-
raise AttributeError(f"module
|
33
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
26
34
|
|
27
35
|
|
28
|
-
|
36
|
+
def __dir__() -> list[str]:
|
37
|
+
"""Support for dir() and autocomplete."""
|
38
|
+
return sorted(__all__ + ["fetchers", "parser", "cli", "core", "__author__", "__version__", "__copyright__"])
|
scrapling/cli.py
CHANGED
@@ -2,8 +2,9 @@ from pathlib import Path
|
|
2
2
|
from subprocess import check_output
|
3
3
|
from sys import executable as python_executable
|
4
4
|
|
5
|
+
from scrapling.core.utils import log
|
5
6
|
from scrapling.engines.toolbelt.custom import Response
|
6
|
-
from scrapling.core.utils import
|
7
|
+
from scrapling.core.utils._shell import _CookieParser, _ParseHeaders
|
7
8
|
from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
|
8
9
|
|
9
10
|
from orjson import loads as json_loads, JSONDecodeError
|
@@ -135,10 +136,26 @@ def install(force): # pragma: no cover
|
|
135
136
|
|
136
137
|
|
137
138
|
@command(help="Run Scrapling's MCP server (Check the docs for more info).")
|
138
|
-
|
139
|
+
@option(
|
140
|
+
"--http",
|
141
|
+
is_flag=True,
|
142
|
+
default=False,
|
143
|
+
help="Whether to run the MCP server in streamable-http transport or leave it as stdio (Default: False)",
|
144
|
+
)
|
145
|
+
@option(
|
146
|
+
"--host",
|
147
|
+
type=str,
|
148
|
+
default="0.0.0.0",
|
149
|
+
help="The host to use if streamable-http transport is enabled (Default: '0.0.0.0')",
|
150
|
+
)
|
151
|
+
@option(
|
152
|
+
"--port", type=int, default=8000, help="The port to use if streamable-http transport is enabled (Default: 8000)"
|
153
|
+
)
|
154
|
+
def mcp(http, host, port):
|
139
155
|
from scrapling.core.ai import ScraplingMCPServer
|
140
156
|
|
141
|
-
ScraplingMCPServer()
|
157
|
+
server = ScraplingMCPServer()
|
158
|
+
server.serve(http, host, port)
|
142
159
|
|
143
160
|
|
144
161
|
@command(help="Interactive scraping console")
|
@@ -766,7 +783,7 @@ def stealthy_fetch(
|
|
766
783
|
:param disable_resources: Drop requests of unnecessary resources for a speed boost.
|
767
784
|
:param block_webrtc: Blocks WebRTC entirely.
|
768
785
|
:param humanize: Humanize the cursor movement.
|
769
|
-
:param solve_cloudflare: Solves all
|
786
|
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
|
770
787
|
:param allow_webgl: Allow WebGL (recommended to keep enabled).
|
771
788
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
772
789
|
:param disable_ads: Install the uBlock Origin addon on the browser.
|
scrapling/core/_types.py
CHANGED
scrapling/core/ai.py
CHANGED
@@ -42,10 +42,7 @@ def _ContentTranslator(content: Generator[str, None, None], page: _ScraplingResp
|
|
42
42
|
|
43
43
|
|
44
44
|
class ScraplingMCPServer:
|
45
|
-
_server = FastMCP(name="Scrapling")
|
46
|
-
|
47
45
|
@staticmethod
|
48
|
-
@_server.tool()
|
49
46
|
def get(
|
50
47
|
url: str,
|
51
48
|
impersonate: Optional[BrowserTypeLiteral] = "chrome",
|
@@ -124,7 +121,6 @@ class ScraplingMCPServer:
|
|
124
121
|
)
|
125
122
|
|
126
123
|
@staticmethod
|
127
|
-
@_server.tool()
|
128
124
|
async def bulk_get(
|
129
125
|
urls: Tuple[str, ...],
|
130
126
|
impersonate: Optional[BrowserTypeLiteral] = "chrome",
|
@@ -211,7 +207,6 @@ class ScraplingMCPServer:
|
|
211
207
|
]
|
212
208
|
|
213
209
|
@staticmethod
|
214
|
-
@_server.tool()
|
215
210
|
async def fetch(
|
216
211
|
url: str,
|
217
212
|
extraction_type: extraction_types = "markdown",
|
@@ -263,7 +258,7 @@ class ScraplingMCPServer:
|
|
263
258
|
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
264
259
|
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
265
260
|
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
266
|
-
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers
|
261
|
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
267
262
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
268
263
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
269
264
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
@@ -300,7 +295,6 @@ class ScraplingMCPServer:
|
|
300
295
|
)
|
301
296
|
|
302
297
|
@staticmethod
|
303
|
-
@_server.tool()
|
304
298
|
async def bulk_fetch(
|
305
299
|
urls: Tuple[str, ...],
|
306
300
|
extraction_type: extraction_types = "markdown",
|
@@ -352,7 +346,7 @@ class ScraplingMCPServer:
|
|
352
346
|
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
353
347
|
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
354
348
|
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
355
|
-
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers
|
349
|
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
356
350
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
357
351
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
358
352
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
@@ -394,7 +388,6 @@ class ScraplingMCPServer:
|
|
394
388
|
]
|
395
389
|
|
396
390
|
@staticmethod
|
397
|
-
@_server.tool()
|
398
391
|
async def stealthy_fetch(
|
399
392
|
url: str,
|
400
393
|
extraction_type: extraction_types = "markdown",
|
@@ -443,7 +436,7 @@ class ScraplingMCPServer:
|
|
443
436
|
:param cookies: Set cookies for the next request.
|
444
437
|
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
445
438
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
446
|
-
:param solve_cloudflare: Solves all
|
439
|
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
447
440
|
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
448
441
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
449
442
|
:param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
|
@@ -494,7 +487,6 @@ class ScraplingMCPServer:
|
|
494
487
|
)
|
495
488
|
|
496
489
|
@staticmethod
|
497
|
-
@_server.tool()
|
498
490
|
async def bulk_stealthy_fetch(
|
499
491
|
urls: Tuple[str, ...],
|
500
492
|
extraction_type: extraction_types = "markdown",
|
@@ -543,7 +535,7 @@ class ScraplingMCPServer:
|
|
543
535
|
:param cookies: Set cookies for the next request.
|
544
536
|
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
545
537
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
546
|
-
:param solve_cloudflare: Solves all
|
538
|
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
547
539
|
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
548
540
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
549
541
|
:param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
|
@@ -598,6 +590,22 @@ class ScraplingMCPServer:
|
|
598
590
|
for page in responses
|
599
591
|
]
|
600
592
|
|
601
|
-
def serve(self):
|
593
|
+
def serve(self, http: bool, host: str, port: int):
|
602
594
|
"""Serve the MCP server."""
|
603
|
-
|
595
|
+
server = FastMCP(name="Scrapling", host=host, port=port)
|
596
|
+
server.add_tool(self.get, title="get", description=self.get.__doc__, structured_output=True)
|
597
|
+
server.add_tool(self.bulk_get, title="bulk_get", description=self.bulk_get.__doc__, structured_output=True)
|
598
|
+
server.add_tool(self.fetch, title="fetch", description=self.fetch.__doc__, structured_output=True)
|
599
|
+
server.add_tool(
|
600
|
+
self.bulk_fetch, title="bulk_fetch", description=self.bulk_fetch.__doc__, structured_output=True
|
601
|
+
)
|
602
|
+
server.add_tool(
|
603
|
+
self.stealthy_fetch, title="stealthy_fetch", description=self.stealthy_fetch.__doc__, structured_output=True
|
604
|
+
)
|
605
|
+
server.add_tool(
|
606
|
+
self.bulk_stealthy_fetch,
|
607
|
+
title="bulk_stealthy_fetch",
|
608
|
+
description=self.bulk_stealthy_fetch.__doc__,
|
609
|
+
structured_output=True,
|
610
|
+
)
|
611
|
+
server.run(transport="stdio" if not http else "streamable-http")
|
scrapling/core/shell.py
CHANGED
@@ -22,10 +22,11 @@ from logging import (
|
|
22
22
|
from orjson import loads as json_loads, JSONDecodeError
|
23
23
|
|
24
24
|
from scrapling import __version__
|
25
|
+
from scrapling.core.utils import log
|
25
26
|
from scrapling.parser import Selector, Selectors
|
26
27
|
from scrapling.core.custom_types import TextHandler
|
27
28
|
from scrapling.engines.toolbelt.custom import Response
|
28
|
-
from scrapling.core.utils import
|
29
|
+
from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
|
29
30
|
from scrapling.core._types import (
|
30
31
|
Optional,
|
31
32
|
Dict,
|
scrapling/core/storage.py
CHANGED
@@ -6,7 +6,6 @@ from sqlite3 import connect as db_connect
|
|
6
6
|
|
7
7
|
from orjson import dumps, loads
|
8
8
|
from lxml.html import HtmlElement
|
9
|
-
from tldextract import extract as tld
|
10
9
|
|
11
10
|
from scrapling.core.utils import _StorageTools, log
|
12
11
|
from scrapling.core._types import Dict, Optional, Any
|
@@ -26,6 +25,8 @@ class StorageSystemMixin(ABC): # pragma: no cover
|
|
26
25
|
return default_value
|
27
26
|
|
28
27
|
try:
|
28
|
+
from tldextract import extract as tld
|
29
|
+
|
29
30
|
extracted = tld(self.url)
|
30
31
|
return extracted.top_domain_under_public_suffix or extracted.domain or default_value
|
31
32
|
except AttributeError:
|
scrapling/core/utils/__init__.py
CHANGED
@@ -12,17 +12,13 @@ from camoufox.utils import (
|
|
12
12
|
installed_verstr as camoufox_version,
|
13
13
|
)
|
14
14
|
|
15
|
-
from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
|
16
|
-
from scrapling.core._types import (
|
17
|
-
Any,
|
18
|
-
Dict,
|
19
|
-
Optional,
|
20
|
-
)
|
21
15
|
from ._page import PageInfo, PagePool
|
22
|
-
from .
|
23
|
-
from .
|
16
|
+
from scrapling.parser import Selector
|
17
|
+
from scrapling.core._types import Dict, Optional
|
24
18
|
from scrapling.engines.toolbelt.fingerprints import get_os_name
|
25
19
|
from ._validators import validate, PlaywrightConfig, CamoufoxConfig
|
20
|
+
from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
|
21
|
+
from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
|
26
22
|
|
27
23
|
__ff_version_str__ = camoufox_version().split(".", 1)[0]
|
28
24
|
|
@@ -268,4 +264,9 @@ class StealthySessionMixin:
|
|
268
264
|
if f"cType: '{ctype}'" in page_content:
|
269
265
|
return ctype
|
270
266
|
|
267
|
+
# Check if turnstile captcha is embedded inside the page (Usually inside a closed Shadow iframe)
|
268
|
+
selector = Selector(content=page_content)
|
269
|
+
if selector.css('script[src*="challenges.cloudflare.com/turnstile/v"]'):
|
270
|
+
return "embedded"
|
271
|
+
|
271
272
|
return None
|
@@ -116,7 +116,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
116
116
|
:param cookies: Set cookies for the next request.
|
117
117
|
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
118
118
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
119
|
-
:param solve_cloudflare: Solves all
|
119
|
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
120
120
|
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
121
121
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
122
122
|
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
@@ -237,26 +237,33 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
237
237
|
return
|
238
238
|
|
239
239
|
else:
|
240
|
-
|
241
|
-
|
242
|
-
|
240
|
+
box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
|
241
|
+
if challenge_type != "embedded":
|
242
|
+
box_selector = ".main-content p+div>div>div"
|
243
|
+
while "Verifying you are human." in self._get_page_content(page):
|
244
|
+
# Waiting for the verify spinner to disappear, checking every 500 ms if it disappeared
|
245
|
+
page.wait_for_timeout(500)
|
243
246
|
|
244
247
|
iframe = page.frame(url=__CF_PATTERN__)
|
245
248
|
if iframe is None:
|
246
|
-
log.
|
249
|
+
log.error("Didn't find Cloudflare iframe!")
|
247
250
|
return
|
248
251
|
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
+
if challenge_type != "embedded":
|
253
|
+
while not iframe.frame_element().is_visible():
|
254
|
+
# Double-checking that the iframe is loaded
|
255
|
+
page.wait_for_timeout(500)
|
252
256
|
|
257
|
+
iframe.wait_for_load_state(state="domcontentloaded")
|
258
|
+
iframe.wait_for_load_state("networkidle")
|
253
259
|
# Calculate the Captcha coordinates for any viewport
|
254
|
-
outer_box = page.locator(
|
260
|
+
outer_box = page.locator(box_selector).last.bounding_box()
|
255
261
|
captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
|
256
262
|
|
257
263
|
# Click at the calculated captcha coordinates with the left mouse button
|
258
264
|
page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
|
259
|
-
|
265
|
+
if challenge_type != "embedded":
|
266
|
+
page.locator(".zone-name-title").wait_for(state="hidden")
|
260
267
|
page.wait_for_load_state(state="domcontentloaded")
|
261
268
|
|
262
269
|
log.info("Cloudflare captcha is solved")
|
@@ -293,7 +300,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
293
300
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
294
301
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
295
302
|
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
296
|
-
:param solve_cloudflare: Solves all
|
303
|
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
297
304
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
298
305
|
:return: A `Response` object.
|
299
306
|
"""
|
@@ -435,7 +442,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
435
442
|
:param cookies: Set cookies for the next request.
|
436
443
|
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
437
444
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
438
|
-
:param solve_cloudflare: Solves all
|
445
|
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
439
446
|
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
440
447
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
441
448
|
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
@@ -556,26 +563,33 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
556
563
|
return
|
557
564
|
|
558
565
|
else:
|
559
|
-
|
560
|
-
|
561
|
-
|
566
|
+
box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
|
567
|
+
if challenge_type != "embedded":
|
568
|
+
box_selector = ".main-content p+div>div>div"
|
569
|
+
while "Verifying you are human." in (await self._get_page_content(page)):
|
570
|
+
# Waiting for the verify spinner to disappear, checking every 500 ms if it disappeared
|
571
|
+
await page.wait_for_timeout(500)
|
562
572
|
|
563
573
|
iframe = page.frame(url=__CF_PATTERN__)
|
564
574
|
if iframe is None:
|
565
|
-
log.
|
575
|
+
log.error("Didn't find Cloudflare iframe!")
|
566
576
|
return
|
567
577
|
|
568
|
-
|
569
|
-
|
570
|
-
|
578
|
+
if challenge_type != "embedded":
|
579
|
+
while not await (await iframe.frame_element()).is_visible():
|
580
|
+
# Double-checking that the iframe is loaded
|
581
|
+
await page.wait_for_timeout(500)
|
571
582
|
|
583
|
+
await iframe.wait_for_load_state(state="domcontentloaded")
|
584
|
+
await iframe.wait_for_load_state("networkidle")
|
572
585
|
# Calculate the Captcha coordinates for any viewport
|
573
|
-
outer_box = await page.locator(
|
586
|
+
outer_box = await page.locator(box_selector).last.bounding_box()
|
574
587
|
captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
|
575
588
|
|
576
589
|
# Click at the calculated captcha coordinates with the left mouse button
|
577
590
|
await page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
|
578
|
-
|
591
|
+
if challenge_type != "embedded":
|
592
|
+
await page.locator(".zone-name-title").wait_for(state="hidden")
|
579
593
|
await page.wait_for_load_state(state="domcontentloaded")
|
580
594
|
|
581
595
|
log.info("Cloudflare captcha is solved")
|
@@ -612,7 +626,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
612
626
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
613
627
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
614
628
|
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
615
|
-
:param solve_cloudflare: Solves all
|
629
|
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
616
630
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
617
631
|
:return: A `Response` object.
|
618
632
|
"""
|
@@ -117,7 +117,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
117
117
|
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
118
118
|
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
119
119
|
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
120
|
-
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers
|
120
|
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
121
121
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
122
122
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
123
123
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
@@ -360,7 +360,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
360
360
|
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
361
361
|
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
362
362
|
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
363
|
-
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers
|
363
|
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
364
364
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
365
365
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
366
366
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
scrapling/engines/constants.py
CHANGED
@@ -101,18 +101,3 @@ DEFAULT_STEALTH_FLAGS = (
|
|
101
101
|
"--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
|
102
102
|
"--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees",
|
103
103
|
)
|
104
|
-
|
105
|
-
# Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
|
106
|
-
NSTBROWSER_DEFAULT_QUERY = {
|
107
|
-
"once": True,
|
108
|
-
"headless": True,
|
109
|
-
"autoClose": True,
|
110
|
-
"fingerprint": {
|
111
|
-
"flags": {"timezone": "BasedOnIp", "screen": "Custom"},
|
112
|
-
"platform": "linux", # support: windows, mac, linux
|
113
|
-
"kernel": "chromium", # only support: chromium
|
114
|
-
"kernelMilestone": "128",
|
115
|
-
"hardwareConcurrency": 8,
|
116
|
-
"deviceMemory": 8,
|
117
|
-
},
|
118
|
-
}
|