scrapling 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +29 -19
- scrapling/cli.py +21 -4
- scrapling/core/_types.py +3 -2
- scrapling/core/ai.py +24 -15
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +6 -4
- scrapling/core/storage.py +7 -6
- scrapling/core/translator.py +13 -8
- scrapling/core/utils/__init__.py +0 -1
- scrapling/engines/_browsers/__init__.py +0 -2
- scrapling/engines/_browsers/_base.py +45 -21
- scrapling/engines/_browsers/_camoufox.py +98 -43
- scrapling/engines/_browsers/_config_tools.py +1 -1
- scrapling/engines/_browsers/_controllers.py +34 -13
- scrapling/engines/_browsers/_validators.py +31 -10
- scrapling/engines/constants.py +0 -15
- scrapling/engines/static.py +749 -336
- scrapling/engines/toolbelt/convertor.py +13 -15
- scrapling/engines/toolbelt/custom.py +6 -9
- scrapling/engines/toolbelt/fingerprints.py +17 -10
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +46 -0
- scrapling/fetchers/chrome.py +210 -0
- scrapling/fetchers/firefox.py +212 -0
- scrapling/fetchers/requests.py +28 -0
- scrapling/parser.py +109 -84
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/METADATA +17 -16
- scrapling-0.3.7.dist-info/RECORD +47 -0
- scrapling/fetchers.py +0 -444
- scrapling-0.3.5.dist-info/RECORD +0 -44
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
```diff
--- a/scrapling/engines/toolbelt/convertor.py
+++ b/scrapling/engines/toolbelt/convertor.py
@@ -24,15 +24,15 @@ class ResponseFactory:
 
     @classmethod
     @lru_cache(maxsize=16)
-    def __extract_browser_encoding(cls, content_type: str | None) -> str | None:
+    def __extract_browser_encoding(cls, content_type: str | None, default: str = "utf-8") -> str:
        """Extract browser encoding from headers.
         Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8"
         """
         if content_type:
             # Because Playwright can't do that by themselves like all libraries for some reason :3
             match = __CHARSET_RE__.search(content_type)
-            return match.group(1) if match else None
-        return None
+            return match.group(1) if match else default
+        return default
 
     @classmethod
     def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
@@ -58,7 +58,8 @@
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                or "utf-8",
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": current_response.all_headers() if current_response else {},
                 "request_headers": current_request.all_headers(),
@@ -107,15 +108,13 @@
         if not final_response:
             raise ValueError("Failed to get a response from the page")
 
-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        )  # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
 
         history = cls._process_response_history(first_response, parser_arguments)
         try:
-            page_content =
+            page_content = final_response.text()
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content: {e}")
             page_content = ""
@@ -161,7 +160,8 @@
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                or "utf-8",
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": await current_response.all_headers() if current_response else {},
                 "request_headers": await current_request.all_headers(),
@@ -210,15 +210,13 @@
         if not final_response:
             raise ValueError("Failed to get a response from the page")
 
-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        )  # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
 
         history = await cls._async_process_response_history(first_response, parser_arguments)
         try:
-            page_content = await
+            page_content = await final_response.text()
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content in async: {e}")
             page_content = ""
@@ -255,8 +253,8 @@
             "encoding": response.encoding or "utf-8",
             "cookies": dict(response.cookies),
             "headers": dict(response.headers),
-            "request_headers": dict(response.request.headers),
-            "method": response.request.method,
+            "request_headers": dict(response.request.headers) if response.request else {},
+            "method": response.request.method if response.request else "GET",
             "history": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
             **parser_arguments,
         }
```
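The core change in `convertor.py` is that `__extract_browser_encoding` now takes a `default` argument and always returns a string, so callers no longer need an `or "utf-8"` fallback; the remaining `if current_response` guards cover a missing response rather than a missing charset. A minimal sketch of the new behavior, assuming a charset regex along the lines of the one below (the real `__CHARSET_RE__` is defined elsewhere in the module and is not part of this diff):

```python
import re

# Assumed pattern; the actual __CHARSET_RE__ lives elsewhere in convertor.py
__CHARSET_RE__ = re.compile(r"charset=([\w-]+)")


def extract_browser_encoding(content_type: str | None, default: str = "utf-8") -> str:
    # Mirrors the new logic: always returns a string, never None
    if content_type:
        match = __CHARSET_RE__.search(content_type)
        return match.group(1) if match else default
    return default


assert extract_browser_encoding("text/html; charset=ISO-8859-1") == "ISO-8859-1"
assert extract_browser_encoding("text/html") == "utf-8"  # no charset, use default
assert extract_browser_encoding(None) == "utf-8"         # missing header, use default
```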
```diff
--- a/scrapling/engines/toolbelt/custom.py
+++ b/scrapling/engines/toolbelt/custom.py
@@ -8,6 +8,7 @@ from scrapling.core.utils import log
 from scrapling.core._types import (
     Any,
     Dict,
+    cast,
     List,
     Optional,
     Tuple,
@@ -30,10 +31,10 @@ class Response(Selector):
         request_headers: Dict,
         encoding: str = "utf-8",
         method: str = "GET",
-        history: List = None,
-        **selector_config:
+        history: List | None = None,
+        **selector_config: Any,
     ):
-        adaptive_domain = selector_config.pop("adaptive_domain", None)
+        adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
         self.status = status
         self.reason = reason
         self.cookies = cookies
@@ -58,7 +59,7 @@ class BaseFetcher:
     keep_cdata: Optional[bool] = False
     storage_args: Optional[Dict] = None
     keep_comments: Optional[bool] = False
-    adaptive_domain: Optional[str] = None
+    adaptive_domain: str = ""
     parser_keywords: Tuple = (
         "huge_tree",
         "adaptive",
@@ -124,12 +125,8 @@ class BaseFetcher:
             adaptive=cls.adaptive,
             storage=cls.storage,
             storage_args=cls.storage_args,
+            adaptive_domain=cls.adaptive_domain,
         )
-        if cls.adaptive_domain:
-            if not isinstance(cls.adaptive_domain, str):
-                log.warning('[Ignored] The argument "adaptive_domain" must be of string type')
-            else:
-                parser_arguments.update({"adaptive_domain": cls.adaptive_domain})
 
         return parser_arguments
```
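In `custom.py` the typing of `Response.__init__` is tightened (`history: List | None`, `**selector_config: Any`), and the runtime string check for `adaptive_domain` disappears: `BaseFetcher.adaptive_domain` is now always a `str` (default `""`), so the popped value can simply be `cast`. A small sketch of that pattern; `init_example` below is a hypothetical stand-in for the constructor, not scrapling's API:

```python
from typing import Any, cast


def init_example(**selector_config: Any) -> None:
    # pop() returns Any; cast() narrows the type for checkers with no runtime
    # validation, which is safe here because the producer guarantees a str default ("")
    adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
    print(repr(adaptive_domain), selector_config)


init_example(adaptive_domain="example.com", huge_tree=True)  # 'example.com' {'huge_tree': True}
```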
```diff
--- a/scrapling/engines/toolbelt/fingerprints.py
+++ b/scrapling/engines/toolbelt/fingerprints.py
@@ -8,9 +8,10 @@ from platform import system as platform_system
 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator
 
-from scrapling.core._types import Dict,
+from scrapling.core._types import Dict, Literal
 
 __OS_NAME__ = platform_system()
+OSName = Literal["linux", "macos", "windows"]
 
 
 @lru_cache(10, typed=True)
@@ -28,16 +29,20 @@ def generate_convincing_referer(url: str) -> str:
 
 
 @lru_cache(1, typed=True)
-def get_os_name() ->
-    """Get the current OS name in the same format needed for browserforge
+def get_os_name() -> OSName | None:
+    """Get the current OS name in the same format needed for browserforge, if the OS is Unknown, return None so browserforge uses all.
 
     :return: Current OS name or `None` otherwise
     """
-    return {
-        "Linux": "linux",
-        "Darwin": "macos",
-        "Windows": "windows",
-    }.get(__OS_NAME__)
+    match __OS_NAME__:
+        case "Linux":
+            return "linux"
+        case "Darwin":
+            return "macos"
+        case "Windows":
+            return "windows"
+        case _:
+            return None
 
 
 def generate_headers(browser_mode: bool = False) -> Dict:
@@ -58,8 +63,10 @@ def generate_headers(browser_mode: bool = False) -> Dict:
             Browser(name="edge", min_version=130),
         ]
     )
-
-
+    if os_name:
+        return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
+    else:
+        return HeaderGenerator(browser=browsers, device="desktop").generate()
 
 
 __default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")
```
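`get_os_name` is rewritten as a `match` statement with a `Literal` return type, and `generate_headers` now omits the `os` argument entirely when the platform is unrecognized, instead of passing a falsy value through. Roughly how the call site behaves (the `browsers` list is abbreviated; browserforge's `HeaderGenerator` is called exactly as in the diff):

```python
from browserforge.headers import Browser, HeaderGenerator

browsers = [Browser(name="edge", min_version=130)]  # abbreviated list
os_name = None  # what get_os_name() returns on an unrecognized platform

if os_name:
    # Constrain generated headers to the detected OS
    headers = HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
else:
    # No os= argument: browserforge picks from all supported operating systems
    headers = HeaderGenerator(browser=browsers, device="desktop").generate()

print(headers.get("User-Agent"))
```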
```diff
--- a/scrapling/engines/toolbelt/navigation.py
+++ b/scrapling/engines/toolbelt/navigation.py
@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
 from playwright.sync_api import Route
 
 from scrapling.core.utils import log
-from scrapling.core._types import Dict,
+from scrapling.core._types import Dict, Tuple, overload, Literal
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
 
 __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
@@ -49,7 +49,15 @@ async def async_intercept_route(route: async_Route):
         await route.continue_()
 
 
-def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) ->
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[True]) -> Tuple: ...
+
+
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[False] = False) -> Dict: ...
+
+
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: bool = False) -> Dict | Tuple:
     """Validate a proxy and return it in the acceptable format for Playwright
     Reference: https://playwright.dev/python/docs/network#http-proxy
 
@@ -83,7 +91,7 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) ->
         except ValidationError as e:
             raise TypeError(f"Invalid proxy dictionary: {e}")
 
-
+    raise TypeError(f"Invalid proxy string: {proxy_string}")
 
 
 @lru_cache(10, typed=True)
```
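`construct_proxy_dict` gains `@overload` stubs so type checkers can tie the return type to the `as_tuple` flag, and the silent fall-through at the end of the function becomes an explicit `TypeError`. The typing pattern in isolation; `proxy_of` is a hypothetical stand-in, not scrapling's function:

```python
from typing import Dict, Literal, Tuple, overload


@overload
def proxy_of(server: str, as_tuple: Literal[True]) -> Tuple: ...
@overload
def proxy_of(server: str, as_tuple: Literal[False] = False) -> Dict: ...
def proxy_of(server: str, as_tuple: bool = False) -> Dict | Tuple:
    # The implementation handles both shapes; the overloads narrow them for callers
    result = {"server": server}
    return tuple(result.items()) if as_tuple else result


d = proxy_of("http://127.0.0.1:8080")                 # checkers infer Dict
t = proxy_of("http://127.0.0.1:8080", as_tuple=True)  # checkers infer Tuple
```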
```diff
--- /dev/null
+++ b/scrapling/fetchers/__init__.py
@@ -0,0 +1,46 @@
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from scrapling.fetchers.requests import Fetcher, AsyncFetcher, FetcherSession
+    from scrapling.fetchers.chrome import DynamicFetcher, DynamicSession, AsyncDynamicSession
+    from scrapling.fetchers.firefox import StealthyFetcher, StealthySession, AsyncStealthySession
+
+
+# Lazy import mapping
+_LAZY_IMPORTS = {
+    "Fetcher": ("scrapling.fetchers.requests", "Fetcher"),
+    "AsyncFetcher": ("scrapling.fetchers.requests", "AsyncFetcher"),
+    "FetcherSession": ("scrapling.fetchers.requests", "FetcherSession"),
+    "DynamicFetcher": ("scrapling.fetchers.chrome", "DynamicFetcher"),
+    "DynamicSession": ("scrapling.fetchers.chrome", "DynamicSession"),
+    "AsyncDynamicSession": ("scrapling.fetchers.chrome", "AsyncDynamicSession"),
+    "StealthyFetcher": ("scrapling.fetchers.firefox", "StealthyFetcher"),
+    "StealthySession": ("scrapling.fetchers.firefox", "StealthySession"),
+    "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
+}
+
+__all__ = [
+    "Fetcher",
+    "AsyncFetcher",
+    "FetcherSession",
+    "DynamicFetcher",
+    "DynamicSession",
+    "AsyncDynamicSession",
+    "StealthyFetcher",
+    "StealthySession",
+    "AsyncStealthySession",
+]
+
+
+def __getattr__(name: str) -> Any:
+    if name in _LAZY_IMPORTS:
+        module_path, class_name = _LAZY_IMPORTS[name]
+        module = __import__(module_path, fromlist=[class_name])
+        return getattr(module, class_name)
+    else:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def __dir__() -> list[str]:
+    """Support for dir() and autocomplete."""
+    return sorted(list(_LAZY_IMPORTS.keys()))
```
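The new `scrapling/fetchers/__init__.py` replaces the removed monolithic `fetchers.py` with a package that lazy-loads through module-level `__getattr__`/`__dir__` (PEP 562): importing the package stays cheap, and each fetcher's dependencies are only imported when the class is first touched, while the `TYPE_CHECKING` block keeps IDEs and type checkers aware of the real symbols. Expected behavior:

```python
import scrapling.fetchers

print(dir(scrapling.fetchers))         # the nine lazy names, served by __dir__
Fetcher = scrapling.fetchers.Fetcher   # first access triggers __getattr__, which imports the real module
try:
    scrapling.fetchers.NoSuchThing
except AttributeError as exc:
    print(exc)  # module 'scrapling.fetchers' has no attribute 'NoSuchThing'
```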
```diff
--- /dev/null
+++ b/scrapling/fetchers/chrome.py
@@ -0,0 +1,210 @@
+from scrapling.core._types import (
+    Callable,
+    List,
+    Dict,
+    Optional,
+    SelectorWaitStates,
+)
+from scrapling.engines.toolbelt.custom import BaseFetcher, Response
+from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
+
+
+class DynamicFetcher(BaseFetcher):
+    """A `Fetcher` class type that provide many options, all of them are based on PlayWright.
+
+    Using this Fetcher class, you can do requests with:
+        - Vanilla Playwright without any modifications other than the ones you chose.
+        - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress, but it bypasses many online tests like bot.sannysoft.com
+            Some of the things stealth mode does include:
+                1) Patches the CDP runtime fingerprint.
+                2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
+                3) Using custom flags on launch to hide Playwright even more and make it faster.
+                4) Generates real browser's headers of the same type and same user OS, then append it to the request.
+        - Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher, and most of the options can be enabled on it.
+
+    > Note that these are the main options with PlayWright, but it can be mixed.
+    """
+
+    @classmethod
+    def fetch(
+        cls,
+        url: str,
+        headless: bool = True,
+        google_search: bool = True,
+        hide_canvas: bool = False,
+        disable_webgl: bool = False,
+        real_chrome: bool = False,
+        stealth: bool = False,
+        wait: int | float = 0,
+        page_action: Optional[Callable] = None,
+        proxy: Optional[str | Dict[str, str]] = None,
+        locale: str = "en-US",
+        extra_headers: Optional[Dict[str, str]] = None,
+        useragent: Optional[str] = None,
+        cdp_url: Optional[str] = None,
+        timeout: int | float = 30000,
+        disable_resources: bool = False,
+        wait_selector: Optional[str] = None,
+        init_script: Optional[str] = None,
+        cookies: Optional[List[Dict]] = None,
+        network_idle: bool = False,
+        load_dom: bool = True,
+        wait_selector_state: SelectorWaitStates = "attached",
+        additional_args: Optional[Dict] = None,
+        custom_config: Optional[Dict] = None,
+    ) -> Response:
+        """Opens up a browser and do your request based on your chosen options below.
+
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
+        :return: A `Response` object.
+        """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        with DynamicSession(
+            wait=wait,
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            cookies=cookies,
+            headless=headless,
+            load_dom=load_dom,
+            useragent=useragent,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            init_script=init_script,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            additional_args=additional_args,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+            selector_config={**cls._generate_parser_arguments(), **custom_config},
+        ) as session:
+            return session.fetch(url)
+
+    @classmethod
+    async def async_fetch(
+        cls,
+        url: str,
+        headless: bool = True,
+        google_search: bool = True,
+        hide_canvas: bool = False,
+        disable_webgl: bool = False,
+        real_chrome: bool = False,
+        stealth: bool = False,
+        wait: int | float = 0,
+        page_action: Optional[Callable] = None,
+        proxy: Optional[str | Dict[str, str]] = None,
+        locale: str = "en-US",
+        extra_headers: Optional[Dict[str, str]] = None,
+        useragent: Optional[str] = None,
+        cdp_url: Optional[str] = None,
+        timeout: int | float = 30000,
+        disable_resources: bool = False,
+        wait_selector: Optional[str] = None,
+        init_script: Optional[str] = None,
+        cookies: Optional[List[Dict]] = None,
+        network_idle: bool = False,
+        load_dom: bool = True,
+        wait_selector_state: SelectorWaitStates = "attached",
+        additional_args: Optional[Dict] = None,
+        custom_config: Optional[Dict] = None,
+    ) -> Response:
+        """Opens up a browser and do your request based on your chosen options below.
+
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
+        :return: A `Response` object.
+        """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        async with AsyncDynamicSession(
+            wait=wait,
+            max_pages=1,
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            cookies=cookies,
+            headless=headless,
+            load_dom=load_dom,
+            useragent=useragent,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            init_script=init_script,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            additional_args=additional_args,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+            selector_config={**cls._generate_parser_arguments(), **custom_config},
+        ) as session:
+            return await session.fetch(url)
+
+
+PlayWrightFetcher = DynamicFetcher  # For backward-compatibility
```