scrapling-0.3.6-py3-none-any.whl → scrapling-0.3.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +3 -0
- scrapling/core/ai.py +2 -1
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +4 -3
- scrapling/core/storage.py +5 -5
- scrapling/core/translator.py +13 -8
- scrapling/engines/_browsers/_base.py +175 -21
- scrapling/engines/_browsers/_camoufox.py +95 -171
- scrapling/engines/_browsers/_config_tools.py +9 -3
- scrapling/engines/_browsers/_controllers.py +51 -101
- scrapling/engines/_browsers/_validators.py +95 -63
- scrapling/engines/static.py +678 -668
- scrapling/engines/toolbelt/convertor.py +48 -15
- scrapling/engines/toolbelt/custom.py +6 -21
- scrapling/engines/toolbelt/fingerprints.py +14 -9
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +11 -1
- scrapling/fetchers/chrome.py +15 -4
- scrapling/fetchers/firefox.py +0 -4
- scrapling/parser.py +105 -80
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/METADATA +7 -6
- scrapling-0.3.8.dist-info/RECORD +47 -0
- scrapling-0.3.6.dist-info/RECORD +0 -47
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/WHEEL +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/top_level.txt +0 -0

scrapling/engines/toolbelt/convertor.py
CHANGED

@@ -2,6 +2,7 @@ from functools import lru_cache
 from re import compile as re_compile
 
 from curl_cffi.requests import Response as CurlResponse
+from playwright._impl._errors import Error as PlaywrightError
 from playwright.sync_api import Page as SyncPage, Response as SyncResponse
 from playwright.async_api import Page as AsyncPage, Response as AsyncResponse
 
@@ -24,15 +25,15 @@ class ResponseFactory:
 
     @classmethod
     @lru_cache(maxsize=16)
-    def __extract_browser_encoding(cls, content_type: str | None) -> Optional[str]:
+    def __extract_browser_encoding(cls, content_type: str | None, default: str = "utf-8") -> str:
        """Extract browser encoding from headers.
        Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8
        """
        if content_type:
            # Because Playwright can't do that by themselves like all libraries for some reason :3
            match = __CHARSET_RE__.search(content_type)
-            return match.group(1) if match else None
-        return None
+            return match.group(1) if match else default
+        return default
 
     @classmethod
     def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
@@ -58,7 +59,8 @@ class ResponseFactory:
                     "encoding": cls.__extract_browser_encoding(
                         current_response.headers.get("content-type", "")
                     )
-                    or "utf-8",
+                    if current_response
+                    else "utf-8",
                     "cookies": tuple(),
                     "headers": current_response.all_headers() if current_response else {},
                     "request_headers": current_request.all_headers(),
@@ -83,6 +85,7 @@ class ResponseFactory:
         first_response: SyncResponse,
         final_response: Optional[SyncResponse],
         parser_arguments: Dict,
+        automated_page: bool = False,
     ) -> Response:
         """
         Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -98,6 +101,7 @@ class ResponseFactory:
         :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
         :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
             the `Response` object.
+        :param automated_page: If True, it means the `page_action` argument was being used, so the response retrieving method changes to use Playwright's page instead of the final response.
 
         :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
         :rtype: Response
@@ -107,15 +111,13 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")
 
-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        )  # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
 
         history = cls._process_response_history(first_response, parser_arguments)
         try:
-            page_content = page.content()
+            page_content = final_response.text() if not automated_page else cls._get_page_content(page)
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content: {e}")
             page_content = ""
@@ -161,7 +163,8 @@ class ResponseFactory:
                     "encoding": cls.__extract_browser_encoding(
                         current_response.headers.get("content-type", "")
                     )
-                    or "utf-8",
+                    if current_response
+                    else "utf-8",
                     "cookies": tuple(),
                     "headers": await current_response.all_headers() if current_response else {},
                     "request_headers": await current_request.all_headers(),
@@ -179,6 +182,36 @@ class ResponseFactory:
 
         return history
 
+    @classmethod
+    def _get_page_content(cls, page: SyncPage) -> str:
+        """
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return page.content() or ""
+            except PlaywrightError:
+                page.wait_for_timeout(500)
+                continue
+        return ""  # pyright: ignore
+
+    @classmethod
+    async def _get_async_page_content(cls, page: AsyncPage) -> str:
+        """
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return (await page.content()) or ""
+            except PlaywrightError:
+                await page.wait_for_timeout(500)
+                continue
+        return ""  # pyright: ignore
+
     @classmethod
     async def from_async_playwright_response(
         cls,
@@ -186,6 +219,7 @@ class ResponseFactory:
         first_response: AsyncResponse,
         final_response: Optional[AsyncResponse],
         parser_arguments: Dict,
+        automated_page: bool = False,
     ) -> Response:
         """
         Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -201,6 +235,7 @@ class ResponseFactory:
         :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
         :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
             the `Response` object.
+        :param automated_page: If True, it means the `page_action` argument was being used, so the response retrieving method changes to use Playwright's page instead of the final response.
 
         :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
         :rtype: Response
@@ -210,15 +245,13 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")
 
-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        )  # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
 
         history = await cls._async_process_response_history(first_response, parser_arguments)
         try:
-            page_content = await page.content()
+            page_content = await (final_response.text() if not automated_page else cls._get_async_page_content(page))
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content in async: {e}")
             page_content = ""
@@ -255,8 +288,8 @@ class ResponseFactory:
             "encoding": response.encoding or "utf-8",
             "cookies": dict(response.cookies),
             "headers": dict(response.headers),
-            "request_headers": dict(response.request.headers),
-            "method": response.request.method,
+            "request_headers": dict(response.request.headers) if response.request else {},
+            "method": response.request.method if response.request else "GET",
             "history": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
             **parser_arguments,
         }
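
The two `_get_page_content` helpers added above retry `page.content()` because, per the linked Playwright issue, the call can raise while a page is mid-navigation on Windows. A minimal standalone sketch of the same pattern, with a bounded attempt count added as a safety assumption (the scrapling helpers retry indefinitely):

    # A sketch of the retry-on-navigation pattern, assuming Playwright is installed.
    from playwright.sync_api import Error as PlaywrightError, Page

    def get_content_with_retries(page: Page, attempts: int = 10) -> str:
        for _ in range(attempts):
            try:
                # content() can raise mid-navigation on Windows (issue #16108)
                return page.content() or ""
            except PlaywrightError:
                page.wait_for_timeout(500)
        return ""

Note that the trailing `return ""` in the diffed helpers is unreachable after `while True`, which is why it carries a `# pyright: ignore` marker.
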
scrapling/engines/toolbelt/custom.py
CHANGED

@@ -8,6 +8,7 @@ from scrapling.core.utils import log
 from scrapling.core._types import (
     Any,
     Dict,
+    cast,
     List,
     Optional,
     Tuple,
@@ -30,10 +31,10 @@ class Response(Selector):
         request_headers: Dict,
         encoding: str = "utf-8",
         method: str = "GET",
-        history: List = None,
-        **selector_config: Dict,
+        history: List | None = None,
+        **selector_config: Any,
     ):
-        adaptive_domain = selector_config.pop("adaptive_domain", None)
+        adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
         self.status = status
         self.reason = reason
         self.cookies = cookies
@@ -58,7 +59,7 @@ class BaseFetcher:
     keep_cdata: Optional[bool] = False
     storage_args: Optional[Dict] = None
     keep_comments: Optional[bool] = False
-    adaptive_domain: Optional[str] = None
+    adaptive_domain: str = ""
     parser_keywords: Tuple = (
         "huge_tree",
         "adaptive",
@@ -124,12 +125,8 @@ class BaseFetcher:
             adaptive=cls.adaptive,
             storage=cls.storage,
             storage_args=cls.storage_args,
+            adaptive_domain=cls.adaptive_domain,
         )
-        if cls.adaptive_domain:
-            if not isinstance(cls.adaptive_domain, str):
-                log.warning('[Ignored] The argument "adaptive_domain" must be of string type')
-            else:
-                parser_arguments.update({"adaptive_domain": cls.adaptive_domain})
 
         return parser_arguments
 
@@ -212,15 +209,3 @@ class StatusText:
     def get(cls, status_code: int) -> str:
         """Get the phrase for a given HTTP status code."""
         return cls._phrases.get(status_code, "Unknown Status Code")
-
-
-def get_variable_name(var: Any) -> Optional[str]:
-    """Get the name of a variable using global and local scopes.
-    :param var: The variable to find the name for
-    :return: The name of the variable if found, None otherwise
-    """
-    for scope in [globals(), locals()]:
-        for name, value in scope.items():
-            if value is var:
-                return name
-    return None
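
Two typing fixes drive these hunks: `history: List = None` lies to strict type checkers since `None` is not a `List`, and `adaptive_domain` now carries a real `str` default instead of a runtime `isinstance` guard. A small illustrative sketch of both idioms, with hypothetical names:

    # Illustrative only; `record` and `domain` are hypothetical names.
    from typing import Any, List, cast

    def record(history: List | None = None, **config: Any) -> List:
        # Annotate the optional default honestly instead of `List = None`,
        # and cast a popped config value to the type we actually expect.
        domain: str = cast(str, config.pop("domain", ""))
        return (history or []) + [domain]
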
scrapling/engines/toolbelt/fingerprints.py
CHANGED

@@ -7,10 +7,12 @@ from platform import system as platform_system
 
 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator
+from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS
 
-from scrapling.core._types import Dict, Optional
+from scrapling.core._types import Dict, Literal, Tuple
 
 __OS_NAME__ = platform_system()
+OSName = Literal["linux", "macos", "windows"]
 
 
 @lru_cache(10, typed=True)
@@ -28,16 +30,20 @@ def generate_convincing_referer(url: str) -> str:
 
 
 @lru_cache(1, typed=True)
-def get_os_name() -> Optional[str]:
-    """Get the current OS name in the same format needed for browserforge
+def get_os_name() -> OSName | Tuple:
+    """Get the current OS name in the same format needed for browserforge, if the OS is Unknown, return None so browserforge uses all.
 
     :return: Current OS name or `None` otherwise
     """
-    return {
-        "Linux": "linux",
-        "Darwin": "macos",
-        "Windows": "windows",
-    }.get(__OS_NAME__)
+    match __OS_NAME__:  # pragma: no cover
+        case "Linux":
+            return "linux"
+        case "Darwin":
+            return "macos"
+        case "Windows":
+            return "windows"
+        case _:
+            return SUPPORTED_OPERATING_SYSTEMS
 
 
 def generate_headers(browser_mode: bool = False) -> Dict:
@@ -58,7 +64,6 @@ def generate_headers(browser_mode: bool = False) -> Dict:
             Browser(name="edge", min_version=130),
         ]
     )
-
     return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
 
 
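
With the new fallback, an unrecognized platform yields browserforge's full `SUPPORTED_OPERATING_SYSTEMS` tuple instead of a single name, and `HeaderGenerator` accepts either form for its `os` argument. A usage sketch of the same call shape, assuming `browserforge` is installed:

    from browserforge.headers import Browser, HeaderGenerator

    # `os` accepts a single name like "linux" or a collection to sample from,
    # which is why the unknown-OS fallback can return the whole tuple.
    headers = HeaderGenerator(
        browser=[Browser(name="chrome", min_version=130)],
        os=("linux", "macos", "windows"),
        device="desktop",
    ).generate()
    print(headers.get("User-Agent"))
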
scrapling/engines/toolbelt/navigation.py
CHANGED

@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
 from playwright.sync_api import Route
 
 from scrapling.core.utils import log
-from scrapling.core._types import Dict, Tuple
+from scrapling.core._types import Dict, Tuple, overload, Literal
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
 
 __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
@@ -49,7 +49,15 @@ async def async_intercept_route(route: async_Route):
         await route.continue_()
 
 
-def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) -> Dict | Tuple:
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[True]) -> Tuple: ...
+
+
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[False] = False) -> Dict: ...
+
+
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: bool = False) -> Dict | Tuple:
     """Validate a proxy and return it in the acceptable format for Playwright
     Reference: https://playwright.dev/python/docs/network#http-proxy
 
@@ -83,7 +91,7 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) ->
     except ValidationError as e:
         raise TypeError(f"Invalid proxy dictionary: {e}")
 
-
+    raise TypeError(f"Invalid proxy string: {proxy_string}")
 
 
 @lru_cache(10, typed=True)
scrapling/fetchers/__init__.py
CHANGED

@@ -19,7 +19,17 @@ _LAZY_IMPORTS = {
     "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
 }
 
-__all__ = ["Fetcher", "AsyncFetcher", "FetcherSession", "DynamicFetcher", "DynamicSession", "AsyncDynamicSession", "StealthyFetcher", "StealthySession", "AsyncStealthySession"]
+__all__ = [
+    "Fetcher",
+    "AsyncFetcher",
+    "FetcherSession",
+    "DynamicFetcher",
+    "DynamicSession",
+    "AsyncDynamicSession",
+    "StealthyFetcher",
+    "StealthySession",
+    "AsyncStealthySession",
+]
 
 
 def __getattr__(name: str) -> Any:
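
The `_LAZY_IMPORTS` table together with a module-level `__getattr__` is the PEP 562 lazy-import pattern: a fetcher's module is imported only on first attribute access. A minimal sketch of how such a resolver typically works (illustrative, not the exact scrapling body):

    # Illustrative resolver for a package __init__.py.
    from importlib import import_module
    from typing import Any

    _LAZY_IMPORTS = {
        "StealthyFetcher": ("scrapling.fetchers.firefox", "StealthyFetcher"),
    }

    def __getattr__(name: str) -> Any:
        # Called only when `name` is missing from module globals (PEP 562),
        # so the heavy submodule is imported on first access.
        if name in _LAZY_IMPORTS:
            module_path, attribute = _LAZY_IMPORTS[name]
            return getattr(import_module(module_path), attribute)
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
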
scrapling/fetchers/chrome.py
CHANGED

@@ -1,10 +1,9 @@
 from scrapling.core._types import (
     Callable,
-    Dict,
     List,
+    Dict,
     Optional,
     SelectorWaitStates,
-    Iterable,
 )
 from scrapling.engines.toolbelt.custom import BaseFetcher, Response
 from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
@@ -47,10 +46,12 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[Iterable[Dict]] = None,
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        extra_flags: Optional[List[str]] = None,
+        additional_args: Optional[Dict] = None,
         custom_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
@@ -79,7 +80,9 @@ class DynamicFetcher(BaseFetcher):
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
         :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         :return: A `Response` object.
         """
         if not custom_config:
@@ -107,6 +110,8 @@ class DynamicFetcher(BaseFetcher):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            extra_flags=extra_flags,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
             selector_config={**cls._generate_parser_arguments(), **custom_config},
@@ -134,10 +139,12 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[Iterable[Dict]] = None,
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        extra_flags: Optional[List[str]] = None,
+        additional_args: Optional[Dict] = None,
         custom_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
@@ -166,7 +173,9 @@ class DynamicFetcher(BaseFetcher):
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
         :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         :return: A `Response` object.
         """
         if not custom_config:
@@ -195,6 +204,8 @@ class DynamicFetcher(BaseFetcher):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            extra_flags=extra_flags,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
             selector_config={**cls._generate_parser_arguments(), **custom_config},
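
The two new arguments flow from the fetcher into the session: `extra_flags` reaches the browser launch command and `additional_args` is merged into Playwright's context options with priority over Scrapling's own settings. An illustrative call, assuming the usual `DynamicFetcher.fetch` entry point:

    # Illustrative; assumes the usual `DynamicFetcher.fetch` entry point.
    from scrapling.fetchers import DynamicFetcher

    response = DynamicFetcher.fetch(
        "https://example.com",
        extra_flags=["--disable-gpu"],        # extra browser launch flags
        additional_args={"locale": "en-US"},  # extra Playwright context options
    )
    print(response.status)
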
scrapling/fetchers/firefox.py
CHANGED

@@ -83,8 +83,6 @@ class StealthyFetcher(BaseFetcher):
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
         with StealthySession(
             wait=wait,
@@ -182,8 +180,6 @@ class StealthyFetcher(BaseFetcher):
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
         async with AsyncStealthySession(
             wait=wait,