scrapling 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +3 -0
- scrapling/core/ai.py +2 -1
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +4 -3
- scrapling/core/storage.py +5 -5
- scrapling/core/translator.py +13 -8
- scrapling/engines/_browsers/_base.py +37 -14
- scrapling/engines/_browsers/_camoufox.py +76 -35
- scrapling/engines/_browsers/_config_tools.py +1 -1
- scrapling/engines/_browsers/_controllers.py +32 -11
- scrapling/engines/_browsers/_validators.py +31 -10
- scrapling/engines/static.py +678 -668
- scrapling/engines/toolbelt/convertor.py +13 -15
- scrapling/engines/toolbelt/custom.py +6 -9
- scrapling/engines/toolbelt/fingerprints.py +17 -10
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +11 -1
- scrapling/fetchers/chrome.py +9 -4
- scrapling/fetchers/firefox.py +0 -4
- scrapling/parser.py +105 -80
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/METADATA +3 -4
- scrapling-0.3.7.dist-info/RECORD +47 -0
- scrapling-0.3.6.dist-info/RECORD +0 -47
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
scrapling/engines/toolbelt/convertor.py
CHANGED
@@ -24,15 +24,15 @@ class ResponseFactory:
 
     @classmethod
     @lru_cache(maxsize=16)
-    def __extract_browser_encoding(cls, content_type: str | None) -> str | None:
+    def __extract_browser_encoding(cls, content_type: str | None, default: str = "utf-8") -> str:
         """Extract browser encoding from headers.
         Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8"
         """
         if content_type:
             # Because Playwright can't do that by themselves like all libraries for some reason :3
             match = __CHARSET_RE__.search(content_type)
-            return match.group(1) if match else None
-        return None
+            return match.group(1) if match else default
+        return default
 
     @classmethod
     def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
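The new `default` parameter removes the `str | None` return type, so call sites no longer need an `or "utf-8"` fallback. A minimal sketch of the behavior, assuming a charset regex equivalent to the module's `__CHARSET_RE__` (which this diff does not show):

```python
import re
from functools import lru_cache

# Stand-in for scrapling's __CHARSET_RE__ (an assumption; the real pattern isn't in this diff)
CHARSET_RE = re.compile(r"charset=([\w-]+)", re.I)


@lru_cache(maxsize=16)
def extract_browser_encoding(content_type: str | None, default: str = "utf-8") -> str:
    # Always returns a string now; `default` replaces the old None returns
    if content_type:
        match = CHARSET_RE.search(content_type)
        return match.group(1) if match else default
    return default


assert extract_browser_encoding("text/html; charset=ISO-8859-1") == "ISO-8859-1"
assert extract_browser_encoding(None) == "utf-8"
```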
@@ -58,7 +58,8 @@ class ResponseFactory:
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                or "utf-8",
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": current_response.all_headers() if current_response else {},
                 "request_headers": current_request.all_headers(),
@@ -107,15 +108,13 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")
 
-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        )  # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
 
         history = cls._process_response_history(first_response, parser_arguments)
         try:
-            page_content = page.content()
+            page_content = final_response.text()
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content: {e}")
             page_content = ""
@@ -161,7 +160,8 @@ class ResponseFactory:
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                or "utf-8",
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": await current_response.all_headers() if current_response else {},
                 "request_headers": await current_request.all_headers(),
@@ -210,15 +210,13 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")
 
-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        )  # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
 
         history = await cls._async_process_response_history(first_response, parser_arguments)
         try:
-            page_content = await page.content()
+            page_content = await final_response.text()
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content in async: {e}")
             page_content = ""
@@ -255,8 +253,8 @@ class ResponseFactory:
             "encoding": response.encoding or "utf-8",
             "cookies": dict(response.cookies),
             "headers": dict(response.headers),
-            "request_headers": dict(response.request.headers),
-            "method": response.request.method,
+            "request_headers": dict(response.request.headers) if response.request else {},
+            "method": response.request.method if response.request else "GET",
             "history": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
             **parser_arguments,
         }
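The convertor.py hunks above all apply the same guard: attributes of a possibly-missing object are only read behind a conditional expression, with a safe default otherwise. A hedged sketch of the pattern (class names are illustrative, not scrapling's):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeRequest:
    headers: dict
    method: str


@dataclass
class FakeResponse:
    # Illustrative: some transports can hand back a response with no request attached
    request: Optional[FakeRequest] = None


def summarize(response: FakeResponse) -> dict:
    # The condition is evaluated before the attribute access, so None never raises
    return {
        "request_headers": dict(response.request.headers) if response.request else {},
        "method": response.request.method if response.request else "GET",
    }


assert summarize(FakeResponse()) == {"request_headers": {}, "method": "GET"}
```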
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -8,6 +8,7 @@ from scrapling.core.utils import log
 from scrapling.core._types import (
     Any,
     Dict,
+    cast,
     List,
     Optional,
     Tuple,
@@ -30,10 +31,10 @@ class Response(Selector):
         request_headers: Dict,
         encoding: str = "utf-8",
         method: str = "GET",
-        history: List = None,
-        **selector_config: Dict,
+        history: List | None = None,
+        **selector_config: Any,
     ):
-        adaptive_domain = selector_config.pop("adaptive_domain", None)
+        adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
         self.status = status
         self.reason = reason
         self.cookies = cookies
@@ -58,7 +59,7 @@ class BaseFetcher:
     keep_cdata: Optional[bool] = False
     storage_args: Optional[Dict] = None
     keep_comments: Optional[bool] = False
-    adaptive_domain: Optional[str] = None
+    adaptive_domain: str = ""
     parser_keywords: Tuple = (
         "huge_tree",
         "adaptive",
@@ -124,12 +125,8 @@ class BaseFetcher:
             adaptive=cls.adaptive,
             storage=cls.storage,
             storage_args=cls.storage_args,
+            adaptive_domain=cls.adaptive_domain,
         )
-        if cls.adaptive_domain:
-            if not isinstance(cls.adaptive_domain, str):
-                log.warning('[Ignored] The argument "adaptive_domain" must be of string type')
-            else:
-                parser_arguments.update({"adaptive_domain": cls.adaptive_domain})
 
         return parser_arguments
 
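Note that `cast` only narrows the type for checkers; it does not convert or validate anything at runtime, which is why the attribute default also became a real `str` (`""`). A quick illustration of that contract:

```python
from typing import Any, cast


def pop_domain(**selector_config: Any) -> str:
    # cast() is a no-op at runtime; it just tells the type checker to treat
    # the popped value as str. The "" default keeps that promise honest.
    return cast(str, selector_config.pop("adaptive_domain", ""))


assert pop_domain(adaptive_domain="example.com") == "example.com"
assert pop_domain() == ""
```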
scrapling/engines/toolbelt/fingerprints.py
CHANGED
@@ -8,9 +8,10 @@ from platform import system as platform_system
 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator
 
-from scrapling.core._types import Dict, Optional
+from scrapling.core._types import Dict, Literal
 
 __OS_NAME__ = platform_system()
+OSName = Literal["linux", "macos", "windows"]
 
 
 @lru_cache(10, typed=True)
@@ -28,16 +29,20 @@ def generate_convincing_referer(url: str) -> str:
 
 
 @lru_cache(1, typed=True)
-def get_os_name() -> Optional[str]:
-    """Get the current OS name in the same format needed for browserforge
+def get_os_name() -> OSName | None:
+    """Get the current OS name in the same format needed for browserforge, if the OS is Unknown, return None so browserforge uses all.
 
     :return: Current OS name or `None` otherwise
     """
-    return {
-        "Linux": "linux",
-        "Darwin": "macos",
-        "Windows": "windows",
-    }.get(__OS_NAME__)
+    match __OS_NAME__:
+        case "Linux":
+            return "linux"
+        case "Darwin":
+            return "macos"
+        case "Windows":
+            return "windows"
+        case _:
+            return None
 
 
 def generate_headers(browser_mode: bool = False) -> Dict:
@@ -58,8 +63,10 @@ def generate_headers(browser_mode: bool = False) -> Dict:
             Browser(name="edge", min_version=130),
         ]
     )
-
-    return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
+    if os_name:
+        return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
+    else:
+        return HeaderGenerator(browser=browsers, device="desktop").generate()
 
 
 __default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")
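The `Literal` alias plus the `match` statement (Python 3.10+) gives type checkers a closed set of return values, and `generate_headers` now omits the `os` argument entirely when the platform is unknown. A sketch of how a caller can rely on that, assuming the same `OSName` alias:

```python
from platform import system as platform_system
from typing import Literal

OSName = Literal["linux", "macos", "windows"]


def get_os_name() -> OSName | None:
    # Unknown platforms fall through to None instead of an arbitrary string
    match platform_system():
        case "Linux":
            return "linux"
        case "Darwin":
            return "macos"
        case "Windows":
            return "windows"
        case _:
            return None


# Callers branch on None exactly like the new generate_headers() does
os_name = get_os_name()
kwargs = {"os": os_name} if os_name else {}
```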
scrapling/engines/toolbelt/navigation.py
CHANGED
@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
 from playwright.sync_api import Route
 
 from scrapling.core.utils import log
-from scrapling.core._types import Dict, Tuple
+from scrapling.core._types import Dict, Tuple, overload, Literal
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
 
 __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
@@ -49,7 +49,15 @@ async def async_intercept_route(route: async_Route):
         await route.continue_()
 
 
-def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) -> Dict | Tuple:
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[True]) -> Tuple: ...
+
+
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[False] = False) -> Dict: ...
+
+
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: bool = False) -> Dict | Tuple:
     """Validate a proxy and return it in the acceptable format for Playwright
     Reference: https://playwright.dev/python/docs/network#http-proxy
 
@@ -83,7 +91,7 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) -> Dict | Tuple:
         except ValidationError as e:
             raise TypeError(f"Invalid proxy dictionary: {e}")
 
-    return {}
+    raise TypeError(f"Invalid proxy string: {proxy_string}")
 
 
 @lru_cache(10, typed=True)
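The two `@overload` stubs let type checkers narrow the return type from the literal value of `as_tuple`, while a single runtime implementation does the work; invalid proxy strings now also fail loudly instead of falling through silently. A self-contained sketch of the same pattern (the function below is illustrative, not scrapling's validator):

```python
from typing import Dict, Literal, Tuple, overload


@overload
def normalize_proxy(proxy: str, as_tuple: Literal[True]) -> Tuple: ...
@overload
def normalize_proxy(proxy: str, as_tuple: Literal[False] = False) -> Dict: ...


def normalize_proxy(proxy: str, as_tuple: bool = False) -> Dict | Tuple:
    # Only this body exists at runtime; the stubs above are type-checker-only
    if "://" not in proxy:
        raise TypeError(f"Invalid proxy string: {proxy}")
    result = {"server": proxy}
    return tuple(result.items()) if as_tuple else result


d = normalize_proxy("http://127.0.0.1:8080")        # checkers infer Dict
t = normalize_proxy("http://127.0.0.1:8080", True)  # checkers infer Tuple
```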
scrapling/fetchers/__init__.py
CHANGED
@@ -19,7 +19,17 @@ _LAZY_IMPORTS = {
     "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
 }
 
-__all__ = [
+__all__ = [
+    "Fetcher",
+    "AsyncFetcher",
+    "FetcherSession",
+    "DynamicFetcher",
+    "DynamicSession",
+    "AsyncDynamicSession",
+    "StealthyFetcher",
+    "StealthySession",
+    "AsyncStealthySession",
+]
 
 
 def __getattr__(name: str) -> Any:
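Because `__all__` entries are resolved through `_LAZY_IMPORTS` and the module-level `__getattr__` (PEP 562), listing the session classes costs nothing until they are first accessed. A minimal sketch of the mechanism; the mapping below shows one entry from the diff, not scrapling's full table:

```python
from importlib import import_module
from typing import Any

_LAZY_IMPORTS = {
    "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
}


def __getattr__(name: str) -> Any:
    # Called only when `name` is not already in the module namespace (PEP 562)
    if name in _LAZY_IMPORTS:
        module_path, attr_name = _LAZY_IMPORTS[name]
        return getattr(import_module(module_path), attr_name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```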
scrapling/fetchers/chrome.py
CHANGED
@@ -1,10 +1,9 @@
 from scrapling.core._types import (
     Callable,
-    Dict,
     List,
+    Dict,
     Optional,
     SelectorWaitStates,
-    Iterable,
 )
 from scrapling.engines.toolbelt.custom import BaseFetcher, Response
 from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
@@ -47,10 +46,11 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[Iterable[Dict]] = None,
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        additional_args: Optional[Dict] = None,
         custom_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
@@ -80,6 +80,7 @@ class DynamicFetcher(BaseFetcher):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         :return: A `Response` object.
         """
         if not custom_config:
@@ -107,6 +108,7 @@ class DynamicFetcher(BaseFetcher):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
             selector_config={**cls._generate_parser_arguments(), **custom_config},
@@ -134,10 +136,11 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[Iterable[Dict]] = None,
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        additional_args: Optional[Dict] = None,
         custom_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
@@ -167,6 +170,7 @@ class DynamicFetcher(BaseFetcher):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         :return: A `Response` object.
         """
         if not custom_config:
@@ -195,6 +199,7 @@ class DynamicFetcher(BaseFetcher):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
             selector_config={**cls._generate_parser_arguments(), **custom_config},
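Taken together, the chrome.py changes tighten `cookies` to a concrete `List[Dict]` and let callers forward arbitrary Playwright context options through `additional_args`. A hedged usage sketch; the cookie shape follows Playwright's `add_cookies()`, and `ignore_https_errors` is just one example context option, not something this diff prescribes:

```python
from scrapling.fetchers import DynamicFetcher

page = DynamicFetcher.fetch(
    "https://example.com",
    cookies=[{"name": "session", "value": "abc123", "url": "https://example.com"}],
    # Forwarded to Playwright's context and, per the new docstring, takes
    # priority over Scrapling's own context settings on conflict
    additional_args={"ignore_https_errors": True},
)
print(page.status)
```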
scrapling/fetchers/firefox.py
CHANGED
@@ -83,8 +83,6 @@ class StealthyFetcher(BaseFetcher):
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
         with StealthySession(
             wait=wait,
@@ -182,8 +180,6 @@ class StealthyFetcher(BaseFetcher):
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
         async with AsyncStealthySession(
             wait=wait,