scrapling 0.2.98__py3-none-any.whl → 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +745 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +630 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +150 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +205 -186
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +255 -260
- scrapling/parser.py +781 -449
- scrapling-0.3.dist-info/METADATA +409 -0
- scrapling-0.3.dist-info/RECORD +41 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -19
- scrapling/engines/camo.py +0 -299
- scrapling/engines/pw.py +0 -428
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.98.dist-info/METADATA +0 -867
- scrapling-0.2.98.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -95
- tests/fetchers/async/test_httpx.py +0 -83
- tests/fetchers/async/test_playwright.py +0 -99
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -68
- tests/fetchers/sync/test_httpx.py +0 -82
- tests/fetchers/sync/test_playwright.py +0 -87
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info/licenses}/LICENSE +0 -0
scrapling/engines/toolbelt/navigation.py
@@ -1,74 +1,97 @@
 """
 Functions related to files and URLs
 """
-
+
+from pathlib import Path
+from functools import lru_cache
 from urllib.parse import urlencode, urlparse
 
 from playwright.async_api import Route as async_Route
+from msgspec import Struct, structs, convert, ValidationError
 from playwright.sync_api import Route
 
-from scrapling.core.
-from scrapling.core.
+from scrapling.core.utils import log
+from scrapling.core._types import Dict, Optional, Tuple
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
 
+__BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
+
+
+class ProxyDict(Struct):
+    server: str
+    username: str = ""
+    password: str = ""
+
 
 def intercept_route(route: Route):
-    """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
+    """This is just a route handler, but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
 
     :param route: PlayWright `Route` object of the current page
     :return: PlayWright `Route` object
     """
     if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
-        log.debug(
+        log.debug(
+            f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
+        )
         route.abort()
     else:
         route.continue_()
 
 
 async def async_intercept_route(route: async_Route):
-    """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
+    """This is just a route handler, but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
 
     :param route: PlayWright `Route` object of the current page
     :return: PlayWright `Route` object
     """
     if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
-        log.debug(
+        log.debug(
+            f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
+        )
         await route.abort()
     else:
         await route.continue_()
 
 
-def construct_proxy_dict(
+def construct_proxy_dict(
+    proxy_string: str | Dict[str, str], as_tuple=False
+) -> Optional[Dict | Tuple]:
     """Validate a proxy and return it in the acceptable format for Playwright
     Reference: https://playwright.dev/python/docs/network#http-proxy
 
     :param proxy_string: A string or a dictionary representation of the proxy.
+    :param as_tuple: Return the proxy dictionary as a tuple to be cachable
     :return:
     """
-    if proxy_string:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if isinstance(proxy_string, str):
+        proxy = urlparse(proxy_string)
+        if (
+            proxy.scheme not in ("http", "https", "socks4", "socks5")
+            or not proxy.hostname
+        ):
+            raise ValueError("Invalid proxy string!")
+
+        try:
+            result = {
+                "server": f"{proxy.scheme}://{proxy.hostname}",
+                "username": proxy.username or "",
+                "password": proxy.password or "",
+            }
+            if proxy.port:
+                result["server"] += f":{proxy.port}"
+            return tuple(result.items()) if as_tuple else result
+        except ValueError:
+            # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
+            raise ValueError("The proxy argument's string is in invalid format!")
+
+    elif isinstance(proxy_string, dict):
+        try:
+            validated = convert(proxy_string, ProxyDict)
+            result_dict = structs.asdict(validated)
+            return tuple(result_dict.items()) if as_tuple else result_dict
+        except ValidationError as e:
+            raise TypeError(f"Invalid proxy dictionary: {e}")
+
     return None
 
 
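The hunk above replaces the old `construct_proxy_dict` body with explicit string/dict handling backed by the new msgspec `ProxyDict` struct. A minimal usage sketch, assuming the helper stays importable from `scrapling.engines.toolbelt.navigation` and behaves as the added lines suggest (the expected outputs are inferred from the diff, not verified):

# Sketch only - the import path and expected results are assumptions based on the diff above.
from scrapling.engines.toolbelt.navigation import construct_proxy_dict

# A proxy URL string is parsed and normalized into Playwright's proxy mapping
proxy = construct_proxy_dict("socks5://user:pass@127.0.0.1:9050")
# expected: {"server": "socks5://127.0.0.1:9050", "username": "user", "password": "pass"}

# A dict input is validated against the msgspec ProxyDict struct, with empty-string defaults
proxy = construct_proxy_dict({"server": "http://proxy.local:8080"})
# expected: {"server": "http://proxy.local:8080", "username": "", "password": ""}

# as_tuple=True returns the same mapping as a tuple of items, so the result can serve as a cache key
cache_key = construct_proxy_dict("http://proxy.local:8080", as_tuple=True)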
@@ -84,17 +107,24 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
     parsed = urlparse(cdp_url)
 
     # Check scheme
-    if parsed.scheme not in (
+    if parsed.scheme not in ("ws", "wss"):
         raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
 
     # Validate hostname and port
     if not parsed.netloc:
         raise ValueError("Invalid hostname for the CDP URL")
 
-
+    try:
+        # Checking if the port is valid (if available)
+        _ = parsed.port
+    except ValueError:
+        # urlparse will raise `ValueError` if the port can't be casted to integer
+        raise ValueError("Invalid port for the CDP URL")
+
+    # Ensure the path starts with /
     path = parsed.path
-    if not path.startswith(
-        path =
+    if not path.startswith("/"):
+        path = "/" + path
 
     # Reconstruct the base URL with validated parts
     validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"
@@ -112,10 +142,9 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
 
 @lru_cache(10, typed=True)
 def js_bypass_path(filename: str) -> str:
-    """Takes the base filename of JS file inside the `bypasses` folder then return the full path of it
+    """Takes the base filename of a JS file inside the `bypasses` folder, then return the full path of it
 
     :param filename: The base filename of the JS file.
     :return: The full path of the JS file.
     """
-
-    return os.path.join(current_directory, 'bypasses', filename)
+    return str(__BYPASSES_DIR__ / filename)