scrapling 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +29 -19
- scrapling/cli.py +25 -8
- scrapling/core/_types.py +0 -2
- scrapling/core/ai.py +22 -14
- scrapling/core/custom_types.py +2 -2
- scrapling/core/shell.py +6 -5
- scrapling/core/storage.py +2 -1
- scrapling/core/utils/__init__.py +0 -1
- scrapling/engines/_browsers/__init__.py +0 -2
- scrapling/engines/_browsers/_base.py +11 -36
- scrapling/engines/_browsers/_camoufox.py +75 -60
- scrapling/engines/_browsers/_controllers.py +43 -52
- scrapling/engines/_browsers/_page.py +1 -42
- scrapling/engines/_browsers/_validators.py +130 -65
- scrapling/engines/constants.py +0 -15
- scrapling/engines/static.py +417 -16
- scrapling/engines/toolbelt/navigation.py +1 -1
- scrapling/fetchers/__init__.py +36 -0
- scrapling/fetchers/chrome.py +205 -0
- scrapling/fetchers/firefox.py +216 -0
- scrapling/fetchers/requests.py +28 -0
- scrapling/parser.py +7 -7
- {scrapling-0.3.4.dist-info → scrapling-0.3.6.dist-info}/METADATA +25 -23
- scrapling-0.3.6.dist-info/RECORD +47 -0
- scrapling/fetchers.py +0 -444
- scrapling-0.3.4.dist-info/RECORD +0 -44
- {scrapling-0.3.4.dist-info → scrapling-0.3.6.dist-info}/WHEEL +0 -0
- {scrapling-0.3.4.dist-info → scrapling-0.3.6.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.4.dist-info → scrapling-0.3.6.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.4.dist-info → scrapling-0.3.6.dist-info}/top_level.txt +0 -0
@@ -1,21 +1,69 @@
|
|
1
|
-
from msgspec import Struct, convert, ValidationError
|
2
|
-
from urllib.parse import urlparse
|
3
1
|
from pathlib import Path
|
2
|
+
from typing import Annotated
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from urllib.parse import urlparse
|
5
|
+
|
6
|
+
from msgspec import Struct, Meta, convert, ValidationError
|
4
7
|
|
5
8
|
from scrapling.core._types import (
|
6
|
-
Optional,
|
7
9
|
Dict,
|
8
|
-
Callable,
|
9
10
|
List,
|
11
|
+
Tuple,
|
12
|
+
Optional,
|
13
|
+
Callable,
|
10
14
|
SelectorWaitStates,
|
11
15
|
)
|
12
16
|
from scrapling.engines.toolbelt.navigation import construct_proxy_dict
|
13
17
|
|
14
18
|
|
19
|
+
# Custom validators for msgspec
|
20
|
+
def _validate_file_path(value: str) -> None:
    """Validate that ``value`` points to an existing file given as an absolute path.

    The checks run in this order: existence, being a file, being absolute.
    The first failing check determines the error message.

    Args:
        value: Path of the init script to check.

    Raises:
        ValueError: If the path does not exist, is not a file, or is not
            an absolute path.
    """
    path = Path(value)
    if not path.exists():
        raise ValueError(f"Init script path not found: {value}")
    if not path.is_file():
        raise ValueError(f"Init script is not a file: {value}")
    if not path.is_absolute():
        # A relative path can still exist relative to the current working
        # directory, so this check is reachable after the two above.
        raise ValueError(f"Init script is not an absolute path: {value}")
|
29
|
+
|
30
|
+
|
31
|
+
def _validate_addon_path(value: str):
    """Check that ``value`` is an existing directory (an extracted addon).

    Raises:
        FileNotFoundError: If nothing exists at the given path.
        ValueError: If the path exists but is not a directory.
    """
    addon_dir = Path(value)
    if addon_dir.exists():
        if addon_dir.is_dir():
            return None
        raise ValueError(f"Addon path must be a directory of the extracted addon: {value}")
    raise FileNotFoundError(f"Addon path not found: {value}")
|
38
|
+
|
39
|
+
|
40
|
+
def _validate_cdp_url(cdp_url: str):
    """Check that ``cdp_url`` is a WebSocket URL with a non-empty network location.

    Any failure surfaces as ``ValueError``. The scheme/hostname errors raised
    inside the ``try`` are themselves caught by the generic handler below, so
    the message callers see is prefixed with ``Invalid CDP URL '<url>': ``.

    Raises:
        ValueError: If the value has no ``startswith`` attribute (e.g. ``None``),
            does not start with ``ws://``/``wss://``, or has an empty netloc.
    """
    websocket_prefixes = ("ws://", "wss://")
    try:
        # Scheme check
        uses_websocket = cdp_url.startswith(websocket_prefixes)
        if not uses_websocket:
            raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")

        # Host (and optional port) check
        network_location = urlparse(cdp_url).netloc
        if not network_location:
            raise ValueError("Invalid hostname for the CDP URL")

    except AttributeError as err:
        # Raised when `cdp_url` is not string-like (no `.startswith`).
        raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(err)}")

    except Exception as err:
        raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(err)}")
|
56
|
+
|
57
|
+
|
58
|
+
# Type aliases for cleaner annotations
|
59
|
+
PagesCount = Annotated[int, Meta(ge=1, le=50)]
|
60
|
+
# `Annotated` takes exactly one type followed by metadata, so "int or float"
# has to be spelled as a single union type; otherwise `float` is just metadata.
Seconds = Annotated[int | float, Meta(ge=0)]
|
61
|
+
|
62
|
+
|
15
63
|
class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
16
64
|
"""Configuration struct for validation"""
|
17
65
|
|
18
|
-
max_pages:
|
66
|
+
max_pages: PagesCount = 1
|
19
67
|
cdp_url: Optional[str] = None
|
20
68
|
headless: bool = True
|
21
69
|
google_search: bool = True
|
@@ -23,13 +71,13 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
23
71
|
disable_webgl: bool = False
|
24
72
|
real_chrome: bool = False
|
25
73
|
stealth: bool = False
|
26
|
-
wait:
|
74
|
+
wait: Seconds = 0
|
27
75
|
page_action: Optional[Callable] = None
|
28
76
|
proxy: Optional[str | Dict[str, str]] = None # The default value for proxy in Playwright's source is `None`
|
29
77
|
locale: str = "en-US"
|
30
78
|
extra_headers: Optional[Dict[str, str]] = None
|
31
79
|
useragent: Optional[str] = None
|
32
|
-
timeout:
|
80
|
+
timeout: Seconds = 30000
|
33
81
|
init_script: Optional[str] = None
|
34
82
|
disable_resources: bool = False
|
35
83
|
wait_selector: Optional[str] = None
|
@@ -41,52 +89,26 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
41
89
|
|
42
90
|
def __post_init__(self):
|
43
91
|
"""Custom validation after msgspec validation"""
|
44
|
-
if self.max_pages < 1 or self.max_pages > 50:
|
45
|
-
raise ValueError("max_pages must be between 1 and 50")
|
46
|
-
if self.timeout < 0:
|
47
|
-
raise ValueError("timeout must be >= 0")
|
48
92
|
if self.page_action and not callable(self.page_action):
|
49
93
|
raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
|
50
94
|
if self.proxy:
|
51
95
|
self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
|
52
96
|
if self.cdp_url:
|
53
|
-
|
97
|
+
_validate_cdp_url(self.cdp_url)
|
98
|
+
|
54
99
|
if not self.cookies:
|
55
100
|
self.cookies = []
|
56
101
|
if not self.selector_config:
|
57
102
|
self.selector_config = {}
|
58
103
|
|
59
104
|
if self.init_script is not None:
|
60
|
-
|
61
|
-
if not script_path.exists():
|
62
|
-
raise ValueError("Init script path not found")
|
63
|
-
elif not script_path.is_file():
|
64
|
-
raise ValueError("Init script is not a file")
|
65
|
-
elif not script_path.is_absolute():
|
66
|
-
raise ValueError("Init script is not a absolute path")
|
67
|
-
|
68
|
-
@staticmethod
|
69
|
-
def __validate_cdp(cdp_url):
|
70
|
-
try:
|
71
|
-
# Check the scheme
|
72
|
-
if not cdp_url.startswith(("ws://", "wss://")):
|
73
|
-
raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
|
74
|
-
|
75
|
-
# Validate hostname and port
|
76
|
-
if not urlparse(cdp_url).netloc:
|
77
|
-
raise ValueError("Invalid hostname for the CDP URL")
|
78
|
-
|
79
|
-
except AttributeError as e:
|
80
|
-
raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(e)}")
|
81
|
-
|
82
|
-
except Exception as e:
|
83
|
-
raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
|
105
|
+
_validate_file_path(self.init_script)
|
84
106
|
|
85
107
|
|
86
108
|
class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
87
109
|
"""Configuration struct for validation"""
|
88
110
|
|
89
|
-
max_pages:
|
111
|
+
max_pages: PagesCount = 1
|
90
112
|
headless: bool = True # noqa: F821
|
91
113
|
block_images: bool = False
|
92
114
|
disable_resources: bool = False
|
@@ -96,8 +118,8 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
96
118
|
load_dom: bool = True
|
97
119
|
humanize: bool | float = True
|
98
120
|
solve_cloudflare: bool = False
|
99
|
-
wait:
|
100
|
-
timeout:
|
121
|
+
wait: Seconds = 0
|
122
|
+
timeout: Seconds = 30000
|
101
123
|
init_script: Optional[str] = None
|
102
124
|
page_action: Optional[Callable] = None
|
103
125
|
wait_selector: Optional[str] = None
|
@@ -115,38 +137,23 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
115
137
|
|
116
138
|
def __post_init__(self):
|
117
139
|
"""Custom validation after msgspec validation"""
|
118
|
-
if self.max_pages < 1 or self.max_pages > 50:
|
119
|
-
raise ValueError("max_pages must be between 1 and 50")
|
120
|
-
if self.timeout < 0:
|
121
|
-
raise ValueError("timeout must be >= 0")
|
122
140
|
if self.page_action and not callable(self.page_action):
|
123
141
|
raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
|
124
142
|
if self.proxy:
|
125
143
|
self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
|
126
144
|
|
127
|
-
if
|
128
|
-
self.addons = []
|
129
|
-
else:
|
145
|
+
if self.addons and isinstance(self.addons, list):
|
130
146
|
for addon in self.addons:
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
elif not addon_path.is_dir():
|
135
|
-
raise ValueError(
|
136
|
-
f"Addon's path is not a folder, you need to pass a folder of the extracted addon: {addon}"
|
137
|
-
)
|
147
|
+
_validate_addon_path(addon)
|
148
|
+
else:
|
149
|
+
self.addons = []
|
138
150
|
|
139
151
|
if self.init_script is not None:
|
140
|
-
|
141
|
-
if not script_path.exists():
|
142
|
-
raise ValueError("Init script path not found")
|
143
|
-
elif not script_path.is_file():
|
144
|
-
raise ValueError("Init script is not a file")
|
145
|
-
elif not script_path.is_absolute():
|
146
|
-
raise ValueError("Init script is not a absolute path")
|
152
|
+
_validate_file_path(self.init_script)
|
147
153
|
|
148
154
|
if not self.cookies:
|
149
155
|
self.cookies = []
|
156
|
+
# Cloudflare timeout adjustment
|
150
157
|
if self.solve_cloudflare and self.timeout < 60_000:
|
151
158
|
self.timeout = 60_000
|
152
159
|
if not self.selector_config:
|
@@ -155,10 +162,68 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
155
162
|
self.additional_args = {}
|
156
163
|
|
157
164
|
|
158
|
-
|
165
|
+
# Helpers that validate `fetch` arguments in as few lines of code as possible
|
166
|
+
class FetchConfig(Struct, kw_only=True):
    """Keyword-only struct describing the options accepted by `fetch` calls.

    Every field carries a default, so the struct can be built from any
    subset of the options.
    """

    google_search: bool = True
    timeout: Seconds = 30000
    wait: Seconds = 0
    page_action: Optional[Callable] = None
    extra_headers: Optional[Dict[str, str]] = None
    disable_resources: bool = False
    wait_selector: Optional[str] = None
    wait_selector_state: SelectorWaitStates = "attached"
    network_idle: bool = False
    load_dom: bool = True
    solve_cloudflare: bool = False
    selector_config: Optional[Dict] = {}

    def to_dict(self):
        """Return a plain dict mapping every struct field name to its current value."""
        snapshot = {}
        for field_name in self.__struct_fields__:
            snapshot[field_name] = getattr(self, field_name)
        return snapshot
|
184
|
+
|
185
|
+
|
186
|
+
@dataclass
class _fetch_params:
    """Container holding the full set of parameters a `fetch` call works with.

    No field has a default, so all twelve values must be supplied when
    constructing an instance.
    """

    # Field order is kept as-is because the dataclass also accepts
    # positional arguments.
    google_search: bool
    timeout: Seconds
    wait: Seconds
    page_action: Optional[Callable]
    extra_headers: Optional[Dict[str, str]]
    disable_resources: bool
    wait_selector: Optional[str]
    wait_selector_state: SelectorWaitStates
    network_idle: bool
    load_dom: bool
    solve_cloudflare: bool
    selector_config: Optional[Dict]
|
202
|
+
|
203
|
+
|
204
|
+
def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
    """Merge request-level and session-level `fetch` arguments into `_fetch_params`.

    Args:
        params: Triples of ``(argument name, request value, session value)``.
        sentinel: Marker meaning "no request-level value was given"; compared
            by identity.

    Returns:
        A `_fetch_params` instance. Request-level values are validated through
        `FetchConfig` only when at least one was supplied; session values are
        used as-is for every argument whose request value is the sentinel.
    """
    session_values = {}
    request_values = {}

    for name, per_request, per_session in params:
        if per_request is sentinel:
            session_values[name] = per_session
        else:
            request_values[name] = per_request

    if not request_values:
        # Nothing to validate; make sure `solve_cloudflare` is present and
        # falsy values are normalised to `False`.
        if not session_values.get("solve_cloudflare"):
            session_values["solve_cloudflare"] = False
        return _fetch_params(**session_values)

    # Validation fills in `FetchConfig` defaults for unspecified fields; the
    # session values then take the place of those defaults.
    merged = validate(request_values, FetchConfig).to_dict()
    merged.update(session_values)
    return _fetch_params(**merged)
|
223
|
+
|
224
|
+
|
225
|
+
def validate(params: Dict, model) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
    """Convert ``params`` into an instance of ``model`` via msgspec's `convert`.

    Raises:
        TypeError: If `convert` raises `ValidationError`; the original
            error is attached as the cause.
    """
    try:
        config = convert(params, model)
    except ValidationError as exc:
        raise TypeError(f"Invalid argument type: {exc}") from exc
    return config
|
scrapling/engines/constants.py
CHANGED
@@ -101,18 +101,3 @@ DEFAULT_STEALTH_FLAGS = (
|
|
101
101
|
"--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
|
102
102
|
"--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees",
|
103
103
|
)
|
104
|
-
|
105
|
-
# Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
|
106
|
-
NSTBROWSER_DEFAULT_QUERY = {
|
107
|
-
"once": True,
|
108
|
-
"headless": True,
|
109
|
-
"autoClose": True,
|
110
|
-
"fingerprint": {
|
111
|
-
"flags": {"timezone": "BasedOnIp", "screen": "Custom"},
|
112
|
-
"platform": "linux", # support: windows, mac, linux
|
113
|
-
"kernel": "chromium", # only support: chromium
|
114
|
-
"kernelMilestone": "128",
|
115
|
-
"hardwareConcurrency": 8,
|
116
|
-
"deviceMemory": 8,
|
117
|
-
},
|
118
|
-
}
|