scrapling 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,21 +1,69 @@
1
- from msgspec import Struct, convert, ValidationError
2
- from urllib.parse import urlparse
3
1
  from pathlib import Path
2
+ from typing import Annotated
3
+ from dataclasses import dataclass
4
+ from urllib.parse import urlparse
5
+
6
+ from msgspec import Struct, Meta, convert, ValidationError
4
7
 
5
8
  from scrapling.core._types import (
6
- Optional,
7
9
  Dict,
8
- Callable,
9
10
  List,
11
+ Tuple,
12
+ Optional,
13
+ Callable,
10
14
  SelectorWaitStates,
11
15
  )
12
16
  from scrapling.engines.toolbelt.navigation import construct_proxy_dict
13
17
 
14
18
 
19
+ # Custom validators for msgspec
20
def _validate_file_path(value: str) -> None:
    """Validate that an init script path is an existing, absolute file path.

    The checks run in a fixed order (existence, regular file, absolute path),
    and the first one that fails determines the error message.

    :param value: The init script path as a string.
    :raises ValueError: If the path does not exist, is not a file, or is not absolute.
    """
    path = Path(value)
    if not path.exists():
        raise ValueError(f"Init script path not found: {value}")
    if not path.is_file():
        raise ValueError(f"Init script is not a file: {value}")
    if not path.is_absolute():
        raise ValueError(f"Init script is not an absolute path: {value}")
29
+
30
+
31
def _validate_addon_path(value: str):
    """Validate that an addon path points to an existing directory.

    :param value: The addon path as a string.
    :raises FileNotFoundError: If nothing exists at the given path.
    :raises ValueError: If the path exists but is not a directory.
    """
    addon_dir = Path(value)
    if addon_dir.exists():
        if addon_dir.is_dir():
            return
        # Something exists there, but it is not a directory
        raise ValueError(f"Addon path must be a directory of the extracted addon: {value}")
    raise FileNotFoundError(f"Addon path not found: {value}")
38
+
39
+
40
def _validate_cdp_url(cdp_url: str) -> None:
    """Validate that a CDP URL uses a WebSocket scheme and has a network location.

    :param cdp_url: The CDP endpoint URL to check.
    :raises ValueError: If the value is not string-like, does not start with
        ``ws://`` or ``wss://``, or has an empty network location. The original
        exception is attached as ``__cause__``.
    """
    try:
        # Check the scheme
        if not cdp_url.startswith(("ws://", "wss://")):
            raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")

        # Require a non-empty network location (the `host[:port]` part)
        if not urlparse(cdp_url).netloc:
            raise ValueError("Invalid hostname for the CDP URL")

    except AttributeError as e:
        # Raised when `cdp_url` has no `startswith` method (e.g., it is not a string)
        raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(e)}") from e

    except Exception as e:
        # This also catches the ValueErrors raised above, so every failure is
        # reported with the offending URL in the message.
        raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}") from e
56
+
57
+
58
# Type aliases for cleaner annotations
# Integer constrained to the inclusive range 1..50 (used for `max_pages`).
PagesCount = Annotated[int, Meta(ge=1, le=50)]
# Non-negative number that may be an int or a float (used for `wait`/`timeout`).
# Only the first argument of `Annotated` is the type; everything after it is
# metadata. The union therefore has to be spelled `int | float` in the first
# slot, otherwise the alias declares an int-only type.
Seconds = Annotated[int | float, Meta(ge=0)]
61
+
62
+
15
63
  class PlaywrightConfig(Struct, kw_only=True, frozen=False):
16
64
  """Configuration struct for validation"""
17
65
 
18
- max_pages: int = 1
66
+ max_pages: PagesCount = 1
19
67
  cdp_url: Optional[str] = None
20
68
  headless: bool = True
21
69
  google_search: bool = True
@@ -23,13 +71,13 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
23
71
  disable_webgl: bool = False
24
72
  real_chrome: bool = False
25
73
  stealth: bool = False
26
- wait: int | float = 0
74
+ wait: Seconds = 0
27
75
  page_action: Optional[Callable] = None
28
76
  proxy: Optional[str | Dict[str, str]] = None # The default value for proxy in Playwright's source is `None`
29
77
  locale: str = "en-US"
30
78
  extra_headers: Optional[Dict[str, str]] = None
31
79
  useragent: Optional[str] = None
32
- timeout: int | float = 30000
80
+ timeout: Seconds = 30000
33
81
  init_script: Optional[str] = None
34
82
  disable_resources: bool = False
35
83
  wait_selector: Optional[str] = None
@@ -41,52 +89,26 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
41
89
 
42
90
  def __post_init__(self):
43
91
  """Custom validation after msgspec validation"""
44
- if self.max_pages < 1 or self.max_pages > 50:
45
- raise ValueError("max_pages must be between 1 and 50")
46
- if self.timeout < 0:
47
- raise ValueError("timeout must be >= 0")
48
92
  if self.page_action and not callable(self.page_action):
49
93
  raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
50
94
  if self.proxy:
51
95
  self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
52
96
  if self.cdp_url:
53
- self.__validate_cdp(self.cdp_url)
97
+ _validate_cdp_url(self.cdp_url)
98
+
54
99
  if not self.cookies:
55
100
  self.cookies = []
56
101
  if not self.selector_config:
57
102
  self.selector_config = {}
58
103
 
59
104
  if self.init_script is not None:
60
- script_path = Path(self.init_script)
61
- if not script_path.exists():
62
- raise ValueError("Init script path not found")
63
- elif not script_path.is_file():
64
- raise ValueError("Init script is not a file")
65
- elif not script_path.is_absolute():
66
- raise ValueError("Init script is not a absolute path")
67
-
68
- @staticmethod
69
- def __validate_cdp(cdp_url):
70
- try:
71
- # Check the scheme
72
- if not cdp_url.startswith(("ws://", "wss://")):
73
- raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
74
-
75
- # Validate hostname and port
76
- if not urlparse(cdp_url).netloc:
77
- raise ValueError("Invalid hostname for the CDP URL")
78
-
79
- except AttributeError as e:
80
- raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(e)}")
81
-
82
- except Exception as e:
83
- raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
105
+ _validate_file_path(self.init_script)
84
106
 
85
107
 
86
108
  class CamoufoxConfig(Struct, kw_only=True, frozen=False):
87
109
  """Configuration struct for validation"""
88
110
 
89
- max_pages: int = 1
111
+ max_pages: PagesCount = 1
90
112
  headless: bool = True # noqa: F821
91
113
  block_images: bool = False
92
114
  disable_resources: bool = False
@@ -96,8 +118,8 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
96
118
  load_dom: bool = True
97
119
  humanize: bool | float = True
98
120
  solve_cloudflare: bool = False
99
- wait: int | float = 0
100
- timeout: int | float = 30000
121
+ wait: Seconds = 0
122
+ timeout: Seconds = 30000
101
123
  init_script: Optional[str] = None
102
124
  page_action: Optional[Callable] = None
103
125
  wait_selector: Optional[str] = None
@@ -115,38 +137,23 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
115
137
 
116
138
  def __post_init__(self):
117
139
  """Custom validation after msgspec validation"""
118
- if self.max_pages < 1 or self.max_pages > 50:
119
- raise ValueError("max_pages must be between 1 and 50")
120
- if self.timeout < 0:
121
- raise ValueError("timeout must be >= 0")
122
140
  if self.page_action and not callable(self.page_action):
123
141
  raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
124
142
  if self.proxy:
125
143
  self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
126
144
 
127
- if not self.addons:
128
- self.addons = []
129
- else:
145
+ if self.addons and isinstance(self.addons, list):
130
146
  for addon in self.addons:
131
- addon_path = Path(addon)
132
- if not addon_path.exists():
133
- raise FileNotFoundError(f"Addon's path not found: {addon}")
134
- elif not addon_path.is_dir():
135
- raise ValueError(
136
- f"Addon's path is not a folder, you need to pass a folder of the extracted addon: {addon}"
137
- )
147
+ _validate_addon_path(addon)
148
+ else:
149
+ self.addons = []
138
150
 
139
151
  if self.init_script is not None:
140
- script_path = Path(self.init_script)
141
- if not script_path.exists():
142
- raise ValueError("Init script path not found")
143
- elif not script_path.is_file():
144
- raise ValueError("Init script is not a file")
145
- elif not script_path.is_absolute():
146
- raise ValueError("Init script is not a absolute path")
152
+ _validate_file_path(self.init_script)
147
153
 
148
154
  if not self.cookies:
149
155
  self.cookies = []
156
+ # Cloudflare timeout adjustment
150
157
  if self.solve_cloudflare and self.timeout < 60_000:
151
158
  self.timeout = 60_000
152
159
  if not self.selector_config:
@@ -155,10 +162,68 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
155
162
  self.additional_args = {}
156
163
 
157
164
 
158
- def validate(params, model):
165
+ # Validation helpers for `fetch` calls, kept in as few lines as possible
166
class FetchConfig(Struct, kw_only=True):
    """Validation schema for the keyword arguments accepted by `fetch` calls"""

    google_search: bool = True
    # NOTE(review): the 30000 default suggests milliseconds despite the `Seconds` alias name - confirm
    timeout: Seconds = 30000
    wait: Seconds = 0
    page_action: Optional[Callable] = None
    extra_headers: Optional[Dict[str, str]] = None
    disable_resources: bool = False
    wait_selector: Optional[str] = None
    wait_selector_state: SelectorWaitStates = "attached"
    network_idle: bool = False
    load_dom: bool = True
    solve_cloudflare: bool = False
    selector_config: Optional[Dict] = {}

    def to_dict(self):
        """Return a dict mapping every struct field name to its current value"""
        field_names = self.__struct_fields__
        return dict(zip(field_names, (getattr(self, name) for name in field_names)))
184
+
185
+
186
@dataclass
class _fetch_params:
    """Plain container holding the final value of every parameter used by a `fetch` call.

    All fields are required; there are no defaults.
    """

    google_search: bool
    timeout: Seconds
    wait: Seconds
    page_action: Optional[Callable]
    extra_headers: Optional[Dict[str, str]]
    disable_resources: bool
    wait_selector: Optional[str]
    wait_selector_state: SelectorWaitStates
    network_idle: bool
    load_dom: bool
    solve_cloudflare: bool
    selector_config: Optional[Dict]
202
+
203
+
204
def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
    """Resolve the effective parameters of a `fetch` call.

    Each item of `params` is an `(argument name, request value, session value)`
    triple. A request value that is the `sentinel` object means "not supplied",
    so the session value is used as-is. Supplied request values are validated
    through `FetchConfig` before being merged with the session values.

    :param params: The triples describing every argument.
    :param sentinel: The object marking a request value as not supplied.
    :return: A `_fetch_params` instance built from the merged values.
    """
    from_session = {}
    from_request = {}

    for name, per_request, per_session in params:
        if per_request is sentinel:
            from_session[name] = per_session
        else:
            from_request[name] = per_request

    if not from_request:
        # Nothing to validate: only make sure `solve_cloudflare` ends up as a proper `False`
        # when it is missing or falsy.
        if not from_session.get("solve_cloudflare"):
            from_session["solve_cloudflare"] = False
        return _fetch_params(**from_session)

    # Validation fills every field that wasn't supplied with the `FetchConfig` default;
    # the session values are applied afterwards, so they replace those defaults.
    merged = validate(from_request, FetchConfig).to_dict()
    merged.update(from_session)
    return _fetch_params(**merged)
223
+
224
+
225
def validate(params: Dict, model) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
    """Convert `params` into an instance of `model`.

    :param params: The raw arguments to validate.
    :param model: The struct type to convert the arguments into.
    :return: The instance produced by `convert`.
    :raises TypeError: If `convert` raises a `ValidationError`; the original
        error is attached as the cause.
    """
    try:
        config = convert(params, model)
    except ValidationError as exc:
        raise TypeError(f"Invalid argument type: {exc}") from exc
    else:
        return config
@@ -101,18 +101,3 @@ DEFAULT_STEALTH_FLAGS = (
101
101
  "--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
102
102
  "--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees",
103
103
  )
104
-
105
- # Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
106
- NSTBROWSER_DEFAULT_QUERY = {
107
- "once": True,
108
- "headless": True,
109
- "autoClose": True,
110
- "fingerprint": {
111
- "flags": {"timezone": "BasedOnIp", "screen": "Custom"},
112
- "platform": "linux", # support: windows, mac, linux
113
- "kernel": "chromium", # only support: chromium
114
- "kernelMilestone": "128",
115
- "hardwareConcurrency": 8,
116
- "deviceMemory": 8,
117
- },
118
- }