scrapling-0.3.1-py3-none-any.whl → scrapling-0.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/cli.py +38 -51
- scrapling/core/_html_utils.py +3 -9
- scrapling/core/ai.py +5 -13
- scrapling/core/custom_types.py +19 -61
- scrapling/core/mixins.py +6 -28
- scrapling/core/shell.py +49 -127
- scrapling/core/storage.py +2 -8
- scrapling/core/translator.py +8 -20
- scrapling/core/utils/__init__.py +10 -0
- scrapling/core/utils/_shell.py +48 -0
- scrapling/core/{utils.py → utils/_utils.py} +5 -21
- scrapling/engines/__init__.py +0 -16
- scrapling/engines/_browsers/_base.py +297 -0
- scrapling/engines/_browsers/_camoufox.py +219 -296
- scrapling/engines/_browsers/_config_tools.py +2 -1
- scrapling/engines/_browsers/_controllers.py +201 -281
- scrapling/engines/_browsers/_page.py +37 -15
- scrapling/engines/_browsers/_validators.py +9 -15
- scrapling/engines/constants.py +3 -6
- scrapling/engines/static.py +25 -75
- scrapling/engines/toolbelt/__init__.py +1 -20
- scrapling/engines/toolbelt/convertor.py +95 -86
- scrapling/engines/toolbelt/custom.py +7 -99
- scrapling/engines/toolbelt/fingerprints.py +1 -3
- scrapling/engines/toolbelt/navigation.py +4 -58
- scrapling/fetchers.py +29 -24
- scrapling/parser.py +45 -122
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/METADATA +54 -46
- scrapling-0.3.3.dist-info/RECORD +44 -0
- scrapling-0.3.1.dist-info/RECORD +0 -41
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/WHEEL +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/top_level.txt +0 -0
scrapling/engines/toolbelt/convertor.py

@@ -1,10 +1,15 @@
+from functools import lru_cache
+from re import compile as re_compile
+
 from curl_cffi.requests import Response as CurlResponse
 from playwright.sync_api import Page as SyncPage, Response as SyncResponse
 from playwright.async_api import Page as AsyncPage, Response as AsyncResponse
 
 from scrapling.core.utils import log
-from scrapling.core._types import Dict, Optional
 from .custom import Response, StatusText
+from scrapling.core._types import Dict, Optional
+
+__CHARSET_RE__ = re_compile(r"charset=([\w-]+)")
 
 
 class ResponseFactory:
@@ -18,9 +23,19 @@ class ResponseFactory:
     """
 
     @classmethod
-    def _process_response_history(
-        cls, first_response: SyncResponse, parser_arguments: Dict
-    ) -> list[Response]:
+    @lru_cache(maxsize=16)
+    def __extract_browser_encoding(cls, content_type: str | None) -> Optional[str]:
+        """Extract browser encoding from headers.
+        Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8
+        """
+        if content_type:
+            # Because Playwright can't do that by themselves like all libraries for some reason :3
+            match = __CHARSET_RE__.search(content_type)
+            return match.group(1) if match else None
+        return None
+
+    @classmethod
+    def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
        """Process response history to build a list of `Response` objects"""
        history = []
        current_request = first_response.request.redirected_from
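In isolation, the new helper reduces to the sketch below (a standalone reduction; in the diff it is a private classmethod cached with `lru_cache(maxsize=16)`):

```python
from re import compile as re_compile

# Same pattern the diff adds at module level
CHARSET_RE = re_compile(r"charset=([\w-]+)")

def extract_browser_encoding(content_type: str | None) -> str | None:
    """Pull the charset out of a Content-Type header: "text/html; charset=utf-8" -> "utf-8"."""
    if content_type:
        match = CHARSET_RE.search(content_type)
        return match.group(1) if match else None
    return None

assert extract_browser_encoding("text/html; charset=utf-8") == "utf-8"
assert extract_browser_encoding("application/json") is None  # no charset parameter
assert extract_browser_encoding(None) is None
```

Note the tradeoff: unlike the `email.message`-based parser this release removes from custom.py (see below), the regex does not match quoted values such as `charset="utf-8"`, which therefore fall through to the UTF-8 default.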
@@ -32,24 +47,23 @@ class ResponseFactory:
                 history.insert(
                     0,
                     Response(
-
-
-
-
-
-                                current_response.status_text
-
-
-
-
-
-
-
-
-
-
-
-                            **parser_arguments,
+                        **{
+                            "url": current_request.url,
+                            # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                            "content": "",
+                            "status": current_response.status if current_response else 301,
+                            "reason": (current_response.status_text or StatusText.get(current_response.status))
+                            if current_response
+                            else StatusText.get(301),
+                            "encoding": cls.__extract_browser_encoding(
+                                current_response.headers.get("content-type", "")
+                            )
+                            or "utf-8",
+                            "cookies": tuple(),
+                            "headers": current_response.all_headers() if current_response else {},
+                            "request_headers": current_request.all_headers(),
+                            **parser_arguments,
+                        }
                     ),
                 )
        except Exception as e:  # pragma: no cover
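The chain itself comes from Playwright's request linkage; a simplified sketch of the traversal that feeds these `Response` objects (hypothetical reduction, standard sync Playwright API):

```python
# Each redirected request points back to its predecessor via `redirected_from`.
def collect_redirect_chain(final_response):
    chain = []
    current_request = final_response.request.redirected_from
    while current_request:
        # Redirect hops have no readable body, hence "content": "" in the diff above
        chain.insert(0, current_request.url)
        current_request = current_request.redirected_from
    return chain  # oldest hop first
```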
@@ -93,14 +107,11 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")
 
-        # This will be parsed inside `Response`
         encoding = (
-            final_response.headers.get("content-type", "") or "utf-8"
+            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
         )  # default encoding
         # PlayWright API sometimes give empty status text for some reason!
-        status_text = final_response.status_text or StatusText.get(
-            final_response.status
-        )
+        status_text = final_response.status_text or StatusText.get(final_response.status)
 
         history = cls._process_response_history(first_response, parser_arguments)
         try:
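The "empty status text" the comment grumbles about is expected for HTTP/2 traffic: the protocol dropped reason phrases entirely, so Playwright reports `status_text` as `""` and the code substitutes a standard phrase. A hypothetical one-liner illustration, assuming `StatusText.get` maps codes to their standard reason phrases:

```python
# "" is falsy, so an HTTP/2 response (no reason phrase) falls through to the lookup
status_text = "" or StatusText.get(200)  # -> "OK"
```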
@@ -110,16 +121,18 @@ class ResponseFactory:
             page_content = ""
 
         return Response(
-
-
-
-
-
-
-
-
-
-
+            **{
+                "url": page.url,
+                "content": page_content,
+                "status": final_response.status,
+                "reason": status_text,
+                "encoding": encoding,
+                "cookies": tuple(dict(cookie) for cookie in page.context.cookies()),
+                "headers": first_response.all_headers(),
+                "request_headers": first_response.request.all_headers(),
+                "history": history,
+                **parser_arguments,
+            }
         )
 
     @classmethod
@@ -137,24 +150,23 @@ class ResponseFactory:
                 history.insert(
                     0,
                     Response(
-
-
-
-
-
-                                current_response.status_text
-
-
-
-
-
-
-
-
-
-
-
-                            **parser_arguments,
+                        **{
+                            "url": current_request.url,
+                            # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                            "content": "",
+                            "status": current_response.status if current_response else 301,
+                            "reason": (current_response.status_text or StatusText.get(current_response.status))
+                            if current_response
+                            else StatusText.get(301),
+                            "encoding": cls.__extract_browser_encoding(
+                                current_response.headers.get("content-type", "")
+                            )
+                            or "utf-8",
+                            "cookies": tuple(),
+                            "headers": await current_response.all_headers() if current_response else {},
+                            "request_headers": await current_request.all_headers(),
+                            **parser_arguments,
+                        }
                     ),
                 )
        except Exception as e:  # pragma: no cover
@@ -198,18 +210,13 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")
 
-        # This will be parsed inside `Response`
         encoding = (
-            final_response.headers.get("content-type", "") or "utf-8"
+            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
         )  # default encoding
         # PlayWright API sometimes give empty status text for some reason!
-        status_text = final_response.status_text or StatusText.get(
-            final_response.status
-        )
+        status_text = final_response.status_text or StatusText.get(final_response.status)
 
-        history = await cls._async_process_response_history(
-            first_response, parser_arguments
-        )
+        history = await cls._async_process_response_history(first_response, parser_arguments)
         try:
             page_content = await page.content()
         except Exception as e:  # pragma: no cover
@@ -217,16 +224,18 @@ class ResponseFactory:
             page_content = ""
 
         return Response(
-
-
-
-
-
-
-
-
-
-
+            **{
+                "url": page.url,
+                "content": page_content,
+                "status": final_response.status,
+                "reason": status_text,
+                "encoding": encoding,
+                "cookies": tuple(dict(cookie) for cookie in await page.context.cookies()),
+                "headers": await first_response.all_headers(),
+                "request_headers": await first_response.request.all_headers(),
+                "history": history,
+                **parser_arguments,
+            }
         )
 
     @staticmethod
@@ -238,17 +247,17 @@ class ResponseFactory:
     :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
     """
     return Response(
-
-
-
-
-
-
-
-
-
-
-
-
-
+        **{
+            "url": response.url,
+            "content": response.content,
+            "status": response.status_code,
+            "reason": response.reason,
+            "encoding": response.encoding or "utf-8",
+            "cookies": dict(response.cookies),
+            "headers": dict(response.headers),
+            "request_headers": dict(response.request.headers),
+            "method": response.request.method,
+            "history": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
+            **parser_arguments,
+        }
     )
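This static converter is what the HTTP (curl_cffi) fetch path funnels through. A usage sketch, assuming the method is exposed as `ResponseFactory.from_http_request` (its name is not visible in this hunk) and that an empty `parser_arguments` dict is acceptable:

```python
from curl_cffi import requests as curl_requests

raw = curl_requests.get("https://example.com", impersonate="chrome")
# parser_arguments would normally come from BaseFetcher._generate_parser_arguments()
response = ResponseFactory.from_http_request(raw, parser_arguments={})
print(response.status, response.reason)
```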
scrapling/engines/toolbelt/custom.py

@@ -2,8 +2,9 @@
 Functions related to custom types or type checking
 """
 
-from email.message import Message
+from functools import lru_cache
 
+from scrapling.core.utils import log
 from scrapling.core._types import (
     Any,
     Dict,
@@ -12,89 +13,9 @@ from scrapling.core._types import (
     Tuple,
 )
 from scrapling.core.custom_types import MappingProxyType
-from scrapling.core.utils import log, lru_cache
 from scrapling.parser import Selector, SQLiteStorageSystem
 
 
-class ResponseEncoding:
-    __DEFAULT_ENCODING = "utf-8"
-    __ISO_8859_1_CONTENT_TYPES = {
-        "text/plain",
-        "text/html",
-        "text/css",
-        "text/javascript",
-    }
-
-    @classmethod
-    @lru_cache(maxsize=128)
-    def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
-        """Parse content type and parameters from a content-type header value.
-
-        Uses `email.message.Message` for robust header parsing according to RFC 2045.
-
-        :param header_value: Raw content-type header string
-        :return: Tuple of (content_type, parameters_dict)
-        """
-        # Create a Message object and set the Content-Type header then get the content type and parameters
-        msg = Message()
-        msg["content-type"] = header_value
-
-        content_type = msg.get_content_type()
-        params = dict(msg.get_params(failobj=[]))
-
-        # Remove the content-type from params if present somehow
-        params.pop("content-type", None)
-
-        return content_type, params
-
-    @classmethod
-    @lru_cache(maxsize=128)
-    def get_value(
-        cls, content_type: Optional[str], text: Optional[str] = "test"
-    ) -> str:
-        """Determine the appropriate character encoding from a content-type header.
-
-        The encoding is determined by these rules in order:
-        1. If no content-type is provided, use UTF-8
-        2. If charset parameter is present, use that encoding
-        3. If content-type is `text/*`, use ISO-8859-1 per HTTP/1.1 spec
-        4. If content-type is application/json, use UTF-8 per RFC 4627
-        5. Default to UTF-8 if nothing else matches
-
-        :param content_type: Content-Type header value or None
-        :param text: A text to test the encoding on it
-        :return: String naming the character encoding
-        """
-        if not content_type:
-            return cls.__DEFAULT_ENCODING
-
-        try:
-            encoding = None
-            content_type, params = cls.__parse_content_type(content_type)
-
-            # First check for explicit charset parameter
-            if "charset" in params:
-                encoding = params["charset"].strip("'\"")
-
-            # Apply content-type specific rules
-            elif content_type in cls.__ISO_8859_1_CONTENT_TYPES:
-                encoding = "ISO-8859-1"
-
-            elif content_type == "application/json":
-                encoding = cls.__DEFAULT_ENCODING
-
-            if encoding:
-                _ = text.encode(
-                    encoding
-                )  # Validate encoding and validate it can encode the given text
-                return encoding
-
-            return cls.__DEFAULT_ENCODING
-
-        except (ValueError, LookupError, UnicodeEncodeError):
-            return cls.__DEFAULT_ENCODING
-
-
 class Response(Selector):
     """This class is returned by all engines as a way to unify response type between different libraries."""
 
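The removed class leaned on the stdlib's RFC 2045 header parser. Reduced to a sketch, this is the core of what 0.3.3 drops in favor of the charset regex now living in convertor.py:

```python
from email.message import Message

def old_style_charset(header_value: str) -> str | None:
    """Charset extraction the way the removed ResponseEncoding.__parse_content_type did it."""
    msg = Message()
    msg["content-type"] = header_value
    params = dict(msg.get_params(failobj=[]))  # get_params unquotes values for us
    return params.get("charset")

# Handles quoting that the new regex approach ignores:
print(old_style_charset('text/html; charset="ISO-8859-1"'))  # ISO-8859-1
```

The behavioral consequence: the old defaulting rules (ISO-8859-1 for `text/*`, UTF-8 for `application/json`) are gone, and anything without an explicit, unquoted charset parameter now falls back to UTF-8.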
@@ -119,9 +40,6 @@ class Response(Selector):
         self.headers = headers
         self.request_headers = request_headers
         self.history = history or []
-        encoding = ResponseEncoding.get_value(
-            encoding, content.decode("utf-8") if isinstance(content, bytes) else content
-        )
         super().__init__(
             content=content,
             url=adaptive_domain or url,
@@ -129,9 +47,7 @@ class Response(Selector):
             **selector_config,
         )
         # For easier debugging while working from a Python shell
-        log.info(
-            f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})"
-        )
+        log.info(f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})")
 
 
 class BaseFetcher:
@@ -190,18 +106,12 @@ class BaseFetcher:
                     setattr(cls, key, value)
                 else:
                     # Yup, no fun allowed LOL
-                    raise AttributeError(
-                        f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?'
-                    )
+                    raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
             else:
-                raise ValueError(
-                    f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?'
-                )
+                raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
 
         if not kwargs:
-            raise AttributeError(
-                f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?"
-            )
+            raise AttributeError(f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?")
 
     @classmethod
     def _generate_parser_arguments(cls) -> Dict:
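These errors guard the fetcher's `configure` entry point (the error text names it). Assumed usage on the public `Fetcher` class; the keyword names below are hypothetical, since `cls.parser_keywords` is not listed in this hunk:

```python
from scrapling.fetchers import Fetcher

Fetcher.configure(keep_comments=True)   # accepted if "keep_comments" is in parser_keywords
Fetcher.configure(keep_commentz=True)   # AttributeError: Unknown parser argument ... maybe you meant ...?
Fetcher.configure()                     # AttributeError: you must pass at least one keyword
```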
@@ -217,9 +127,7 @@ class BaseFetcher:
         )
         if cls.adaptive_domain:
             if not isinstance(cls.adaptive_domain, str):
-                log.warning(
-                    '[Ignored] The argument "adaptive_domain" must be of string type'
-                )
+                log.warning('[Ignored] The argument "adaptive_domain" must be of string type')
             else:
                 parser_arguments.update({"adaptive_domain": cls.adaptive_domain})
 
scrapling/engines/toolbelt/fingerprints.py

@@ -2,13 +2,13 @@
 Functions related to generating headers and fingerprints generally
 """
 
+from functools import lru_cache
 from platform import system as platform_system
 
 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator
 
 from scrapling.core._types import Dict, Optional
-from scrapling.core.utils import lru_cache
 
 __OS_NAME__ = platform_system()
 
@@ -37,8 +37,6 @@ def get_os_name() -> Optional[str]:
         "Linux": "linux",
         "Darwin": "macos",
         "Windows": "windows",
-        # For the future? because why not?
-        "iOS": "ios",
     }.get(__OS_NAME__)
 
 
scrapling/engines/toolbelt/navigation.py

@@ -30,9 +30,7 @@ def intercept_route(route: Route):
     :return: PlayWright `Route` object
     """
     if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
-        log.debug(
-            f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
-        )
+        log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
         route.abort()
     else:
         route.continue_()
@@ -45,17 +43,13 @@ async def async_intercept_route(route: async_Route):
     :return: PlayWright `Route` object
     """
     if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
-        log.debug(
-            f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
-        )
+        log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
         await route.abort()
     else:
         await route.continue_()
 
 
-def construct_proxy_dict(
-    proxy_string: str | Dict[str, str], as_tuple=False
-) -> Optional[Dict | Tuple]:
+def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) -> Optional[Dict | Tuple]:
     """Validate a proxy and return it in the acceptable format for Playwright
     Reference: https://playwright.dev/python/docs/network#http-proxy
 
@@ -65,10 +59,7 @@ def construct_proxy_dict(
     """
     if isinstance(proxy_string, str):
         proxy = urlparse(proxy_string)
-        if (
-            proxy.scheme not in ("http", "https", "socks4", "socks5")
-            or not proxy.hostname
-        ):
+        if proxy.scheme not in ("http", "https", "socks4", "socks5") or not proxy.hostname:
             raise ValueError("Invalid proxy string!")
 
         try:
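The reflowed guard above is the whole validation change; the function's contract is unchanged. A sketch of the expected shape, following the Playwright proxy format the docstring references (the exact output keys are assumed from that format, not shown in this hunk):

```python
construct_proxy_dict("socks5://user:pass@127.0.0.1:9050")
# -> roughly {"server": "socks5://127.0.0.1:9050", "username": "user", "password": "pass"}

construct_proxy_dict("ftp://127.0.0.1")  # ValueError: Invalid proxy string!
```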
@@ -95,51 +86,6 @@ def construct_proxy_dict(
     return None
 
 
-def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
-    """Takes a CDP URL, reconstruct it to check it's valid, then adds encoded parameters if exists
-
-    :param cdp_url: The target URL.
-    :param query_params: A dictionary of the parameters to add.
-    :return: The new CDP URL.
-    """
-    try:
-        # Validate the base URL structure
-        parsed = urlparse(cdp_url)
-
-        # Check scheme
-        if parsed.scheme not in ("ws", "wss"):
-            raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
-
-        # Validate hostname and port
-        if not parsed.netloc:
-            raise ValueError("Invalid hostname for the CDP URL")
-
-        try:
-            # Checking if the port is valid (if available)
-            _ = parsed.port
-        except ValueError:
-            # urlparse will raise `ValueError` if the port can't be casted to integer
-            raise ValueError("Invalid port for the CDP URL")
-
-        # Ensure the path starts with /
-        path = parsed.path
-        if not path.startswith("/"):
-            path = "/" + path
-
-        # Reconstruct the base URL with validated parts
-        validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"
-
-        # Add query parameters
-        if query_params:
-            query_string = urlencode(query_params)
-            return f"{validated_base}?{query_string}"
-
-        return validated_base
-
-    except Exception as e:
-        raise ValueError(f"Invalid CDP URL: {str(e)}")
-
-
 @lru_cache(10, typed=True)
 def js_bypass_path(filename: str) -> str:
     """Takes the base filename of a JS file inside the `bypasses` folder, then return the full path of it