scrapling-0.3.5-py3-none-any.whl → scrapling-0.3.7-py3-none-any.whl
This diff compares the publicly released contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.
- scrapling/__init__.py +29 -19
- scrapling/cli.py +21 -4
- scrapling/core/_types.py +3 -2
- scrapling/core/ai.py +24 -15
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +6 -4
- scrapling/core/storage.py +7 -6
- scrapling/core/translator.py +13 -8
- scrapling/core/utils/__init__.py +0 -1
- scrapling/engines/_browsers/__init__.py +0 -2
- scrapling/engines/_browsers/_base.py +45 -21
- scrapling/engines/_browsers/_camoufox.py +98 -43
- scrapling/engines/_browsers/_config_tools.py +1 -1
- scrapling/engines/_browsers/_controllers.py +34 -13
- scrapling/engines/_browsers/_validators.py +31 -10
- scrapling/engines/constants.py +0 -15
- scrapling/engines/static.py +749 -336
- scrapling/engines/toolbelt/convertor.py +13 -15
- scrapling/engines/toolbelt/custom.py +6 -9
- scrapling/engines/toolbelt/fingerprints.py +17 -10
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +46 -0
- scrapling/fetchers/chrome.py +210 -0
- scrapling/fetchers/firefox.py +212 -0
- scrapling/fetchers/requests.py +28 -0
- scrapling/parser.py +109 -84
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/METADATA +17 -16
- scrapling-0.3.7.dist-info/RECORD +47 -0
- scrapling/fetchers.py +0 -444
- scrapling-0.3.5.dist-info/RECORD +0 -44
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -1,28 +1,38 @@
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.3.5"
+__version__ = "0.3.7"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
+from typing import Any, TYPE_CHECKING
 
-[old lines 6-21: sixteen removed lines whose content was not captured in this view]
+if TYPE_CHECKING:
+    from scrapling.parser import Selector, Selectors
+    from scrapling.core.custom_types import AttributesHandler, TextHandler
+    from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
+
+
+# Lazy import mapping
+_LAZY_IMPORTS = {
+    "Fetcher": ("scrapling.fetchers", "Fetcher"),
+    "Selector": ("scrapling.parser", "Selector"),
+    "Selectors": ("scrapling.parser", "Selectors"),
+    "AttributesHandler": ("scrapling.core.custom_types", "AttributesHandler"),
+    "TextHandler": ("scrapling.core.custom_types", "TextHandler"),
+    "AsyncFetcher": ("scrapling.fetchers", "AsyncFetcher"),
+    "StealthyFetcher": ("scrapling.fetchers", "StealthyFetcher"),
+    "DynamicFetcher": ("scrapling.fetchers", "DynamicFetcher"),
+}
+__all__ = ["Selector", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
+
+
+def __getattr__(name: str) -> Any:
+    if name in _LAZY_IMPORTS:
+        module_path, class_name = _LAZY_IMPORTS[name]
         module = __import__(module_path, fromlist=[class_name])
         return getattr(module, class_name)
     else:
-        raise AttributeError(f"module
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 
 
-
+def __dir__() -> list[str]:
+    """Support for dir() and autocomplete."""
+    return sorted(__all__ + ["fetchers", "parser", "cli", "core", "__author__", "__version__", "__copyright__"])
scrapling/cli.py
CHANGED
@@ -2,8 +2,9 @@ from pathlib import Path
 from subprocess import check_output
 from sys import executable as python_executable
 
+from scrapling.core.utils import log
 from scrapling.engines.toolbelt.custom import Response
-from scrapling.core.utils import
+from scrapling.core.utils._shell import _CookieParser, _ParseHeaders
 from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
 
 from orjson import loads as json_loads, JSONDecodeError
@@ -135,10 +136,26 @@ def install(force):  # pragma: no cover
 
 
 @command(help="Run Scrapling's MCP server (Check the docs for more info).")
-
+@option(
+    "--http",
+    is_flag=True,
+    default=False,
+    help="Whether to run the MCP server in streamable-http transport or leave it as stdio (Default: False)",
+)
+@option(
+    "--host",
+    type=str,
+    default="0.0.0.0",
+    help="The host to use if streamable-http transport is enabled (Default: '0.0.0.0')",
+)
+@option(
+    "--port", type=int, default=8000, help="The port to use if streamable-http transport is enabled (Default: 8000)"
+)
+def mcp(http, host, port):
     from scrapling.core.ai import ScraplingMCPServer
 
-    ScraplingMCPServer()
+    server = ScraplingMCPServer()
+    server.serve(http, host, port)
 
 
 @command(help="Interactive scraping console")
@@ -766,7 +783,7 @@ def stealthy_fetch(
     :param disable_resources: Drop requests of unnecessary resources for a speed boost.
     :param block_webrtc: Blocks WebRTC entirely.
     :param humanize: Humanize the cursor movement.
-    :param solve_cloudflare: Solves all
+    :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
     :param allow_webgl: Allow WebGL (recommended to keep enabled).
     :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
     :param disable_ads: Install the uBlock Origin addon on the browser.
scrapling/core/_types.py
CHANGED
@@ -12,9 +12,11 @@ from typing import (
     Generator,
     Iterable,
     List,
+    Set,
     Literal,
     Optional,
     Pattern,
+    Sequence,
     Tuple,
     TypeVar,
     Union,
@@ -22,6 +24,7 @@ from typing import (
     Mapping,
     Awaitable,
     Protocol,
+    Coroutine,
     SupportsIndex,
 )
 
@@ -39,6 +42,4 @@ except ImportError:  # pragma: no cover
     try:
         from typing_extensions import Self  # Backport
     except ImportError:
-        from typing import TypeVar
-
         Self = object
scrapling/core/ai.py
CHANGED
@@ -20,6 +20,7 @@ from scrapling.core._types import (
     Mapping,
     Dict,
     List,
+    Any,
     SelectorWaitStates,
     Generator,
 )
@@ -42,10 +43,7 @@ def _ContentTranslator(content: Generator[str, None, None], page: _ScraplingResp
 
 
 class ScraplingMCPServer:
-    _server = FastMCP(name="Scrapling")
-
     @staticmethod
-    @_server.tool()
     def get(
         url: str,
         impersonate: Optional[BrowserTypeLiteral] = "chrome",
@@ -124,7 +122,6 @@ class ScraplingMCPServer:
         )
 
     @staticmethod
-    @_server.tool()
     async def bulk_get(
         urls: Tuple[str, ...],
         impersonate: Optional[BrowserTypeLiteral] = "chrome",
@@ -175,7 +172,7 @@ class ScraplingMCPServer:
         :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
         """
         async with FetcherSession() as session:
-            tasks = [
+            tasks: List[Any] = [
                 session.get(
                     url,
                     auth=auth,
@@ -211,7 +208,6 @@ class ScraplingMCPServer:
             ]
 
     @staticmethod
-    @_server.tool()
     async def fetch(
         url: str,
         extraction_type: extraction_types = "markdown",
@@ -263,7 +259,7 @@ class ScraplingMCPServer:
         :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
-        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
@@ -300,7 +296,6 @@ class ScraplingMCPServer:
         )
 
     @staticmethod
-    @_server.tool()
     async def bulk_fetch(
         urls: Tuple[str, ...],
         extraction_type: extraction_types = "markdown",
@@ -352,7 +347,7 @@ class ScraplingMCPServer:
         :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
-        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
@@ -394,7 +389,6 @@ class ScraplingMCPServer:
         ]
 
     @staticmethod
-    @_server.tool()
     async def stealthy_fetch(
         url: str,
         extraction_type: extraction_types = "markdown",
@@ -443,7 +437,7 @@ class ScraplingMCPServer:
         :param cookies: Set cookies for the next request.
         :param addons: List of Firefox addons to use. Must be paths to extracted addons.
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
-        :param solve_cloudflare: Solves all
+        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
         :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
@@ -494,7 +488,6 @@ class ScraplingMCPServer:
         )
 
     @staticmethod
-    @_server.tool()
     async def bulk_stealthy_fetch(
         urls: Tuple[str, ...],
         extraction_type: extraction_types = "markdown",
@@ -543,7 +536,7 @@ class ScraplingMCPServer:
         :param cookies: Set cookies for the next request.
         :param addons: List of Firefox addons to use. Must be paths to extracted addons.
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
-        :param solve_cloudflare: Solves all
+        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
         :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
@@ -598,6 +591,22 @@ class ScraplingMCPServer:
             for page in responses
         ]
 
-    def serve(self):
+    def serve(self, http: bool, host: str, port: int):
         """Serve the MCP server."""
-
+        server = FastMCP(name="Scrapling", host=host, port=port)
+        server.add_tool(self.get, title="get", description=self.get.__doc__, structured_output=True)
+        server.add_tool(self.bulk_get, title="bulk_get", description=self.bulk_get.__doc__, structured_output=True)
+        server.add_tool(self.fetch, title="fetch", description=self.fetch.__doc__, structured_output=True)
+        server.add_tool(
+            self.bulk_fetch, title="bulk_fetch", description=self.bulk_fetch.__doc__, structured_output=True
+        )
+        server.add_tool(
+            self.stealthy_fetch, title="stealthy_fetch", description=self.stealthy_fetch.__doc__, structured_output=True
+        )
+        server.add_tool(
+            self.bulk_stealthy_fetch,
+            title="bulk_stealthy_fetch",
+            description=self.bulk_stealthy_fetch.__doc__,
+            structured_output=True,
+        )
+        server.run(transport="stdio" if not http else "streamable-http")
scrapling/core/custom_types.py
CHANGED
@@ -5,6 +5,7 @@ from re import compile as re_compile, UNICODE, IGNORECASE
 from orjson import dumps, loads
 
 from scrapling.core._types import (
+    Any,
     cast,
     Dict,
     List,
@@ -14,7 +15,6 @@ from scrapling.core._types import (
     Literal,
     Pattern,
     Iterable,
-    Optional,
     Generator,
     SupportsIndex,
 )
@@ -33,23 +33,20 @@ class TextHandler(str):
 
     def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler":  # pragma: no cover
         lst = super().__getitem__(key)
-        return
+        return TextHandler(lst)
 
-    def split(
-
-
-
-            [TextHandler(s) for s in super().split(sep, maxsplit)],
-        )
-    )
+    def split(
+        self, sep: str | None = None, maxsplit: SupportsIndex = -1
+    ) -> Union[List, "TextHandlers"]:  # pragma: no cover
+        return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])
 
-    def strip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().strip(chars))
 
-    def lstrip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def lstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().lstrip(chars))
 
-    def rstrip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def rstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().rstrip(chars))
 
     def capitalize(self) -> Union[str, "TextHandler"]:  # pragma: no cover
@@ -64,7 +61,7 @@ class TextHandler(str):
     def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().expandtabs(tabsize))
 
-    def format(self, *args:
+    def format(self, *args: object, **kwargs: str) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().format(*args, **kwargs))
 
     def format_map(self, mapping) -> Union[str, "TextHandler"]:  # pragma: no cover
@@ -131,10 +128,11 @@ class TextHandler(str):
     def re(
         self,
         regex: str | Pattern,
-        check_match: Literal[True],
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
+        *,
+        check_match: Literal[True],
     ) -> bool: ...
 
     @overload
@@ -179,19 +177,14 @@ class TextHandler(str):
         results = flatten(results)
 
         if not replace_entities:
-            return TextHandlers(
+            return TextHandlers([TextHandler(string) for string in results])
 
-        return TextHandlers(
-            cast(
-                List[_TextHandlerType],
-                [TextHandler(_replace_entities(s)) for s in results],
-            )
-        )
+        return TextHandlers([TextHandler(_replace_entities(s)) for s in results])
 
     def re_first(
         self,
         regex: str | Pattern,
-        default=None,
+        default: Any = None,
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
@@ -232,8 +225,8 @@ class TextHandlers(List[TextHandler]):
     def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, "TextHandlers"]:
         lst = super().__getitem__(pos)
         if isinstance(pos, slice):
-            return TextHandlers(cast(List[
-        return cast(
+            return TextHandlers(cast(List[TextHandler], lst))
+        return TextHandler(cast(TextHandler, lst))
 
     def re(
         self,
@@ -256,7 +249,7 @@ class TextHandlers(List[TextHandler]):
     def re_first(
         self,
         regex: str | Pattern,
-        default=None,
+        default: Any = None,
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
@@ -309,9 +302,9 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
         )
 
         # Fastest read-only mapping type
-        self._data = MappingProxyType(mapping)
+        self._data: Mapping[str, Any] = MappingProxyType(mapping)
 
-    def get(self, key: str, default:
+    def get(self, key: str, default: Any = None) -> _TextHandlerType:
         """Acts like the standard dictionary `.get()` method"""
         return self._data.get(key, default)
 
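The pattern behind most of these `custom_types.py` edits: `str` built-ins return plain `str`, so a subclass like `TextHandler` must re-wrap every result for its extra API to survive chained calls. A toy illustration:

class Upper(str):
    """Toy str subclass mirroring TextHandler's re-wrapping pattern."""

    def shout(self) -> str:
        return self.upper() + "!"

    def strip(self, chars: str | None = None) -> "Upper":
        # Without this override, strip() would return a plain str
        # and the chained .shout() below would fail.
        return Upper(super().strip(chars))


print(Upper("  hi  ").strip().shout())  # prints "HI!"
# By contrast, plain str.strip() drops the subclass entirely.

The `re()` overload change follows the same typing-hygiene theme: moving `check_match: Literal[True]` behind `*` makes it keyword-only, so a parameter without a default can legally follow the defaulted ones.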
scrapling/core/mixins.py
CHANGED
@@ -1,3 +1,9 @@
+from scrapling.core._types import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from scrapling.parser import Selector
+
+
 class SelectorsGeneration:
     """
     Functions for generating selectors
@@ -5,7 +11,7 @@ class SelectorsGeneration:
     Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
     """
 
-    def
+    def _general_selection(self: "Selector", selection: str = "css", full_path: bool = False) -> str:  # type: ignore[name-defined]
         """Generate a selector for the current element.
         :return: A string of the generated selector.
         """
@@ -47,29 +53,29 @@ class SelectorsGeneration:
         return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
 
     @property
-    def generate_css_selector(self) -> str:
+    def generate_css_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a CSS selector for the current element
         :return: A string of the generated selector.
         """
-        return self.
+        return self._general_selection()
 
     @property
-    def generate_full_css_selector(self) -> str:
+    def generate_full_css_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a complete CSS selector for the current element
         :return: A string of the generated selector.
         """
-        return self.
+        return self._general_selection(full_path=True)
 
     @property
-    def generate_xpath_selector(self) -> str:
+    def generate_xpath_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate an XPath selector for the current element
         :return: A string of the generated selector.
         """
-        return self.
+        return self._general_selection("xpath")
 
     @property
-    def generate_full_xpath_selector(self) -> str:
+    def generate_full_xpath_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a complete XPath selector for the current element
         :return: A string of the generated selector.
         """
-        return self.
+        return self._general_selection("xpath", full_path=True)
scrapling/core/shell.py
CHANGED
@@ -22,14 +22,16 @@ from logging import (
 from orjson import loads as json_loads, JSONDecodeError
 
 from scrapling import __version__
+from scrapling.core.utils import log
 from scrapling.parser import Selector, Selectors
 from scrapling.core.custom_types import TextHandler
 from scrapling.engines.toolbelt.custom import Response
-from scrapling.core.utils import
+from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
 from scrapling.core._types import (
     Optional,
     Dict,
     Any,
+    cast,
     extraction_types,
     Generator,
 )
@@ -539,15 +541,15 @@ class Convertor:
             raise ValueError(f"Unknown extraction type: {extraction_type}")
         else:
             if main_content_only:
-                page = page.css_first("body") or page
+                page = cast(Selector, page.css_first("body")) or page
 
-            pages = [page] if not css_selector else page.css(css_selector)
+            pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
             for page in pages:
                 match extraction_type:
                     case "markdown":
                         yield cls._convert_to_markdown(page.html_content)
                     case "html":
-                        yield page.
+                        yield page.html_content
                     case "text":
                         txt_content = page.get_all_text(strip=True)
                         for s in (
scrapling/core/storage.py
CHANGED
@@ -6,7 +6,6 @@ from sqlite3 import connect as db_connect
 
 from orjson import dumps, loads
 from lxml.html import HtmlElement
-from tldextract import extract as tld
 
 from scrapling.core.utils import _StorageTools, log
 from scrapling.core._types import Dict, Optional, Any
@@ -26,6 +25,8 @@ class StorageSystemMixin(ABC):  # pragma: no cover
             return default_value
 
         try:
+            from tldextract import extract as tld
+
             extracted = tld(self.url)
             return extracted.top_domain_under_public_suffix or extracted.domain or default_value
         except AttributeError:
@@ -55,13 +56,13 @@ class StorageSystemMixin(ABC):  # pragma: no cover
     @lru_cache(128, typed=True)
     def _get_hash(identifier: str) -> str:
         """If you want to hash identifier in your storage system, use this safer"""
-
-        if isinstance(
+        _identifier = identifier.lower().strip()
+        if isinstance(_identifier, str):
             # Hash functions have to take bytes
-
+            _identifier = _identifier.encode("utf-8")
 
-        hash_value = sha256(
-        return f"{hash_value}_{len(
+        hash_value = sha256(_identifier).hexdigest()
+        return f"{hash_value}_{len(_identifier)}"  # Length to reduce collision chance
 
 
 @lru_cache(1, typed=True)
scrapling/core/translator.py
CHANGED
@@ -10,24 +10,23 @@ So you don't have to learn a new selectors/api method like what bs4 done with so
 
 from functools import lru_cache
 
-from cssselect.xpath import ExpressionError
-from cssselect.xpath import XPathExpr as OriginalXPathExpr
 from cssselect import HTMLTranslator as OriginalHTMLTranslator
+from cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr
 from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
 
-from scrapling.core._types import Any,
+from scrapling.core._types import Any, Protocol, Self
 
 
 class XPathExpr(OriginalXPathExpr):
     textnode: bool = False
-    attribute:
+    attribute: str | None = None
 
     @classmethod
     def from_xpath(
         cls,
         xpath: OriginalXPathExpr,
         textnode: bool = False,
-        attribute:
+        attribute: str | None = None,
     ) -> Self:
         x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
         x.textnode = textnode
@@ -71,10 +70,10 @@ class XPathExpr(OriginalXPathExpr):
 
 # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
 class TranslatorProtocol(Protocol):
-    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pragma: no cover
+    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pyright: ignore  # pragma: no cover
         pass
 
-    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pragma: no cover
+    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pyright: ignore  # pragma: no cover
         pass
 
 
@@ -121,9 +120,15 @@ class TranslatorMixin:
 
 
 class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-    @lru_cache(maxsize=256)
     def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
         return super().css_to_xpath(css, prefix)
 
 
 translator = HTMLTranslator()
+# Using a function instead of the translator directly to avoid Pyright override error
+
+
+@lru_cache(maxsize=256)
+def css_to_xpath(query: str) -> str:
+    """Return translated XPath version of a given CSS query"""
+    return translator.css_to_xpath(query)
scrapling/core/utils/__init__.py
CHANGED