scrapling 0.3.6__tar.gz → 0.3.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scrapling-0.3.6/scrapling.egg-info → scrapling-0.3.8}/PKG-INFO +7 -6
- {scrapling-0.3.6 → scrapling-0.3.8}/README.md +3 -3
- {scrapling-0.3.6 → scrapling-0.3.8}/pyproject.toml +5 -4
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/__init__.py +1 -1
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/_types.py +3 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/ai.py +2 -1
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/custom_types.py +20 -27
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/mixins.py +15 -9
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/shell.py +4 -3
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/storage.py +5 -5
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/translator.py +13 -8
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/_browsers/_base.py +175 -21
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/_browsers/_camoufox.py +95 -171
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/_browsers/_config_tools.py +9 -3
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/_browsers/_controllers.py +51 -101
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/_browsers/_validators.py +95 -63
- scrapling-0.3.8/scrapling/engines/static.py +1074 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/toolbelt/convertor.py +48 -15
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/toolbelt/custom.py +6 -21
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/toolbelt/fingerprints.py +14 -9
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/toolbelt/navigation.py +11 -3
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/fetchers/__init__.py +11 -1
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/fetchers/chrome.py +15 -4
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/fetchers/firefox.py +0 -4
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/parser.py +105 -80
- {scrapling-0.3.6 → scrapling-0.3.8/scrapling.egg-info}/PKG-INFO +7 -6
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling.egg-info/requires.txt +2 -2
- {scrapling-0.3.6 → scrapling-0.3.8}/setup.cfg +1 -1
- scrapling-0.3.6/scrapling/engines/static.py +0 -1064
- {scrapling-0.3.6 → scrapling-0.3.8}/LICENSE +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/MANIFEST.in +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/cli.py +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/__init__.py +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/_html_utils.py +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/utils/__init__.py +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/utils/_shell.py +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/utils/_utils.py +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/_browsers/__init__.py +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/_browsers/_page.py +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/constants.py +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/fetchers/requests.py +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling/py.typed +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.3.6 → scrapling-0.3.8}/scrapling.egg-info/top_level.txt +0 -0
{scrapling-0.3.6/scrapling.egg-info → scrapling-0.3.8}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapling
-Version: 0.3.6
+Version: 0.3.8
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -36,6 +36,7 @@ License: BSD 3-Clause License
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 Project-URL: Homepage, https://github.com/D4Vinci/Scrapling
+Project-URL: Changelog, https://github.com/D4Vinci/Scrapling/releases
 Project-URL: Documentation, https://scrapling.readthedocs.io/en/latest/
 Project-URL: Repository, https://github.com/D4Vinci/Scrapling
 Project-URL: Bug Tracker, https://github.com/D4Vinci/Scrapling/issues
@@ -66,7 +67,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: lxml>=6.0.2
 Requires-Dist: cssselect>=1.3.0
-Requires-Dist: orjson>=3.11.
+Requires-Dist: orjson>=3.11.4
 Requires-Dist: tldextract>=5.3.0
 Provides-Extra: fetchers
 Requires-Dist: click>=8.3.0; extra == "fetchers"
@@ -77,7 +78,7 @@ Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
 Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
 Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
 Provides-Extra: ai
-Requires-Dist: mcp>=1.
+Requires-Dist: mcp>=1.19.0; extra == "ai"
 Requires-Dist: markdownify>=1.2.0; extra == "ai"
 Requires-Dist: scrapling[fetchers]; extra == "ai"
 Provides-Extra: shell
@@ -157,12 +158,12 @@ Built for the modern Web, Scrapling features its own rapid parsing engine and fe
 
 <!-- sponsors -->
 
-<a href="https://www.
+<a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
 <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+<a href="https://app.cyberyozh.com/?utm_source=github&utm_medium=scrapling" target="_blank" title="We have gathered the best solutions for multi‑accounting and automation in one place."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/cyberyozh.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
-<a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
 <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
 <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 
@@ -389,7 +390,7 @@ Starting with v0.3.2, this installation only includes the parser engine and its
 ### Docker
 You can also install a Docker image with all extras and browsers with the following command:
 ```bash
-docker pull scrapling
+docker pull pyd4vinci/scrapling
 ```
 This image is automatically built and pushed to Docker Hub through GitHub actions right here.
 
{scrapling-0.3.6 → scrapling-0.3.8}/README.md

@@ -67,12 +67,12 @@ Built for the modern Web, Scrapling features its own rapid parsing engine and fe
 
 <!-- sponsors -->
 
-<a href="https://www.
+<a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
 <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+<a href="https://app.cyberyozh.com/?utm_source=github&utm_medium=scrapling" target="_blank" title="We have gathered the best solutions for multi‑accounting and automation in one place."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/cyberyozh.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
-<a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
 <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
 <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 
@@ -299,7 +299,7 @@ Starting with v0.3.2, this installation only includes the parser engine and its
 ### Docker
 You can also install a Docker image with all extras and browsers with the following command:
 ```bash
-docker pull scrapling
+docker pull pyd4vinci/scrapling
 ```
 This image is automatically built and pushed to Docker Hub through GitHub actions right here.
 
{scrapling-0.3.6 → scrapling-0.3.8}/pyproject.toml

@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "scrapling"
-# Static version instead of dynamic version so we can get better layer caching while building docker, check the docker file to understand
-version = "0.3.6"
+# Static version instead of a dynamic version so we can get better layer caching while building docker, check the docker file to understand
+version = "0.3.8"
 description = "Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!"
 readme = {file = "README.md", content-type = "text/markdown"}
 license = {file = "LICENSE"}
@@ -59,7 +59,7 @@ classifiers = [
 dependencies = [
     "lxml>=6.0.2",
     "cssselect>=1.3.0",
-    "orjson>=3.11.
+    "orjson>=3.11.4",
     "tldextract>=5.3.0",
 ]
 
@@ -74,7 +74,7 @@ fetchers = [
     "msgspec>=0.19.0",
 ]
 ai = [
-    "mcp>=1.
+    "mcp>=1.19.0",
     "markdownify>=1.2.0",
     "scrapling[fetchers]",
 ]
@@ -89,6 +89,7 @@ all = [
 
 [project.urls]
 Homepage = "https://github.com/D4Vinci/Scrapling"
+Changelog = "https://github.com/D4Vinci/Scrapling/releases"
 Documentation = "https://scrapling.readthedocs.io/en/latest/"
 Repository = "https://github.com/D4Vinci/Scrapling"
 "Bug Tracker" = "https://github.com/D4Vinci/Scrapling/issues"
 
{scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/_types.py

@@ -12,9 +12,11 @@ from typing import (
     Generator,
     Iterable,
     List,
+    Set,
     Literal,
     Optional,
     Pattern,
+    Sequence,
     Tuple,
     TypeVar,
     Union,
@@ -22,6 +24,7 @@ from typing import (
     Mapping,
     Awaitable,
     Protocol,
+    Coroutine,
     SupportsIndex,
 )
 
{scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/ai.py

@@ -20,6 +20,7 @@ from scrapling.core._types import (
     Mapping,
     Dict,
     List,
+    Any,
     SelectorWaitStates,
     Generator,
 )
@@ -171,7 +172,7 @@ class ScraplingMCPServer:
         :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
         """
         async with FetcherSession() as session:
-            tasks = [
+            tasks: List[Any] = [
                 session.get(
                     url,
                     auth=auth,
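Note on the `tasks: List[Any]` annotation above: the list holds un-awaited coroutine objects, whose exact generic type is noisy to spell out, so `Any` keeps the checker quiet. A minimal, self-contained sketch of the same fan-out pattern (`fetch`/`fetch_all` are illustrative stand-ins, not Scrapling's API):

```python
import asyncio
from typing import Any, List


async def fetch(url: str) -> str:
    # Stand-in for session.get(url, ...); real code would do network I/O.
    await asyncio.sleep(0)
    return f"response for {url}"


async def fetch_all(urls: List[str]) -> List[str]:
    # Each call only creates a coroutine object; nothing runs until awaited.
    tasks: List[Any] = [fetch(url) for url in urls]
    # gather() runs them concurrently and preserves input order.
    return await asyncio.gather(*tasks)


print(asyncio.run(fetch_all(["https://a.example", "https://b.example"])))
```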
{scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/custom_types.py

@@ -5,6 +5,7 @@ from re import compile as re_compile, UNICODE, IGNORECASE
 from orjson import dumps, loads
 
 from scrapling.core._types import (
+    Any,
     cast,
     Dict,
     List,
@@ -14,7 +15,6 @@ from scrapling.core._types import (
     Literal,
     Pattern,
     Iterable,
-    Optional,
     Generator,
     SupportsIndex,
 )
@@ -33,23 +33,20 @@ class TextHandler(str):
 
     def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler":  # pragma: no cover
         lst = super().__getitem__(key)
-        return
+        return TextHandler(lst)
 
-    def split(
-
-
-
-            [TextHandler(s) for s in super().split(sep, maxsplit)],
-        )
-    )
+    def split(
+        self, sep: str | None = None, maxsplit: SupportsIndex = -1
+    ) -> Union[List, "TextHandlers"]:  # pragma: no cover
+        return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])
 
-    def strip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().strip(chars))
 
-    def lstrip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def lstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().lstrip(chars))
 
-    def rstrip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def rstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().rstrip(chars))
 
     def capitalize(self) -> Union[str, "TextHandler"]:  # pragma: no cover
 
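The common thread in this hunk: `str` methods always return plain `str`, so a `str` subclass has to re-wrap every result to keep its own type. A minimal sketch of the pattern, independent of Scrapling's actual `TextHandler`:

```python
class RichText(str):
    """A str subclass whose string-returning methods preserve the subclass type."""

    def strip(self, chars: str | None = None) -> "RichText":
        # str.strip() returns a plain str, so re-wrap it to stay a RichText.
        return RichText(super().strip(chars))

    def split(self, sep: str | None = None, maxsplit: int = -1) -> list["RichText"]:
        # Same idea for methods that return containers of strings.
        return [RichText(part) for part in super().split(sep, maxsplit)]


text = RichText("  hello world  ")
assert isinstance(text.strip(), RichText)
assert all(isinstance(part, RichText) for part in text.strip().split())
```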
@@ -64,7 +61,7 @@ class TextHandler(str):
     def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().expandtabs(tabsize))
 
-    def format(self, *args:
+    def format(self, *args: object, **kwargs: str) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().format(*args, **kwargs))
 
     def format_map(self, mapping) -> Union[str, "TextHandler"]:  # pragma: no cover
@@ -131,10 +128,11 @@ class TextHandler(str):
     def re(
         self,
         regex: str | Pattern,
-        check_match: Literal[True],
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
+        *,
+        check_match: Literal[True],
     ) -> bool: ...
 
     @overload
 
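Moving `check_match: Literal[True]` behind a bare `*` makes it keyword-only, so the type checker can pick the `-> bool` overload purely from the keyword argument. A small sketch of that overload technique (the `search` function is hypothetical, not the real `re()` signature):

```python
from typing import Literal, overload


@overload
def search(pattern: str, *, check_match: Literal[True]) -> bool: ...
@overload
def search(pattern: str, *, check_match: Literal[False] = False) -> list[str]: ...


def search(pattern: str, *, check_match: bool = False) -> bool | list[str]:
    matches = [pattern]  # stand-in for real regex matching
    return bool(matches) if check_match else matches


flag = search("a+", check_match=True)  # checker resolves this to the bool overload
found = search("a+")                   # and this one to list[str]
```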
@@ -179,19 +177,14 @@ class TextHandler(str):
         results = flatten(results)
 
         if not replace_entities:
-            return TextHandlers(
+            return TextHandlers([TextHandler(string) for string in results])
 
-        return TextHandlers(
-            cast(
-                List[_TextHandlerType],
-                [TextHandler(_replace_entities(s)) for s in results],
-            )
-        )
+        return TextHandlers([TextHandler(_replace_entities(s)) for s in results])
 
     def re_first(
         self,
         regex: str | Pattern,
-        default=None,
+        default: Any = None,
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
@@ -232,8 +225,8 @@ class TextHandlers(List[TextHandler]):
     def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, "TextHandlers"]:
         lst = super().__getitem__(pos)
         if isinstance(pos, slice):
-            return TextHandlers(cast(List[
-        return cast(
+            return TextHandlers(cast(List[TextHandler], lst))
+        return TextHandler(cast(TextHandler, lst))
 
     def re(
         self,
@@ -256,7 +249,7 @@ class TextHandlers(List[TextHandler]):
     def re_first(
         self,
         regex: str | Pattern,
-        default=None,
+        default: Any = None,
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
 
@@ -309,9 +302,9 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
         )
 
         # Fastest read-only mapping type
-        self._data = MappingProxyType(mapping)
+        self._data: Mapping[str, Any] = MappingProxyType(mapping)
 
-    def get(self, key: str, default:
+    def get(self, key: str, default: Any = None) -> _TextHandlerType:
         """Acts like the standard dictionary `.get()` method"""
         return self._data.get(key, default)
 
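`MappingProxyType` is a read-only live view over a dict, which is what the "Fastest read-only mapping type" comment refers to. A quick illustration of its behavior:

```python
from types import MappingProxyType

attrs = {"class": "price", "id": "total"}
readonly = MappingProxyType(attrs)

print(readonly["class"])        # reads work like a normal dict -> "price"
print(readonly.get("missing"))  # .get() works too -> None

try:
    readonly["class"] = "other"  # writes are rejected
except TypeError as error:
    print(error)

attrs["data-x"] = "1"      # the proxy is a live view of the underlying dict
print(readonly["data-x"])  # -> "1"
```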
{scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/mixins.py

@@ -1,3 +1,9 @@
+from scrapling.core._types import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from scrapling.parser import Selector
+
+
 class SelectorsGeneration:
     """
     Functions for generating selectors
@@ -5,7 +11,7 @@ class SelectorsGeneration:
     Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
     """
 
-    def
+    def _general_selection(self: "Selector", selection: str = "css", full_path: bool = False) -> str:  # type: ignore[name-defined]
         """Generate a selector for the current element.
         :return: A string of the generated selector.
         """
@@ -47,29 +53,29 @@ class SelectorsGeneration:
         return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
 
     @property
-    def generate_css_selector(self) -> str:
+    def generate_css_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a CSS selector for the current element
         :return: A string of the generated selector.
         """
-        return self.
+        return self._general_selection()
 
     @property
-    def generate_full_css_selector(self) -> str:
+    def generate_full_css_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a complete CSS selector for the current element
         :return: A string of the generated selector.
         """
-        return self.
+        return self._general_selection(full_path=True)
 
     @property
-    def generate_xpath_selector(self) -> str:
+    def generate_xpath_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate an XPath selector for the current element
         :return: A string of the generated selector.
         """
-        return self.
+        return self._general_selection("xpath")
 
     @property
-    def generate_full_xpath_selector(self) -> str:
+    def generate_full_xpath_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a complete XPath selector for the current element
         :return: A string of the generated selector.
         """
-        return self.
+        return self._general_selection("xpath", full_path=True)
 
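Annotating `self` with a forward-referenced `"Selector"` under `TYPE_CHECKING` lets the mixin's methods type-check against the host class without a circular import at runtime. A self-contained sketch of the technique (the module and `Node` class are hypothetical):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by the type checker, never executed at runtime,
    # so there is no circular-import problem.
    from mypackage.parser import Node  # hypothetical host class


class SelectorMixin:
    def describe(self: "Node") -> str:
        # The checker now knows `self` has Node's attributes (e.g. .tag),
        # even though this mixin never inherits from Node.
        return f"<{self.tag}>"
```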
{scrapling-0.3.6 → scrapling-0.3.8}/scrapling/engines/toolbelt/convertor.py

@@ -31,6 +31,7 @@ from scrapling.core._types import (
     Optional,
     Dict,
     Any,
+    cast,
     extraction_types,
     Generator,
 )
@@ -540,15 +541,15 @@ class Convertor:
             raise ValueError(f"Unknown extraction type: {extraction_type}")
         else:
             if main_content_only:
-                page = page.css_first("body") or page
+                page = cast(Selector, page.css_first("body")) or page
 
-            pages = [page] if not css_selector else page.css(css_selector)
+            pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
             for page in pages:
                 match extraction_type:
                     case "markdown":
                         yield cls._convert_to_markdown(page.html_content)
                     case "html":
-                        yield page.
+                        yield page.html_content
                     case "text":
                         txt_content = page.get_all_text(strip=True)
                         for s in (
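`cast()` has no runtime effect; it only tells the type checker to treat a value as a narrower type, which is what the hunk uses it for when `css_first()` is declared as possibly returning `None`. A tiny illustration:

```python
from typing import Optional, cast


def first_match(items: list[str], prefix: str) -> Optional[str]:
    return next((item for item in items if item.startswith(prefix)), None)


tags = ["body", "div"]
# We know "body" is present, so narrow Optional[str] to str for the checker.
body = cast(str, first_match(tags, "bo"))
print(body.upper())  # no "possibly None" warning; prints BODY
```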
{scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/storage.py

@@ -56,13 +56,13 @@ class StorageSystemMixin(ABC):  # pragma: no cover
     @lru_cache(128, typed=True)
     def _get_hash(identifier: str) -> str:
         """If you want to hash identifier in your storage system, use this safer"""
-
-        if isinstance(
+        _identifier = identifier.lower().strip()
+        if isinstance(_identifier, str):
             # Hash functions have to take bytes
-
+            _identifier = _identifier.encode("utf-8")
 
-        hash_value = sha256(
-        return f"{hash_value}_{len(
+        hash_value = sha256(_identifier).hexdigest()
+        return f"{hash_value}_{len(_identifier)}"  # Length to reduce collision chance
 
 
     @lru_cache(1, typed=True)
 
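The rewritten `_get_hash` normalizes the identifier, hashes its UTF-8 bytes with SHA-256, and appends the byte length as a cheap extra discriminator against collisions. A standalone sketch of the same recipe:

```python
from hashlib import sha256


def get_hash(identifier: str) -> str:
    # Normalize so "Example.com " and "example.com" hash identically.
    data = identifier.lower().strip().encode("utf-8")
    # hexdigest plus the byte length, mirroring the hash_value + len() pair above.
    return f"{sha256(data).hexdigest()}_{len(data)}"


assert get_hash("Example.com ") == get_hash("example.com")
```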
{scrapling-0.3.6 → scrapling-0.3.8}/scrapling/core/translator.py

@@ -10,24 +10,23 @@ So you don't have to learn a new selectors/api method like what bs4 done with so
 
 from functools import lru_cache
 
-from cssselect.xpath import ExpressionError
-from cssselect.xpath import XPathExpr as OriginalXPathExpr
 from cssselect import HTMLTranslator as OriginalHTMLTranslator
+from cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr
 from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
 
-from scrapling.core._types import Any,
+from scrapling.core._types import Any, Protocol, Self
 
 
 class XPathExpr(OriginalXPathExpr):
     textnode: bool = False
-    attribute:
+    attribute: str | None = None
 
     @classmethod
     def from_xpath(
         cls,
         xpath: OriginalXPathExpr,
         textnode: bool = False,
-        attribute:
+        attribute: str | None = None,
     ) -> Self:
         x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
         x.textnode = textnode
@@ -71,10 +70,10 @@ class XPathExpr(OriginalXPathExpr):
 
 # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
 class TranslatorProtocol(Protocol):
-    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pragma: no cover
+    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pyright: ignore  # pragma: no cover
         pass
 
-    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pragma: no cover
+    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pyright: ignore  # pragma: no cover
         pass
 
@@ -121,9 +120,15 @@ class TranslatorMixin:
 
 
 class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-    @lru_cache(maxsize=256)
     def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
         return super().css_to_xpath(css, prefix)
 
 
 translator = HTMLTranslator()
+# Using a function instead of the translator directly to avoid Pyright override error
+
+
+@lru_cache(maxsize=256)
+def css_to_xpath(query: str) -> str:
+    """Return translated XPath version of a given CSS query"""
+    return translator.css_to_xpath(query)
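The cache moves off the method and onto a module-level function: `lru_cache` on an instance method keeps `self` alive in cache entries, and, as the new comment notes, decorating the override also trips Pyright. A sketch of the resulting pattern (the `Translator` class is an illustrative stand-in for cssselect's):

```python
from functools import lru_cache


class Translator:
    def css_to_xpath(self, css: str) -> str:
        # Stand-in for the real CSS-to-XPath translation.
        return f"descendant-or-self::{css}"


translator = Translator()


@lru_cache(maxsize=256)
def css_to_xpath(query: str) -> str:
    # Repeated queries hit the cache instead of re-translating.
    return translator.css_to_xpath(query)


print(css_to_xpath("div.item"))
print(css_to_xpath.cache_info())  # cache hits/misses are inspectable
```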