scrapling 0.3__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/cli.py +38 -51
- scrapling/core/_html_utils.py +3 -9
- scrapling/core/ai.py +5 -13
- scrapling/core/custom_types.py +19 -61
- scrapling/core/mixins.py +6 -28
- scrapling/core/shell.py +51 -129
- scrapling/core/storage.py +2 -8
- scrapling/core/translator.py +8 -20
- scrapling/core/utils/__init__.py +10 -0
- scrapling/core/utils/_shell.py +48 -0
- scrapling/core/{utils.py → utils/_utils.py} +5 -21
- scrapling/engines/__init__.py +0 -16
- scrapling/engines/_browsers/_base.py +297 -0
- scrapling/engines/_browsers/_camoufox.py +238 -293
- scrapling/engines/_browsers/_config_tools.py +2 -1
- scrapling/engines/_browsers/_controllers.py +220 -278
- scrapling/engines/_browsers/_page.py +37 -15
- scrapling/engines/_browsers/_validators.py +29 -15
- scrapling/engines/constants.py +3 -6
- scrapling/engines/static.py +25 -75
- scrapling/engines/toolbelt/__init__.py +1 -20
- scrapling/engines/toolbelt/convertor.py +95 -86
- scrapling/engines/toolbelt/custom.py +7 -99
- scrapling/engines/toolbelt/fingerprints.py +1 -3
- scrapling/engines/toolbelt/navigation.py +4 -58
- scrapling/fetchers.py +41 -24
- scrapling/parser.py +45 -122
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/METADATA +57 -47
- scrapling-0.3.2.dist-info/RECORD +44 -0
- scrapling-0.3.dist-info/RECORD +0 -41
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
scrapling/cli.py
CHANGED
@@ -2,14 +2,18 @@ from pathlib import Path
 from subprocess import check_output
 from sys import executable as python_executable

-from scrapling.
-from scrapling.
+from scrapling.engines.toolbelt.custom import Response
+from scrapling.core.utils import log, _CookieParser, _ParseHeaders
 from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
-from scrapling.fetchers import Fetcher, DynamicFetcher, StealthyFetcher
-from scrapling.core.shell import Convertor, _CookieParser, _ParseHeaders

 from orjson import loads as json_loads, JSONDecodeError
-
+
+try:
+    from click import command, option, Choice, group, argument
+except (ImportError, ModuleNotFoundError) as e:
+    raise ModuleNotFoundError(
+        "You need to install scrapling with any of the extras to enable Shell commands. See: https://scrapling.readthedocs.io/en/latest/#installation"
+    ) from e

 __OUTPUT_FILE_HELP__ = "The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively."
 __PACKAGE_DIR__ = Path(__file__).parent
@@ -40,6 +44,8 @@ def __Request_and_Save(
     **kwargs,
 ) -> None:
     """Make a request using the specified fetcher function and save the result"""
+    from scrapling.core.shell import Convertor
+
     # Handle relative paths - convert to an absolute path based on the current working directory
     output_path = Path(output_file)
     if not output_path.is_absolute():
@@ -72,14 +78,10 @@ def __ParseExtractArguments(
     return parsed_headers, parsed_cookies, parsed_params, parsed_json


-def __BuildRequest(
-    headers: List[str], cookies: str, params: str, json: Optional[str] = None, **kwargs
-) -> Dict:
+def __BuildRequest(headers: List[str], cookies: str, params: str, json: Optional[str] = None, **kwargs) -> Dict:
     """Build a request object using the specified arguments"""
     # Parse parameters
-    parsed_headers, parsed_cookies, parsed_params, parsed_json = (
-        __ParseExtractArguments(headers, cookies, params, json)
-    )
+    parsed_headers, parsed_cookies, parsed_params, parsed_json = __ParseExtractArguments(headers, cookies, params, json)
     # Build request arguments
     request_kwargs = {
         "headers": parsed_headers if parsed_headers else None,
@@ -106,10 +108,7 @@ def __BuildRequest(
     help="Force Scrapling to reinstall all Fetchers dependencies",
 )
 def install(force):  # pragma: no cover
-    if (
-        force
-        or not __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").exists()
-    ):
+    if force or not __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").exists():
         __Execute(
             [python_executable, "-m", "playwright", "install", "chromium"],
             "Playwright browsers",
@@ -158,9 +157,7 @@ def mcp():
     "level",
     is_flag=False,
     default="debug",
-    type=Choice(
-        ["debug", "info", "warning", "error", "critical", "fatal"], case_sensitive=False
-    ),
+    type=Choice(["debug", "info", "warning", "error", "critical", "fatal"], case_sensitive=False),
     help="Log level (default: DEBUG)",
 )
 def shell(code, level):
@@ -178,9 +175,7 @@ def extract():
     pass


-@extract.command(
-    help=f"Perform a GET request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
-)
+@extract.command(help=f"Perform a GET request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
 @argument("url", required=True)
 @argument("output_file", required=True)
 @option(
@@ -190,9 +185,7 @@ def extract():
     help='HTTP headers in format "Key: Value" (can be used multiple times)',
 )
 @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
-@option(
-    "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
-)
+@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
 @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
 @option(
     "--css-selector",
@@ -264,12 +257,12 @@ def get(
         impersonate=impersonate,
         proxy=proxy,
     )
+    from scrapling.fetchers import Fetcher
+
     __Request_and_Save(Fetcher.get, url, output_file, css_selector, **kwargs)


-@extract.command(
-    help=f"Perform a POST request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
-)
+@extract.command(help=f"Perform a POST request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
 @argument("url", required=True)
 @argument("output_file", required=True)
 @option(
@@ -285,9 +278,7 @@ def get(
     help='HTTP headers in format "Key: Value" (can be used multiple times)',
 )
 @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
-@option(
-    "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
-)
+@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
 @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
 @option(
     "--css-selector",
@@ -364,12 +355,12 @@ def post(
         proxy=proxy,
         data=data,
     )
+    from scrapling.fetchers import Fetcher
+
     __Request_and_Save(Fetcher.post, url, output_file, css_selector, **kwargs)


-@extract.command(
-    help=f"Perform a PUT request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
-)
+@extract.command(help=f"Perform a PUT request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
 @argument("url", required=True)
 @argument("output_file", required=True)
 @option("--data", "-d", help="Form data to include in the request body")
@@ -381,9 +372,7 @@ def post(
     help='HTTP headers in format "Key: Value" (can be used multiple times)',
 )
 @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
-@option(
-    "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
-)
+@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
 @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
 @option(
     "--css-selector",
@@ -460,12 +449,12 @@ def put(
         proxy=proxy,
         data=data,
     )
+    from scrapling.fetchers import Fetcher
+
     __Request_and_Save(Fetcher.put, url, output_file, css_selector, **kwargs)


-@extract.command(
-    help=f"Perform a DELETE request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
-)
+@extract.command(help=f"Perform a DELETE request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
 @argument("url", required=True)
 @argument("output_file", required=True)
 @option(
@@ -475,9 +464,7 @@ def put(
     help='HTTP headers in format "Key: Value" (can be used multiple times)',
 )
 @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
-@option(
-    "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
-)
+@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
 @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
 @option(
     "--css-selector",
@@ -549,12 +536,12 @@ def delete(
         impersonate=impersonate,
         proxy=proxy,
     )
+    from scrapling.fetchers import Fetcher
+
     __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **kwargs)


-@extract.command(
-    help=f"Use DynamicFetcher to fetch content with browser automation.\n\n{__OUTPUT_FILE_HELP__}"
-)
+@extract.command(help=f"Use DynamicFetcher to fetch content with browser automation.\n\n{__OUTPUT_FILE_HELP__}")
 @argument("url", required=True)
 @argument("output_file", required=True)
 @option(
@@ -591,9 +578,7 @@ def delete(
 )
 @option("--wait-selector", help="CSS selector to wait for before proceeding")
 @option("--locale", default="en-US", help="Browser locale (default: en-US)")
-@option(
-    "--stealth/--no-stealth", default=False, help="Enable stealth mode (default: False)"
-)
+@option("--stealth/--no-stealth", default=False, help="Enable stealth mode (default: False)")
 @option(
     "--hide-canvas/--show-canvas",
     default=False,
@@ -672,12 +657,12 @@ def fetch(
     if parsed_headers:
         kwargs["extra_headers"] = parsed_headers

+    from scrapling.fetchers import DynamicFetcher
+
     __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **kwargs)


-@extract.command(
-    help=f"Use StealthyFetcher to fetch content with advanced stealth features.\n\n{__OUTPUT_FILE_HELP__}"
-)
+@extract.command(help=f"Use StealthyFetcher to fetch content with advanced stealth features.\n\n{__OUTPUT_FILE_HELP__}")
 @argument("url", required=True)
 @argument("output_file", required=True)
 @option(
@@ -821,6 +806,8 @@ def stealthy_fetch(
     if parsed_headers:
         kwargs["extra_headers"] = parsed_headers

+    from scrapling.fetchers import StealthyFetcher
+
     __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **kwargs)

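The cli.py hunks above all follow the same pattern: the heavy fetcher and shell imports move from module level into the command bodies, and the `click` import is guarded with a clear error. A minimal, hypothetical sketch of that deferred-import pattern (the command shown here is illustrative, not copied from the package):

```python
try:
    # Guarded import, mirroring the new cli.py behavior: fail with a helpful
    # message when the CLI dependencies are not installed.
    from click import group, argument
except (ImportError, ModuleNotFoundError) as e:
    raise ModuleNotFoundError("Install scrapling with an extra that provides the CLI") from e


@group()
def cli():
    pass


@cli.command()
@argument("url", required=True)
def get(url):
    # Deferred import: the fetcher (and its browser dependencies) is only
    # loaded when the command actually runs, keeping `--help` and startup fast.
    from scrapling.fetchers import Fetcher

    page = Fetcher.get(url)
    print(page.status)
```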
scrapling/core/_html_utils.py
CHANGED
@@ -269,17 +269,13 @@ name2codepoint = {
 }


-def to_unicode(
-    text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict"
-) -> str:
+def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict") -> str:
     """Return the Unicode representation of a bytes object `text`. If `text`
     is already a Unicode object, return it as-is."""
     if isinstance(text, str):
         return text
     if not isinstance(text, (bytes, str)):
-        raise TypeError(
-            f"to_unicode must receive bytes or str, got {type(text).__name__}"
-        )
+        raise TypeError(f"to_unicode must receive bytes or str, got {type(text).__name__}")
     if encoding is None:
         encoding = "utf-8"
     return text.decode(encoding, errors)
@@ -328,9 +324,7 @@ def _replace_entities(
         entity_name = groups["named"]
         if entity_name.lower() in keep:
             return m.group(0)
-        number = name2codepoint.get(entity_name) or name2codepoint.get(
-            entity_name.lower()
-        )
+        number = name2codepoint.get(entity_name) or name2codepoint.get(entity_name.lower())
         if number is not None:
             # Browsers typically
             # interpret numeric character references in the 80-9F range as representing the characters mapped
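The `to_unicode` change above is formatting only; its behavior follows directly from the reconstructed body. A small usage sketch under that assumption:

```python
from scrapling.core._html_utils import to_unicode

assert to_unicode(b"caf\xc3\xa9") == "café"          # bytes are decoded, UTF-8 by default
assert to_unicode("already text") == "already text"  # str input is returned as-is
try:
    to_unicode(42)  # anything else raises TypeError per the hunk above
except TypeError as exc:
    print(exc)  # to_unicode must receive bytes or str, got int
```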
scrapling/core/ai.py
CHANGED
@@ -4,7 +4,7 @@ from mcp.server.fastmcp import FastMCP
 from pydantic import BaseModel, Field

 from scrapling.core.shell import Convertor
-from scrapling.engines.toolbelt import Response as _ScraplingResponse
+from scrapling.engines.toolbelt.custom import Response as _ScraplingResponse
 from scrapling.fetchers import (
     Fetcher,
     FetcherSession,
@@ -32,21 +32,13 @@ class ResponseModel(BaseModel):
     """Request's response information structure."""

     status: int = Field(description="The status code returned by the website.")
-    content: list[str] = Field(
-        description="The content as Markdown/HTML or the text content of the page."
-    )
-    url: str = Field(
-        description="The URL given by the user that resulted in this response."
-    )
+    content: list[str] = Field(description="The content as Markdown/HTML or the text content of the page.")
+    url: str = Field(description="The URL given by the user that resulted in this response.")


-def _ContentTranslator(
-    content: Generator[str, None, None], page: _ScraplingResponse
-) -> ResponseModel:
+def _ContentTranslator(content: Generator[str, None, None], page: _ScraplingResponse) -> ResponseModel:
     """Convert a content generator to a list of ResponseModel objects."""
-    return ResponseModel(
-        status=page.status, content=[result for result in content], url=page.url
-    )
+    return ResponseModel(status=page.status, content=[result for result in content], url=page.url)


 class ScraplingMCPServer:
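The ai.py change swaps the `Response` import path and collapses the Pydantic field definitions; the model's shape is unchanged. A small stand-in sketch of what `_ContentTranslator` produces, using only the fields shown above (the example values are illustrative):

```python
from pydantic import BaseModel, Field


# Stand-in mirroring the ResponseModel fields from the hunk above.
class ResponseModel(BaseModel):
    status: int = Field(description="The status code returned by the website.")
    content: list[str] = Field(description="The content as Markdown/HTML or the text content of the page.")
    url: str = Field(description="The URL given by the user that resulted in this response.")


page_summary = ResponseModel(status=200, content=["# Example page"], url="https://example.com")
print(page_summary.model_dump())
# {'status': 200, 'content': ['# Example page'], 'url': 'https://example.com'}
```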
scrapling/core/custom_types.py
CHANGED
@@ -31,15 +31,11 @@ class TextHandler(str):

     __slots__ = ()

-    def __getitem__(
-        self, key: SupportsIndex | slice
-    ) -> "TextHandler":  # pragma: no cover
+    def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler":  # pragma: no cover
         lst = super().__getitem__(key)
         return cast(_TextHandlerType, TextHandler(lst))

-    def split(
-        self, sep: str = None, maxsplit: SupportsIndex = -1
-    ) -> "TextHandlers":  # pragma: no cover
+    def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> "TextHandlers":  # pragma: no cover
         return TextHandlers(
             cast(
                 List[_TextHandlerType],
@@ -50,14 +46,10 @@ class TextHandler(str):
     def strip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().strip(chars))

-    def lstrip(
-        self, chars: str = None
-    ) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def lstrip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().lstrip(chars))

-    def rstrip(
-        self, chars: str = None
-    ) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def rstrip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().rstrip(chars))

     def capitalize(self) -> Union[str, "TextHandler"]:  # pragma: no cover
@@ -66,37 +58,25 @@ class TextHandler(str):
     def casefold(self) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().casefold())

-    def center(
-        self, width: SupportsIndex, fillchar: str = " "
-    ) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def center(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().center(width, fillchar))

-    def expandtabs(
-        self, tabsize: SupportsIndex = 8
-    ) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().expandtabs(tabsize))

-    def format(
-        self, *args: str, **kwargs: str
-    ) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def format(self, *args: str, **kwargs: str) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().format(*args, **kwargs))

     def format_map(self, mapping) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().format_map(mapping))

-    def join(
-        self, iterable: Iterable[str]
-    ) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def join(self, iterable: Iterable[str]) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().join(iterable))

-    def ljust(
-        self, width: SupportsIndex, fillchar: str = " "
-    ) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def ljust(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().ljust(width, fillchar))

-    def rjust(
-        self, width: SupportsIndex, fillchar: str = " "
-    ) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def rjust(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().rjust(width, fillchar))

     def swapcase(self) -> Union[str, "TextHandler"]:  # pragma: no cover
@@ -108,14 +88,10 @@ class TextHandler(str):
     def translate(self, table) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().translate(table))

-    def zfill(
-        self, width: SupportsIndex
-    ) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def zfill(self, width: SupportsIndex) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().zfill(width))

-    def replace(
-        self, old: str, new: str, count: SupportsIndex = -1
-    ) -> Union[str, "TextHandler"]:
+    def replace(self, old: str, new: str, count: SupportsIndex = -1) -> Union[str, "TextHandler"]:
         return TextHandler(super().replace(old, new, count))

     def upper(self) -> Union[str, "TextHandler"]:
@@ -203,11 +179,7 @@ class TextHandler(str):
         results = flatten(results)

         if not replace_entities:
-            return TextHandlers(
-                cast(
-                    List[_TextHandlerType], [TextHandler(string) for string in results]
-                )
-            )
+            return TextHandlers(cast(List[_TextHandlerType], [TextHandler(string) for string in results]))

         return TextHandlers(
             cast(
@@ -257,9 +229,7 @@ class TextHandlers(List[TextHandler]):
     def __getitem__(self, pos: slice) -> "TextHandlers":  # pragma: no cover
         pass

-    def __getitem__(
-        self, pos: SupportsIndex | slice
-    ) -> Union[TextHandler, "TextHandlers"]:
+    def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, "TextHandlers"]:
         lst = super().__getitem__(pos)
         if isinstance(pos, slice):
             return TextHandlers(cast(List[_TextHandlerType], lst))
@@ -280,9 +250,7 @@ class TextHandlers(List[TextHandler]):
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
         :param case_sensitive: if disabled, the function will set the regex to ignore the letters-case while compiling it
         """
-        results = [
-            n.re(regex, replace_entities, clean_match, case_sensitive) for n in self
-        ]
+        results = [n.re(regex, replace_entities, clean_match, case_sensitive) for n in self]
         return TextHandlers(flatten(results))

     def re_first(
@@ -330,34 +298,24 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):

     def __init__(self, mapping=None, **kwargs):
         mapping = (
-            {
-                key: TextHandler(value) if isinstance(value, str) else value
-                for key, value in mapping.items()
-            }
+            {key: TextHandler(value) if isinstance(value, str) else value for key, value in mapping.items()}
             if mapping is not None
             else {}
         )

         if kwargs:
             mapping.update(
-                {
-                    key: TextHandler(value) if isinstance(value, str) else value
-                    for key, value in kwargs.items()
-                }
+                {key: TextHandler(value) if isinstance(value, str) else value for key, value in kwargs.items()}
             )

         # Fastest read-only mapping type
         self._data = MappingProxyType(mapping)

-    def get(
-        self, key: str, default: Optional[str] = None
-    ) -> Optional[_TextHandlerType]:
+    def get(self, key: str, default: Optional[str] = None) -> Optional[_TextHandlerType]:
         """Acts like the standard dictionary `.get()` method"""
         return self._data.get(key, default)

-    def search_values(
-        self, keyword: str, partial: bool = False
-    ) -> Generator["AttributesHandler", None, None]:
+    def search_values(self, keyword: str, partial: bool = False) -> Generator["AttributesHandler", None, None]:
         """Search current attributes by values and return a dictionary of each matching item
         :param keyword: The keyword to search for in the attribute values
         :param partial: If True, the function will search if keyword in each value instead of perfect match
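The custom_types.py hunks are again pure reformatting, but they show the contract of these wrappers: the overridden `str` methods on `TextHandler` return `TextHandler`, `TextHandlers` slices back into `TextHandlers`, and `AttributesHandler` is a read-only mapping with a dict-like `.get()`. A small usage sketch assuming only what those hunks show:

```python
from scrapling.core.custom_types import AttributesHandler, TextHandler

text = TextHandler("  Scrapling  ")
# Chained str operations keep returning TextHandler (strip/replace bodies shown
# above), so regex helpers such as .re() remain available on the result.
cleaned = text.strip().replace("Scrapling", "scrapling")
assert isinstance(cleaned, TextHandler)

attrs = AttributesHandler({"id": "price", "class": "item"})
assert attrs.get("id") == "price"    # behaves like dict.get()
assert attrs.get("missing") is None  # default is None
```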
scrapling/core/mixins.py
CHANGED
@@ -5,9 +5,7 @@ class SelectorsGeneration:
     Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
     """

-    def __general_selection(
-        self, selection: str = "css", full_path: bool = False
-    ) -> str:
+    def __general_selection(self, selection: str = "css", full_path: bool = False) -> str:
         """Generate a selector for the current element.
         :return: A string of the generated selector.
         """
@@ -18,18 +16,10 @@ class SelectorsGeneration:
         if target.parent:
             if target.attrib.get("id"):
                 # id is enough
-                part = (
-                    f"#{target.attrib['id']}"
-                    if css
-                    else f"[@id='{target.attrib['id']}']"
-                )
+                part = f"#{target.attrib['id']}" if css else f"[@id='{target.attrib['id']}']"
                 selectorPath.append(part)
                 if not full_path:
-                    return (
-                        " > ".join(reversed(selectorPath))
-                        if css
-                        else "//*" + "/".join(reversed(selectorPath))
-                    )
+                    return " > ".join(reversed(selectorPath)) if css else "//*" + "/".join(reversed(selectorPath))
             else:
                 part = f"{target.tag}"
                 # We won't use classes anymore because I some websites share exact classes between elements
@@ -45,28 +35,16 @@ class SelectorsGeneration:
                         break

                 if counter[target.tag] > 1:
-                    part += (
-                        f":nth-of-type({counter[target.tag]})"
-                        if css
-                        else f"[{counter[target.tag]}]"
-                    )
+                    part += f":nth-of-type({counter[target.tag]})" if css else f"[{counter[target.tag]}]"

                 selectorPath.append(part)
                 target = target.parent
                 if target is None or target.tag == "html":
-                    return (
-                        " > ".join(reversed(selectorPath))
-                        if css
-                        else "//" + "/".join(reversed(selectorPath))
-                    )
+                    return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
                 else:
                     break

-        return (
-            " > ".join(reversed(selectorPath))
-            if css
-            else "//" + "/".join(reversed(selectorPath))
-        )
+        return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))

     @property
     def generate_css_selector(self) -> str: