scrapling 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +49 -127
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +227 -296
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +209 -281
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +9 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +29 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/METADATA +54 -46
  30. scrapling-0.3.2.dist-info/RECORD +44 -0
  31. scrapling-0.3.1.dist-info/RECORD +0 -41
  32. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
scrapling/__init__.py CHANGED
@@ -1,5 +1,5 @@
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.3.1"
+ __version__ = "0.3.2"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"


scrapling/cli.py CHANGED
@@ -2,14 +2,18 @@ from pathlib import Path
  from subprocess import check_output
  from sys import executable as python_executable

- from scrapling.core.utils import log
- from scrapling.engines.toolbelt import Response
+ from scrapling.engines.toolbelt.custom import Response
+ from scrapling.core.utils import log, _CookieParser, _ParseHeaders
  from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
- from scrapling.fetchers import Fetcher, DynamicFetcher, StealthyFetcher
- from scrapling.core.shell import Convertor, _CookieParser, _ParseHeaders

  from orjson import loads as json_loads, JSONDecodeError
- from click import command, option, Choice, group, argument
+
+ try:
+     from click import command, option, Choice, group, argument
+ except (ImportError, ModuleNotFoundError) as e:
+     raise ModuleNotFoundError(
+         "You need to install scrapling with any of the extras to enable Shell commands. See: https://scrapling.readthedocs.io/en/latest/#installation"
+     ) from e

  __OUTPUT_FILE_HELP__ = "The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively."
  __PACKAGE_DIR__ = Path(__file__).parent
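The guarded click import above turns a missing optional dependency into an actionable error instead of a bare ImportError. A minimal sketch of how the failure now surfaces, assuming click is absent from the environment (illustrative snippet, not part of the package):

    try:
        import scrapling.cli
    except ModuleNotFoundError as err:
        # Points the user at the installation docs instead of a raw traceback
        print(err)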
@@ -40,6 +44,8 @@ def __Request_and_Save(
      **kwargs,
  ) -> None:
      """Make a request using the specified fetcher function and save the result"""
+     from scrapling.core.shell import Convertor
+
      # Handle relative paths - convert to an absolute path based on the current working directory
      output_path = Path(output_file)
      if not output_path.is_absolute():
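This hunk, together with the per-command "from scrapling.fetchers import ..." additions further down, moves heavier imports from module level into the function bodies so they are only paid when a command actually runs. A minimal sketch of the deferred-import pattern, using hypothetical names:

    def run_command(url: str) -> None:
        # Imported here rather than at module top level, so --help and
        # command registration stay fast even if the dependency is slow to load.
        from heavy_package import HeavyClient  # hypothetical heavy dependency
        HeavyClient().fetch(url)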
@@ -72,14 +78,10 @@ def __ParseExtractArguments(
      return parsed_headers, parsed_cookies, parsed_params, parsed_json


- def __BuildRequest(
-     headers: List[str], cookies: str, params: str, json: Optional[str] = None, **kwargs
- ) -> Dict:
+ def __BuildRequest(headers: List[str], cookies: str, params: str, json: Optional[str] = None, **kwargs) -> Dict:
      """Build a request object using the specified arguments"""
      # Parse parameters
-     parsed_headers, parsed_cookies, parsed_params, parsed_json = (
-         __ParseExtractArguments(headers, cookies, params, json)
-     )
+     parsed_headers, parsed_cookies, parsed_params, parsed_json = __ParseExtractArguments(headers, cookies, params, json)
      # Build request arguments
      request_kwargs = {
          "headers": parsed_headers if parsed_headers else None,
@@ -106,10 +108,7 @@ def __BuildRequest(
      help="Force Scrapling to reinstall all Fetchers dependencies",
  )
  def install(force):  # pragma: no cover
-     if (
-         force
-         or not __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").exists()
-     ):
+     if force or not __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").exists():
          __Execute(
              [python_executable, "-m", "playwright", "install", "chromium"],
              "Playwright browsers",
@@ -158,9 +157,7 @@ def mcp():
      "level",
      is_flag=False,
      default="debug",
-     type=Choice(
-         ["debug", "info", "warning", "error", "critical", "fatal"], case_sensitive=False
-     ),
+     type=Choice(["debug", "info", "warning", "error", "critical", "fatal"], case_sensitive=False),
      help="Log level (default: DEBUG)",
  )
  def shell(code, level):
@@ -178,9 +175,7 @@ def extract():
      pass


- @extract.command(
-     help=f"Perform a GET request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
- )
+ @extract.command(help=f"Perform a GET request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
  @argument("url", required=True)
  @argument("output_file", required=True)
  @option(
@@ -190,9 +185,7 @@ def extract():
      help='HTTP headers in format "Key: Value" (can be used multiple times)',
  )
  @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
- @option(
-     "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
- )
+ @option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
  @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
  @option(
      "--css-selector",
@@ -264,12 +257,12 @@ def get(
          impersonate=impersonate,
          proxy=proxy,
      )
+     from scrapling.fetchers import Fetcher
+
      __Request_and_Save(Fetcher.get, url, output_file, css_selector, **kwargs)


- @extract.command(
-     help=f"Perform a POST request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
- )
+ @extract.command(help=f"Perform a POST request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
  @argument("url", required=True)
  @argument("output_file", required=True)
  @option(
@@ -285,9 +278,7 @@ def get(
      help='HTTP headers in format "Key: Value" (can be used multiple times)',
  )
  @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
- @option(
-     "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
- )
+ @option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
  @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
  @option(
      "--css-selector",
@@ -364,12 +355,12 @@ def post(
          proxy=proxy,
          data=data,
      )
+     from scrapling.fetchers import Fetcher
+
      __Request_and_Save(Fetcher.post, url, output_file, css_selector, **kwargs)


- @extract.command(
-     help=f"Perform a PUT request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
- )
+ @extract.command(help=f"Perform a PUT request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
  @argument("url", required=True)
  @argument("output_file", required=True)
  @option("--data", "-d", help="Form data to include in the request body")
@@ -381,9 +372,7 @@ def post(
      help='HTTP headers in format "Key: Value" (can be used multiple times)',
  )
  @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
- @option(
-     "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
- )
+ @option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
  @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
  @option(
      "--css-selector",
@@ -460,12 +449,12 @@ def put(
          proxy=proxy,
          data=data,
      )
+     from scrapling.fetchers import Fetcher
+
      __Request_and_Save(Fetcher.put, url, output_file, css_selector, **kwargs)


- @extract.command(
-     help=f"Perform a DELETE request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
- )
+ @extract.command(help=f"Perform a DELETE request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
  @argument("url", required=True)
  @argument("output_file", required=True)
  @option(
@@ -475,9 +464,7 @@ def put(
      help='HTTP headers in format "Key: Value" (can be used multiple times)',
  )
  @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
- @option(
-     "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
- )
+ @option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
  @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
  @option(
      "--css-selector",
@@ -549,12 +536,12 @@ def delete(
          impersonate=impersonate,
          proxy=proxy,
      )
+     from scrapling.fetchers import Fetcher
+
      __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **kwargs)


- @extract.command(
-     help=f"Use DynamicFetcher to fetch content with browser automation.\n\n{__OUTPUT_FILE_HELP__}"
- )
+ @extract.command(help=f"Use DynamicFetcher to fetch content with browser automation.\n\n{__OUTPUT_FILE_HELP__}")
  @argument("url", required=True)
  @argument("output_file", required=True)
  @option(
@@ -591,9 +578,7 @@ def delete(
  )
  @option("--wait-selector", help="CSS selector to wait for before proceeding")
  @option("--locale", default="en-US", help="Browser locale (default: en-US)")
- @option(
-     "--stealth/--no-stealth", default=False, help="Enable stealth mode (default: False)"
- )
+ @option("--stealth/--no-stealth", default=False, help="Enable stealth mode (default: False)")
  @option(
      "--hide-canvas/--show-canvas",
      default=False,
@@ -672,12 +657,12 @@ def fetch(
      if parsed_headers:
          kwargs["extra_headers"] = parsed_headers

+     from scrapling.fetchers import DynamicFetcher
+
      __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **kwargs)


- @extract.command(
-     help=f"Use StealthyFetcher to fetch content with advanced stealth features.\n\n{__OUTPUT_FILE_HELP__}"
- )
+ @extract.command(help=f"Use StealthyFetcher to fetch content with advanced stealth features.\n\n{__OUTPUT_FILE_HELP__}")
  @argument("url", required=True)
  @argument("output_file", required=True)
  @option(
@@ -821,6 +806,8 @@ def stealthy_fetch(
      if parsed_headers:
          kwargs["extra_headers"] = parsed_headers

+     from scrapling.fetchers import StealthyFetcher
+
      __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **kwargs)


scrapling/core/_html_utils.py CHANGED
@@ -269,17 +269,13 @@ name2codepoint = {
  }


- def to_unicode(
-     text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict"
- ) -> str:
+ def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict") -> str:
      """Return the Unicode representation of a bytes object `text`. If `text`
      is already a Unicode object, return it as-is."""
      if isinstance(text, str):
          return text
      if not isinstance(text, (bytes, str)):
-         raise TypeError(
-             f"to_unicode must receive bytes or str, got {type(text).__name__}"
-         )
+         raise TypeError(f"to_unicode must receive bytes or str, got {type(text).__name__}")
      if encoding is None:
          encoding = "utf-8"
      return text.decode(encoding, errors)
@@ -328,9 +324,7 @@ def _replace_entities(
          entity_name = groups["named"]
          if entity_name.lower() in keep:
              return m.group(0)
-         number = name2codepoint.get(entity_name) or name2codepoint.get(
-             entity_name.lower()
-         )
+         number = name2codepoint.get(entity_name) or name2codepoint.get(entity_name.lower())
          if number is not None:
              # Browsers typically
              # interpret numeric character references in the 80-9F range as representing the characters mapped
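Based on the behavior visible in the hunk above (str passed through unchanged, bytes decoded as UTF-8 by default, errors forwarded to .decode()), a short usage sketch; the import path simply mirrors the file listed in this diff:

    from scrapling.core._html_utils import to_unicode

    to_unicode("already text")              # returned as-is
    to_unicode(b"caf\xc3\xa9")              # decoded as UTF-8 -> "café"
    to_unicode(b"\xff", errors="replace")   # "\ufffd" instead of raising UnicodeDecodeError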
scrapling/core/ai.py CHANGED
@@ -4,7 +4,7 @@ from mcp.server.fastmcp import FastMCP
  from pydantic import BaseModel, Field

  from scrapling.core.shell import Convertor
- from scrapling.engines.toolbelt import Response as _ScraplingResponse
+ from scrapling.engines.toolbelt.custom import Response as _ScraplingResponse
  from scrapling.fetchers import (
      Fetcher,
      FetcherSession,
@@ -32,21 +32,13 @@ class ResponseModel(BaseModel):
      """Request's response information structure."""

      status: int = Field(description="The status code returned by the website.")
-     content: list[str] = Field(
-         description="The content as Markdown/HTML or the text content of the page."
-     )
-     url: str = Field(
-         description="The URL given by the user that resulted in this response."
-     )
+     content: list[str] = Field(description="The content as Markdown/HTML or the text content of the page.")
+     url: str = Field(description="The URL given by the user that resulted in this response.")


- def _ContentTranslator(
-     content: Generator[str, None, None], page: _ScraplingResponse
- ) -> ResponseModel:
+ def _ContentTranslator(content: Generator[str, None, None], page: _ScraplingResponse) -> ResponseModel:
      """Convert a content generator to a list of ResponseModel objects."""
-     return ResponseModel(
-         status=page.status, content=[result for result in content], url=page.url
-     )
+     return ResponseModel(status=page.status, content=[result for result in content], url=page.url)


  class ScraplingMCPServer:
scrapling/core/custom_types.py CHANGED
@@ -31,15 +31,11 @@ class TextHandler(str):

      __slots__ = ()

-     def __getitem__(
-         self, key: SupportsIndex | slice
-     ) -> "TextHandler":  # pragma: no cover
+     def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler":  # pragma: no cover
          lst = super().__getitem__(key)
          return cast(_TextHandlerType, TextHandler(lst))

-     def split(
-         self, sep: str = None, maxsplit: SupportsIndex = -1
-     ) -> "TextHandlers":  # pragma: no cover
+     def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> "TextHandlers":  # pragma: no cover
          return TextHandlers(
              cast(
                  List[_TextHandlerType],
@@ -50,14 +46,10 @@ class TextHandler(str):
      def strip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
          return TextHandler(super().strip(chars))

-     def lstrip(
-         self, chars: str = None
-     ) -> Union[str, "TextHandler"]:  # pragma: no cover
+     def lstrip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
          return TextHandler(super().lstrip(chars))

-     def rstrip(
-         self, chars: str = None
-     ) -> Union[str, "TextHandler"]:  # pragma: no cover
+     def rstrip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
          return TextHandler(super().rstrip(chars))

      def capitalize(self) -> Union[str, "TextHandler"]:  # pragma: no cover
@@ -66,37 +58,25 @@ class TextHandler(str):
      def casefold(self) -> Union[str, "TextHandler"]:  # pragma: no cover
          return TextHandler(super().casefold())

-     def center(
-         self, width: SupportsIndex, fillchar: str = " "
-     ) -> Union[str, "TextHandler"]:  # pragma: no cover
+     def center(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]:  # pragma: no cover
          return TextHandler(super().center(width, fillchar))

-     def expandtabs(
-         self, tabsize: SupportsIndex = 8
-     ) -> Union[str, "TextHandler"]:  # pragma: no cover
+     def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]:  # pragma: no cover
          return TextHandler(super().expandtabs(tabsize))

-     def format(
-         self, *args: str, **kwargs: str
-     ) -> Union[str, "TextHandler"]:  # pragma: no cover
+     def format(self, *args: str, **kwargs: str) -> Union[str, "TextHandler"]:  # pragma: no cover
          return TextHandler(super().format(*args, **kwargs))

      def format_map(self, mapping) -> Union[str, "TextHandler"]:  # pragma: no cover
          return TextHandler(super().format_map(mapping))

-     def join(
-         self, iterable: Iterable[str]
-     ) -> Union[str, "TextHandler"]:  # pragma: no cover
+     def join(self, iterable: Iterable[str]) -> Union[str, "TextHandler"]:  # pragma: no cover
          return TextHandler(super().join(iterable))

-     def ljust(
-         self, width: SupportsIndex, fillchar: str = " "
-     ) -> Union[str, "TextHandler"]:  # pragma: no cover
+     def ljust(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]:  # pragma: no cover
          return TextHandler(super().ljust(width, fillchar))

-     def rjust(
-         self, width: SupportsIndex, fillchar: str = " "
-     ) -> Union[str, "TextHandler"]:  # pragma: no cover
+     def rjust(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]:  # pragma: no cover
          return TextHandler(super().rjust(width, fillchar))

      def swapcase(self) -> Union[str, "TextHandler"]:  # pragma: no cover
@@ -108,14 +88,10 @@ class TextHandler(str):
      def translate(self, table) -> Union[str, "TextHandler"]:  # pragma: no cover
          return TextHandler(super().translate(table))

-     def zfill(
-         self, width: SupportsIndex
-     ) -> Union[str, "TextHandler"]:  # pragma: no cover
+     def zfill(self, width: SupportsIndex) -> Union[str, "TextHandler"]:  # pragma: no cover
          return TextHandler(super().zfill(width))

-     def replace(
-         self, old: str, new: str, count: SupportsIndex = -1
-     ) -> Union[str, "TextHandler"]:
+     def replace(self, old: str, new: str, count: SupportsIndex = -1) -> Union[str, "TextHandler"]:
          return TextHandler(super().replace(old, new, count))

      def upper(self) -> Union[str, "TextHandler"]:
@@ -203,11 +179,7 @@ class TextHandler(str):
          results = flatten(results)

          if not replace_entities:
-             return TextHandlers(
-                 cast(
-                     List[_TextHandlerType], [TextHandler(string) for string in results]
-                 )
-             )
+             return TextHandlers(cast(List[_TextHandlerType], [TextHandler(string) for string in results]))

          return TextHandlers(
              cast(
@@ -257,9 +229,7 @@ class TextHandlers(List[TextHandler]):
      def __getitem__(self, pos: slice) -> "TextHandlers":  # pragma: no cover
          pass

-     def __getitem__(
-         self, pos: SupportsIndex | slice
-     ) -> Union[TextHandler, "TextHandlers"]:
+     def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, "TextHandlers"]:
          lst = super().__getitem__(pos)
          if isinstance(pos, slice):
              return TextHandlers(cast(List[_TextHandlerType], lst))
@@ -280,9 +250,7 @@ class TextHandlers(List[TextHandler]):
          :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
          :param case_sensitive: if disabled, the function will set the regex to ignore the letters-case while compiling it
          """
-         results = [
-             n.re(regex, replace_entities, clean_match, case_sensitive) for n in self
-         ]
+         results = [n.re(regex, replace_entities, clean_match, case_sensitive) for n in self]
          return TextHandlers(flatten(results))

      def re_first(
@@ -330,34 +298,24 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):

      def __init__(self, mapping=None, **kwargs):
          mapping = (
-             {
-                 key: TextHandler(value) if isinstance(value, str) else value
-                 for key, value in mapping.items()
-             }
+             {key: TextHandler(value) if isinstance(value, str) else value for key, value in mapping.items()}
              if mapping is not None
              else {}
          )

          if kwargs:
              mapping.update(
-                 {
-                     key: TextHandler(value) if isinstance(value, str) else value
-                     for key, value in kwargs.items()
-                 }
+                 {key: TextHandler(value) if isinstance(value, str) else value for key, value in kwargs.items()}
              )

          # Fastest read-only mapping type
          self._data = MappingProxyType(mapping)

-     def get(
-         self, key: str, default: Optional[str] = None
-     ) -> Optional[_TextHandlerType]:
+     def get(self, key: str, default: Optional[str] = None) -> Optional[_TextHandlerType]:
          """Acts like the standard dictionary `.get()` method"""
          return self._data.get(key, default)

-     def search_values(
-         self, keyword: str, partial: bool = False
-     ) -> Generator["AttributesHandler", None, None]:
+     def search_values(self, keyword: str, partial: bool = False) -> Generator["AttributesHandler", None, None]:
          """Search current attributes by values and return a dictionary of each matching item
          :param keyword: The keyword to search for in the attribute values
          :param partial: If True, the function will search if keyword in each value instead of perfect match
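The TextHandler hunks above only collapse multi-line signatures; the underlying pattern is a str subclass whose transforming methods re-wrap their results so chained calls keep returning the subclass. A minimal standalone sketch of that pattern (illustrative, not the package's full implementation):

    class Wrapped(str):
        __slots__ = ()

        def strip(self, chars=None) -> "Wrapped":
            # Re-wrap so chained string operations stay in the subclass
            return Wrapped(super().strip(chars))

    assert isinstance(Wrapped("  demo  ").strip(), Wrapped)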
scrapling/core/mixins.py CHANGED
@@ -5,9 +5,7 @@ class SelectorsGeneration:
      Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
      """

-     def __general_selection(
-         self, selection: str = "css", full_path: bool = False
-     ) -> str:
+     def __general_selection(self, selection: str = "css", full_path: bool = False) -> str:
          """Generate a selector for the current element.
          :return: A string of the generated selector.
          """
@@ -18,18 +16,10 @@ class SelectorsGeneration:
              if target.parent:
                  if target.attrib.get("id"):
                      # id is enough
-                     part = (
-                         f"#{target.attrib['id']}"
-                         if css
-                         else f"[@id='{target.attrib['id']}']"
-                     )
+                     part = f"#{target.attrib['id']}" if css else f"[@id='{target.attrib['id']}']"
                      selectorPath.append(part)
                      if not full_path:
-                         return (
-                             " > ".join(reversed(selectorPath))
-                             if css
-                             else "//*" + "/".join(reversed(selectorPath))
-                         )
+                         return " > ".join(reversed(selectorPath)) if css else "//*" + "/".join(reversed(selectorPath))
                  else:
                      part = f"{target.tag}"
                      # We won't use classes anymore because I some websites share exact classes between elements
@@ -45,28 +35,16 @@ class SelectorsGeneration:
                              break

                      if counter[target.tag] > 1:
-                         part += (
-                             f":nth-of-type({counter[target.tag]})"
-                             if css
-                             else f"[{counter[target.tag]}]"
-                         )
+                         part += f":nth-of-type({counter[target.tag]})" if css else f"[{counter[target.tag]}]"

                      selectorPath.append(part)
                      target = target.parent
                      if target is None or target.tag == "html":
-                         return (
-                             " > ".join(reversed(selectorPath))
-                             if css
-                             else "//" + "/".join(reversed(selectorPath))
-                         )
+                         return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
              else:
                  break

-         return (
-             " > ".join(reversed(selectorPath))
-             if css
-             else "//" + "/".join(reversed(selectorPath))
-         )
+         return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))

      @property
      def generate_css_selector(self) -> str: