scrapling 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +29 -19
  2. scrapling/cli.py +21 -4
  3. scrapling/core/_types.py +3 -2
  4. scrapling/core/ai.py +24 -15
  5. scrapling/core/custom_types.py +20 -27
  6. scrapling/core/mixins.py +15 -9
  7. scrapling/core/shell.py +6 -4
  8. scrapling/core/storage.py +7 -6
  9. scrapling/core/translator.py +13 -8
  10. scrapling/core/utils/__init__.py +0 -1
  11. scrapling/engines/_browsers/__init__.py +0 -2
  12. scrapling/engines/_browsers/_base.py +45 -21
  13. scrapling/engines/_browsers/_camoufox.py +98 -43
  14. scrapling/engines/_browsers/_config_tools.py +1 -1
  15. scrapling/engines/_browsers/_controllers.py +34 -13
  16. scrapling/engines/_browsers/_validators.py +31 -10
  17. scrapling/engines/constants.py +0 -15
  18. scrapling/engines/static.py +749 -336
  19. scrapling/engines/toolbelt/convertor.py +13 -15
  20. scrapling/engines/toolbelt/custom.py +6 -9
  21. scrapling/engines/toolbelt/fingerprints.py +17 -10
  22. scrapling/engines/toolbelt/navigation.py +11 -3
  23. scrapling/fetchers/__init__.py +46 -0
  24. scrapling/fetchers/chrome.py +210 -0
  25. scrapling/fetchers/firefox.py +212 -0
  26. scrapling/fetchers/requests.py +28 -0
  27. scrapling/parser.py +109 -84
  28. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/METADATA +17 -16
  29. scrapling-0.3.7.dist-info/RECORD +47 -0
  30. scrapling/fetchers.py +0 -444
  31. scrapling-0.3.5.dist-info/RECORD +0 -44
  32. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
scrapling/__init__.py CHANGED
@@ -1,28 +1,38 @@
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.3.5"
+ __version__ = "0.3.7"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"

+ from typing import Any, TYPE_CHECKING

- # A lightweight approach to create a lazy loader for each import for backward compatibility
- # This will reduces initial memory footprint significantly (only loads what's used)
- def __getattr__(name):
-     lazy_imports = {
-         "Fetcher": ("scrapling.fetchers", "Fetcher"),
-         "Selector": ("scrapling.parser", "Selector"),
-         "Selectors": ("scrapling.parser", "Selectors"),
-         "AttributesHandler": ("scrapling.core.custom_types", "AttributesHandler"),
-         "TextHandler": ("scrapling.core.custom_types", "TextHandler"),
-         "AsyncFetcher": ("scrapling.fetchers", "AsyncFetcher"),
-         "StealthyFetcher": ("scrapling.fetchers", "StealthyFetcher"),
-         "DynamicFetcher": ("scrapling.fetchers", "DynamicFetcher"),
-     }
-
-     if name in lazy_imports:
-         module_path, class_name = lazy_imports[name]
+ if TYPE_CHECKING:
+     from scrapling.parser import Selector, Selectors
+     from scrapling.core.custom_types import AttributesHandler, TextHandler
+     from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
+
+
+ # Lazy import mapping
+ _LAZY_IMPORTS = {
+     "Fetcher": ("scrapling.fetchers", "Fetcher"),
+     "Selector": ("scrapling.parser", "Selector"),
+     "Selectors": ("scrapling.parser", "Selectors"),
+     "AttributesHandler": ("scrapling.core.custom_types", "AttributesHandler"),
+     "TextHandler": ("scrapling.core.custom_types", "TextHandler"),
+     "AsyncFetcher": ("scrapling.fetchers", "AsyncFetcher"),
+     "StealthyFetcher": ("scrapling.fetchers", "StealthyFetcher"),
+     "DynamicFetcher": ("scrapling.fetchers", "DynamicFetcher"),
+ }
+ __all__ = ["Selector", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
+
+
+ def __getattr__(name: str) -> Any:
+     if name in _LAZY_IMPORTS:
+         module_path, class_name = _LAZY_IMPORTS[name]
          module = __import__(module_path, fromlist=[class_name])
          return getattr(module, class_name)
      else:
-         raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
+         raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


- __all__ = ["Selector", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
+ def __dir__() -> list[str]:
+     """Support for dir() and autocomplete."""
+     return sorted(__all__ + ["fetchers", "parser", "cli", "core", "__author__", "__version__", "__copyright__"])
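Note: the rewritten module uses PEP 562-style lazy loading, so importing `scrapling` stays cheap and a submodule is only loaded when one of the mapped names is first accessed. A minimal caller-side sketch (names taken from `_LAZY_IMPORTS` above):

    import scrapling

    print(scrapling.__version__)   # plain attribute, triggers no extra imports
    Fetcher = scrapling.Fetcher    # __getattr__ imports scrapling.fetchers on first access
    print(dir(scrapling))          # __dir__ keeps dir()/autocomplete aware of the lazy names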
scrapling/cli.py CHANGED
@@ -2,8 +2,9 @@ from pathlib import Path
  from subprocess import check_output
  from sys import executable as python_executable

+ from scrapling.core.utils import log
  from scrapling.engines.toolbelt.custom import Response
- from scrapling.core.utils import log, _CookieParser, _ParseHeaders
+ from scrapling.core.utils._shell import _CookieParser, _ParseHeaders
  from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable

  from orjson import loads as json_loads, JSONDecodeError
@@ -135,10 +136,26 @@ def install(force): # pragma: no cover


  @command(help="Run Scrapling's MCP server (Check the docs for more info).")
- def mcp():
+ @option(
+     "--http",
+     is_flag=True,
+     default=False,
+     help="Whether to run the MCP server in streamable-http transport or leave it as stdio (Default: False)",
+ )
+ @option(
+     "--host",
+     type=str,
+     default="0.0.0.0",
+     help="The host to use if streamable-http transport is enabled (Default: '0.0.0.0')",
+ )
+ @option(
+     "--port", type=int, default=8000, help="The port to use if streamable-http transport is enabled (Default: 8000)"
+ )
+ def mcp(http, host, port):
      from scrapling.core.ai import ScraplingMCPServer

-     ScraplingMCPServer().serve()
+     server = ScraplingMCPServer()
+     server.serve(http, host, port)


  @command(help="Interactive scraping console")
@@ -766,7 +783,7 @@ def stealthy_fetch(
      :param disable_resources: Drop requests of unnecessary resources for a speed boost.
      :param block_webrtc: Blocks WebRTC entirely.
      :param humanize: Humanize the cursor movement.
-     :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page.
+     :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
      :param allow_webgl: Allow WebGL (recommended to keep enabled).
      :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
      :param disable_ads: Install the uBlock Origin addon on the browser.
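Note: with the new flags, the MCP server can be switched from the default stdio transport to streamable-http straight from the command line. A hedged usage sketch (the `scrapling` executable name is assumed from the wheel's entry point):

    scrapling mcp                                       # stdio transport, as before
    scrapling mcp --http --host 127.0.0.1 --port 8000   # streamable-http on the given host/port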
scrapling/core/_types.py CHANGED
@@ -12,9 +12,11 @@ from typing import (
      Generator,
      Iterable,
      List,
+     Set,
      Literal,
      Optional,
      Pattern,
+     Sequence,
      Tuple,
      TypeVar,
      Union,
@@ -22,6 +24,7 @@ from typing import (
      Mapping,
      Awaitable,
      Protocol,
+     Coroutine,
      SupportsIndex,
  )

@@ -39,6 +42,4 @@ except ImportError: # pragma: no cover
  try:
      from typing_extensions import Self # Backport
  except ImportError:
-     from typing import TypeVar
-
      Self = object
scrapling/core/ai.py CHANGED
@@ -20,6 +20,7 @@ from scrapling.core._types import (
      Mapping,
      Dict,
      List,
+     Any,
      SelectorWaitStates,
      Generator,
  )
@@ -42,10 +43,7 @@ def _ContentTranslator(content: Generator[str, None, None], page: _ScraplingResp


  class ScraplingMCPServer:
-     _server = FastMCP(name="Scrapling")
-
      @staticmethod
-     @_server.tool()
      def get(
          url: str,
          impersonate: Optional[BrowserTypeLiteral] = "chrome",
@@ -124,7 +122,6 @@ class ScraplingMCPServer:
          )

      @staticmethod
-     @_server.tool()
      async def bulk_get(
          urls: Tuple[str, ...],
          impersonate: Optional[BrowserTypeLiteral] = "chrome",
@@ -175,7 +172,7 @@ class ScraplingMCPServer:
          :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
          """
          async with FetcherSession() as session:
-             tasks = [
+             tasks: List[Any] = [
                  session.get(
                      url,
                      auth=auth,
@@ -211,7 +208,6 @@ class ScraplingMCPServer:
          ]

      @staticmethod
-     @_server.tool()
      async def fetch(
          url: str,
          extraction_type: extraction_types = "markdown",
@@ -263,7 +259,7 @@ class ScraplingMCPServer:
          :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
          :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
          :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
-         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
+         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
          :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
          :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
          :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
@@ -300,7 +296,6 @@ class ScraplingMCPServer:
          )

      @staticmethod
-     @_server.tool()
      async def bulk_fetch(
          urls: Tuple[str, ...],
          extraction_type: extraction_types = "markdown",
@@ -352,7 +347,7 @@ class ScraplingMCPServer:
          :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
          :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
          :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
-         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
+         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
          :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
          :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
          :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
@@ -394,7 +389,6 @@ class ScraplingMCPServer:
          ]

      @staticmethod
-     @_server.tool()
      async def stealthy_fetch(
          url: str,
          extraction_type: extraction_types = "markdown",
@@ -443,7 +437,7 @@ class ScraplingMCPServer:
          :param cookies: Set cookies for the next request.
          :param addons: List of Firefox addons to use. Must be paths to extracted addons.
          :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
-         :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
+         :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
          :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
          :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
          :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
@@ -494,7 +488,6 @@ class ScraplingMCPServer:
          )

      @staticmethod
-     @_server.tool()
      async def bulk_stealthy_fetch(
          urls: Tuple[str, ...],
          extraction_type: extraction_types = "markdown",
@@ -543,7 +536,7 @@ class ScraplingMCPServer:
          :param cookies: Set cookies for the next request.
          :param addons: List of Firefox addons to use. Must be paths to extracted addons.
          :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
-         :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
+         :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
          :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
          :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
          :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
@@ -598,6 +591,22 @@ class ScraplingMCPServer:
              for page in responses
          ]

-     def serve(self):
+     def serve(self, http: bool, host: str, port: int):
          """Serve the MCP server."""
-         self._server.run(transport="stdio")
+         server = FastMCP(name="Scrapling", host=host, port=port)
+         server.add_tool(self.get, title="get", description=self.get.__doc__, structured_output=True)
+         server.add_tool(self.bulk_get, title="bulk_get", description=self.bulk_get.__doc__, structured_output=True)
+         server.add_tool(self.fetch, title="fetch", description=self.fetch.__doc__, structured_output=True)
+         server.add_tool(
+             self.bulk_fetch, title="bulk_fetch", description=self.bulk_fetch.__doc__, structured_output=True
+         )
+         server.add_tool(
+             self.stealthy_fetch, title="stealthy_fetch", description=self.stealthy_fetch.__doc__, structured_output=True
+         )
+         server.add_tool(
+             self.bulk_stealthy_fetch,
+             title="bulk_stealthy_fetch",
+             description=self.bulk_stealthy_fetch.__doc__,
+             structured_output=True,
+         )
+         server.run(transport="stdio" if not http else "streamable-http")
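Note: tool registration moved from the class-level `@_server.tool()` decorator to `FastMCP.add_tool()` calls inside `serve()`, so the host and port can reach the `FastMCP` constructor. A minimal sketch of driving it directly (this mirrors what the `mcp` CLI command does):

    from scrapling.core.ai import ScraplingMCPServer

    server = ScraplingMCPServer()
    server.serve(False, "0.0.0.0", 8000)     # stdio transport; host/port only matter with http
    # server.serve(True, "127.0.0.1", 8000)  # streamable-http transport on the given host/port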
scrapling/core/custom_types.py CHANGED
@@ -5,6 +5,7 @@ from re import compile as re_compile, UNICODE, IGNORECASE
  from orjson import dumps, loads

  from scrapling.core._types import (
+     Any,
      cast,
      Dict,
      List,
@@ -14,7 +15,6 @@ from scrapling.core._types import (
      Literal,
      Pattern,
      Iterable,
-     Optional,
      Generator,
      SupportsIndex,
  )
@@ -33,23 +33,20 @@ class TextHandler(str):

      def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler": # pragma: no cover
          lst = super().__getitem__(key)
-         return cast(_TextHandlerType, TextHandler(lst))
+         return TextHandler(lst)

-     def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> "TextHandlers": # pragma: no cover
-         return TextHandlers(
-             cast(
-                 List[_TextHandlerType],
-                 [TextHandler(s) for s in super().split(sep, maxsplit)],
-             )
-         )
+     def split(
+         self, sep: str | None = None, maxsplit: SupportsIndex = -1
+     ) -> Union[List, "TextHandlers"]: # pragma: no cover
+         return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])

-     def strip(self, chars: str = None) -> Union[str, "TextHandler"]: # pragma: no cover
+     def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
          return TextHandler(super().strip(chars))

-     def lstrip(self, chars: str = None) -> Union[str, "TextHandler"]: # pragma: no cover
+     def lstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
          return TextHandler(super().lstrip(chars))

-     def rstrip(self, chars: str = None) -> Union[str, "TextHandler"]: # pragma: no cover
+     def rstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
          return TextHandler(super().rstrip(chars))

      def capitalize(self) -> Union[str, "TextHandler"]: # pragma: no cover
@@ -64,7 +61,7 @@ class TextHandler(str):
      def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]: # pragma: no cover
          return TextHandler(super().expandtabs(tabsize))

-     def format(self, *args: str, **kwargs: str) -> Union[str, "TextHandler"]: # pragma: no cover
+     def format(self, *args: object, **kwargs: str) -> Union[str, "TextHandler"]: # pragma: no cover
          return TextHandler(super().format(*args, **kwargs))

      def format_map(self, mapping) -> Union[str, "TextHandler"]: # pragma: no cover
@@ -131,10 +128,11 @@ class TextHandler(str):
      def re(
          self,
          regex: str | Pattern,
-         check_match: Literal[True],
          replace_entities: bool = True,
          clean_match: bool = False,
          case_sensitive: bool = True,
+         *,
+         check_match: Literal[True],
      ) -> bool: ...

      @overload
@@ -179,19 +177,14 @@ class TextHandler(str):
          results = flatten(results)

          if not replace_entities:
-             return TextHandlers(cast(List[_TextHandlerType], [TextHandler(string) for string in results]))
+             return TextHandlers([TextHandler(string) for string in results])

-         return TextHandlers(
-             cast(
-                 List[_TextHandlerType],
-                 [TextHandler(_replace_entities(s)) for s in results],
-             )
-         )
+         return TextHandlers([TextHandler(_replace_entities(s)) for s in results])

      def re_first(
          self,
          regex: str | Pattern,
-         default=None,
+         default: Any = None,
          replace_entities: bool = True,
          clean_match: bool = False,
          case_sensitive: bool = True,
@@ -232,8 +225,8 @@ class TextHandlers(List[TextHandler]):
      def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, "TextHandlers"]:
          lst = super().__getitem__(pos)
          if isinstance(pos, slice):
-             return TextHandlers(cast(List[_TextHandlerType], lst))
-         return cast(_TextHandlerType, TextHandler(lst))
+             return TextHandlers(cast(List[TextHandler], lst))
+         return TextHandler(cast(TextHandler, lst))

      def re(
          self,
@@ -256,7 +249,7 @@ class TextHandlers(List[TextHandler]):
      def re_first(
          self,
          regex: str | Pattern,
-         default=None,
+         default: Any = None,
          replace_entities: bool = True,
          clean_match: bool = False,
          case_sensitive: bool = True,
@@ -309,9 +302,9 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
          )

          # Fastest read-only mapping type
-         self._data = MappingProxyType(mapping)
+         self._data: Mapping[str, Any] = MappingProxyType(mapping)

-     def get(self, key: str, default: Optional[str] = None) -> Optional[_TextHandlerType]:
+     def get(self, key: str, default: Any = None) -> _TextHandlerType:
          """Acts like the standard dictionary `.get()` method"""
          return self._data.get(key, default)

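Note: the first `re()` overload now takes `check_match` as keyword-only, `re_first()` accepts a default of any type, and `split()` returns `TextHandlers` of `TextHandler` items. A hedged sketch of the call patterns (outputs shown approximately):

    from scrapling.core.custom_types import TextHandler

    text = TextHandler("Price: 120 USD")
    text.re(r"\d+")                    # roughly TextHandlers(['120']) of TextHandler items
    text.re(r"\d+", check_match=True)  # True -- keyword-only after this change
    text.re_first(r"\d+", default=0)   # default may now be any type, not only str
    text.split()                       # TextHandlers of TextHandler words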
scrapling/core/mixins.py CHANGED
@@ -1,3 +1,9 @@
+ from scrapling.core._types import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from scrapling.parser import Selector
+
+
  class SelectorsGeneration:
      """
      Functions for generating selectors
@@ -5,7 +11,7 @@ class SelectorsGeneration:
      Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
      """

-     def __general_selection(self, selection: str = "css", full_path: bool = False) -> str:
+     def _general_selection(self: "Selector", selection: str = "css", full_path: bool = False) -> str: # type: ignore[name-defined]
          """Generate a selector for the current element.
          :return: A string of the generated selector.
          """
@@ -47,29 +53,29 @@ class SelectorsGeneration:
          return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))

      @property
-     def generate_css_selector(self) -> str:
+     def generate_css_selector(self: "Selector") -> str: # type: ignore[name-defined]
          """Generate a CSS selector for the current element
          :return: A string of the generated selector.
          """
-         return self.__general_selection()
+         return self._general_selection()

      @property
-     def generate_full_css_selector(self) -> str:
+     def generate_full_css_selector(self: "Selector") -> str: # type: ignore[name-defined]
          """Generate a complete CSS selector for the current element
          :return: A string of the generated selector.
          """
-         return self.__general_selection(full_path=True)
+         return self._general_selection(full_path=True)

      @property
-     def generate_xpath_selector(self) -> str:
+     def generate_xpath_selector(self: "Selector") -> str: # type: ignore[name-defined]
          """Generate an XPath selector for the current element
          :return: A string of the generated selector.
          """
-         return self.__general_selection("xpath")
+         return self._general_selection("xpath")

      @property
-     def generate_full_xpath_selector(self) -> str:
+     def generate_full_xpath_selector(self: "Selector") -> str: # type: ignore[name-defined]
          """Generate a complete XPath selector for the current element
          :return: A string of the generated selector.
          """
-         return self.__general_selection("xpath", full_path=True)
+         return self._general_selection("xpath", full_path=True)
scrapling/core/shell.py CHANGED
@@ -22,14 +22,16 @@ from logging import (
  from orjson import loads as json_loads, JSONDecodeError

  from scrapling import __version__
+ from scrapling.core.utils import log
  from scrapling.parser import Selector, Selectors
  from scrapling.core.custom_types import TextHandler
  from scrapling.engines.toolbelt.custom import Response
- from scrapling.core.utils import log, _ParseHeaders, _CookieParser
+ from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
  from scrapling.core._types import (
      Optional,
      Dict,
      Any,
+     cast,
      extraction_types,
      Generator,
  )
@@ -539,15 +541,15 @@ class Convertor:
              raise ValueError(f"Unknown extraction type: {extraction_type}")
          else:
              if main_content_only:
-                 page = page.css_first("body") or page
+                 page = cast(Selector, page.css_first("body")) or page

-             pages = [page] if not css_selector else page.css(css_selector)
+             pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
              for page in pages:
                  match extraction_type:
                      case "markdown":
                          yield cls._convert_to_markdown(page.html_content)
                      case "html":
-                         yield page.body
+                         yield page.html_content
                      case "text":
                          txt_content = page.get_all_text(strip=True)
                          for s in (
scrapling/core/storage.py CHANGED
@@ -6,7 +6,6 @@ from sqlite3 import connect as db_connect

  from orjson import dumps, loads
  from lxml.html import HtmlElement
- from tldextract import extract as tld

  from scrapling.core.utils import _StorageTools, log
  from scrapling.core._types import Dict, Optional, Any
@@ -26,6 +25,8 @@ class StorageSystemMixin(ABC): # pragma: no cover
              return default_value

          try:
+             from tldextract import extract as tld
+
              extracted = tld(self.url)
              return extracted.top_domain_under_public_suffix or extracted.domain or default_value
          except AttributeError:
@@ -55,13 +56,13 @@ class StorageSystemMixin(ABC): # pragma: no cover
      @lru_cache(128, typed=True)
      def _get_hash(identifier: str) -> str:
          """If you want to hash identifier in your storage system, use this safer"""
-         identifier = identifier.lower().strip()
-         if isinstance(identifier, str):
+         _identifier = identifier.lower().strip()
+         if isinstance(_identifier, str):
              # Hash functions have to take bytes
-             identifier = identifier.encode("utf-8")
+             _identifier = _identifier.encode("utf-8")

-         hash_value = sha256(identifier).hexdigest()
-         return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
+         hash_value = sha256(_identifier).hexdigest()
+         return f"{hash_value}_{len(_identifier)}" # Length to reduce collision chance


  @lru_cache(1, typed=True)
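Note: moving the `tldextract` import inside the method defers a comparatively heavy dependency until the storage system actually needs it, in line with the lazy-loading direction of `__init__.py`. A standalone sketch of the same deferred-import pattern (hypothetical helper, not the package's API):

    def extract_base_domain(url: str, default: str = "default") -> str:
        """Resolve the registrable domain lazily so importing the module stays cheap."""
        try:
            from tldextract import extract as tld  # imported only on first use

            parts = tld(url)
            return parts.top_domain_under_public_suffix or parts.domain or default
        except (ImportError, AttributeError):
            return default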
scrapling/core/translator.py CHANGED
@@ -10,24 +10,23 @@ So you don't have to learn a new selectors/api method like what bs4 done with so

  from functools import lru_cache

- from cssselect.xpath import ExpressionError
- from cssselect.xpath import XPathExpr as OriginalXPathExpr
  from cssselect import HTMLTranslator as OriginalHTMLTranslator
+ from cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr
  from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement

- from scrapling.core._types import Any, Optional, Protocol, Self
+ from scrapling.core._types import Any, Protocol, Self


  class XPathExpr(OriginalXPathExpr):
      textnode: bool = False
-     attribute: Optional[str] = None
+     attribute: str | None = None

      @classmethod
      def from_xpath(
          cls,
          xpath: OriginalXPathExpr,
          textnode: bool = False,
-         attribute: Optional[str] = None,
+         attribute: str | None = None,
      ) -> Self:
          x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
          x.textnode = textnode
@@ -71,10 +70,10 @@ class XPathExpr(OriginalXPathExpr):

  # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
  class TranslatorProtocol(Protocol):
-     def xpath_element(self, selector: Element) -> OriginalXPathExpr: # pragma: no cover
+     def xpath_element(self, selector: Element) -> OriginalXPathExpr: # pyright: ignore # pragma: no cover
          pass

-     def css_to_xpath(self, css: str, prefix: str = ...) -> str: # pragma: no cover
+     def css_to_xpath(self, css: str, prefix: str = ...) -> str: # pyright: ignore # pragma: no cover
          pass


@@ -121,9 +120,15 @@ class TranslatorMixin:


  class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-     @lru_cache(maxsize=256)
      def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
          return super().css_to_xpath(css, prefix)


  translator = HTMLTranslator()
+ # Using a function instead of the translator directly to avoid Pyright override error
+
+
+ @lru_cache(maxsize=256)
+ def css_to_xpath(query: str) -> str:
+     """Return translated XPath version of a given CSS query"""
+     return translator.css_to_xpath(query)
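Note: caching moved from the overriding method to a module-level `css_to_xpath()` wrapper, so `lru_cache` no longer changes the method's signature in a way Pyright flags. A hedged usage sketch; the exact XPath output depends on the installed cssselect version:

    from scrapling.core.translator import css_to_xpath

    xpath = css_to_xpath("div.product > a[href]")
    print(xpath)  # something like: descendant-or-self::div[...product...]/a[@href]
    # repeated calls with the same query hit the lru_cache instead of re-translating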
scrapling/core/utils/__init__.py CHANGED
@@ -7,4 +7,3 @@ from ._utils import (
      clean_spaces,
      html_forbidden,
  )
- from ._shell import _CookieParser, _ParseHeaders
scrapling/engines/_browsers/__init__.py CHANGED
@@ -1,2 +0,0 @@
- from ._controllers import DynamicSession, AsyncDynamicSession
- from ._camoufox import StealthySession, AsyncStealthySession