scrapling 0.2.92__py3-none-any.whl → 0.2.93__py3-none-any.whl
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +2 -1
- scrapling/core/custom_types.py +91 -39
- scrapling/core/translator.py +1 -1
- scrapling/defaults.py +8 -5
- scrapling/engines/camo.py +6 -2
- scrapling/engines/pw.py +1 -1
- scrapling/fetchers.py +5 -5
- scrapling/parser.py +153 -189
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/METADATA +58 -32
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/RECORD +17 -17
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/WHEEL +1 -1
- tests/fetchers/async/test_playwright.py +1 -1
- tests/fetchers/sync/test_playwright.py +1 -1
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/LICENSE +0 -0
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.92"
+__version__ = "0.2.93"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling/core/_types.py
CHANGED
@@ -3,7 +3,8 @@ Type definitions for type checking purposes.
 """
 
 from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
-                    List, Literal, Optional, Pattern, Tuple, Type, Union)
+                    List, Literal, Optional, Pattern, Tuple, Type, TypeVar,
+                    Union)
 
 SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
 
scrapling/core/custom_types.py
CHANGED
@@ -1,13 +1,18 @@
 import re
+import typing
 from collections.abc import Mapping
 from types import MappingProxyType
 
 from orjson import dumps, loads
 from w3lib.html import replace_entities as _replace_entities
 
-from scrapling.core._types import Dict, List,
+from scrapling.core._types import (Dict, Iterable, List, Literal, Optional,
+                                   Pattern, SupportsIndex, TypeVar, Union)
 from scrapling.core.utils import _is_iterable, flatten
 
+# Define type variable for AttributeHandler value type
+_TextHandlerType = TypeVar('_TextHandlerType', bound='TextHandler')
+
 
 class TextHandler(str):
     """Extends standard Python string by adding more functionality"""
@@ -18,72 +23,89 @@ class TextHandler(str):
             return super().__new__(cls, string)
         return super().__new__(cls, '')
 
-
-
-
-
+    @typing.overload
+    def __getitem__(self, key: SupportsIndex) -> 'TextHandler':
+        pass
+
+    @typing.overload
+    def __getitem__(self, key: slice) -> "TextHandlers":
+        pass
+
+    def __getitem__(self, key: Union[SupportsIndex, slice]) -> Union["TextHandler", "TextHandlers"]:
+        lst = super().__getitem__(key)
+        if isinstance(key, slice):
+            lst = [TextHandler(s) for s in lst]
+            return TextHandlers(typing.cast(List[_TextHandlerType], lst))
+        return typing.cast(_TextHandlerType, TextHandler(lst))
+
+    def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> 'TextHandlers':
+        return TextHandlers(
+            typing.cast(List[_TextHandlerType], [TextHandler(s) for s in super().split(sep, maxsplit)])
+        )
+
+    def strip(self, chars: str = None) -> Union[str, 'TextHandler']:
         return TextHandler(super().strip(chars))
 
-    def lstrip(self, chars=None):
+    def lstrip(self, chars: str = None) -> Union[str, 'TextHandler']:
         return TextHandler(super().lstrip(chars))
 
-    def rstrip(self, chars=None):
+    def rstrip(self, chars: str = None) -> Union[str, 'TextHandler']:
         return TextHandler(super().rstrip(chars))
 
-    def capitalize(self):
+    def capitalize(self) -> Union[str, 'TextHandler']:
         return TextHandler(super().capitalize())
 
-    def casefold(self):
+    def casefold(self) -> Union[str, 'TextHandler']:
         return TextHandler(super().casefold())
 
-    def center(self, width, fillchar=' '):
+    def center(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
         return TextHandler(super().center(width, fillchar))
 
-    def expandtabs(self, tabsize=8):
+    def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, 'TextHandler']:
         return TextHandler(super().expandtabs(tabsize))
 
-    def format(self, *args, **kwargs):
+    def format(self, *args: str, **kwargs: str) -> Union[str, 'TextHandler']:
         return TextHandler(super().format(*args, **kwargs))
 
-    def format_map(self, mapping):
+    def format_map(self, mapping) -> Union[str, 'TextHandler']:
         return TextHandler(super().format_map(mapping))
 
-    def join(self, iterable):
+    def join(self, iterable: Iterable[str]) -> Union[str, 'TextHandler']:
         return TextHandler(super().join(iterable))
 
-    def ljust(self, width, fillchar=' '):
+    def ljust(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
         return TextHandler(super().ljust(width, fillchar))
 
-    def rjust(self, width, fillchar=' '):
+    def rjust(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
         return TextHandler(super().rjust(width, fillchar))
 
-    def swapcase(self):
+    def swapcase(self) -> Union[str, 'TextHandler']:
         return TextHandler(super().swapcase())
 
-    def title(self):
+    def title(self) -> Union[str, 'TextHandler']:
         return TextHandler(super().title())
 
-    def translate(self, table):
+    def translate(self, table) -> Union[str, 'TextHandler']:
         return TextHandler(super().translate(table))
 
-    def zfill(self, width):
+    def zfill(self, width: SupportsIndex) -> Union[str, 'TextHandler']:
         return TextHandler(super().zfill(width))
 
-    def replace(self, old, new, count
+    def replace(self, old: str, new: str, count: SupportsIndex = -1) -> Union[str, 'TextHandler']:
         return TextHandler(super().replace(old, new, count))
 
-    def upper(self):
+    def upper(self) -> Union[str, 'TextHandler']:
         return TextHandler(super().upper())
 
-    def lower(self):
+    def lower(self) -> Union[str, 'TextHandler']:
         return TextHandler(super().lower())
     ##############
 
-    def sort(self, reverse: bool = False) -> str:
+    def sort(self, reverse: bool = False) -> Union[str, 'TextHandler']:
         """Return a sorted version of the string"""
         return self.__class__("".join(sorted(self, reverse=reverse)))
 
-    def clean(self) -> str:
+    def clean(self) -> Union[str, 'TextHandler']:
         """Return a new version of the string after removing all white spaces and consecutive spaces"""
         data = re.sub(r'[\t|\r|\n]', '', self)
         data = re.sub(' +', ' ', data)
@@ -105,10 +127,32 @@ class TextHandler(str):
         # Check this out: https://github.com/ijl/orjson/issues/445
         return loads(str(self))
 
+    @typing.overload
+    def re(
+        self,
+        regex: Union[str, Pattern[str]],
+        check_match: Literal[True],
+        replace_entities: bool = True,
+        clean_match: bool = False,
+        case_sensitive: bool = False,
+    ) -> bool:
+        ...
+
+    @typing.overload
+    def re(
+        self,
+        regex: Union[str, Pattern[str]],
+        replace_entities: bool = True,
+        clean_match: bool = False,
+        case_sensitive: bool = False,
+        check_match: Literal[False] = False,
+    ) -> "TextHandlers[TextHandler]":
+        ...
+
     def re(
             self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
             case_sensitive: bool = False, check_match: bool = False
-    ) -> Union[
+    ) -> Union["TextHandlers[TextHandler]", bool]:
         """Apply the given regex to the current text and return a list of strings with the matches.
 
         :param regex: Can be either a compiled regular expression or a string.
@@ -133,12 +177,12 @@ class TextHandler(str):
         results = flatten(results)
 
         if not replace_entities:
-            return [TextHandler(string) for string in results]
+            return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(string) for string in results]))
 
-        return [TextHandler(_replace_entities(s)) for s in results]
+        return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(_replace_entities(s)) for s in results]))
 
     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool = False) ->
+                 clean_match: bool = False, case_sensitive: bool = False) -> "TextHandler":
         """Apply the given regex to text and return the first match if found, otherwise return the default value.
 
         :param regex: Can be either a compiled regular expression or a string.
@@ -158,15 +202,23 @@ class TextHandlers(List[TextHandler]):
     """
     __slots__ = ()
 
-
+    @typing.overload
+    def __getitem__(self, pos: SupportsIndex) -> TextHandler:
+        pass
+
+    @typing.overload
+    def __getitem__(self, pos: slice) -> "TextHandlers":
+        pass
+
+    def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[TextHandler, "TextHandlers"]:
         lst = super().__getitem__(pos)
         if isinstance(pos, slice):
-
-
-
+            lst = [TextHandler(s) for s in lst]
+            return TextHandlers(typing.cast(List[_TextHandlerType], lst))
+        return typing.cast(_TextHandlerType, TextHandler(lst))
 
     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
-           case_sensitive: bool = False) -> '
+           case_sensitive: bool = False) -> 'TextHandlers[TextHandler]':
         """Call the ``.re()`` method for each element in this list and return
         their results flattened as TextHandlers.
 
@@ -178,10 +230,10 @@ class TextHandlers(List[TextHandler]):
         results = [
            n.re(regex, replace_entities, clean_match, case_sensitive) for n in self
         ]
-        return flatten(results)
+        return TextHandlers(flatten(results))
 
     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool = False) ->
+                 clean_match: bool = False, case_sensitive: bool = False) -> TextHandler:
         """Call the ``.re_first()`` method for each element in this list and return
         the first result or the default value otherwise.
 
@@ -210,7 +262,7 @@ class TextHandlers(List[TextHandler]):
     get_all = extract
 
 
-class AttributesHandler(Mapping):
+class AttributesHandler(Mapping[str, _TextHandlerType]):
     """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
     If standard dictionary is needed, just convert this class to dictionary with `dict` function
     """
@@ -231,7 +283,7 @@ class AttributesHandler(Mapping):
         # Fastest read-only mapping type
         self._data = MappingProxyType(mapping)
 
-    def get(self, key, default=None):
+    def get(self, key: str, default: Optional[str] = None) -> Union[_TextHandlerType, None]:
         """Acts like standard dictionary `.get()` method"""
         return self._data.get(key, default)
 
@@ -253,7 +305,7 @@ class AttributesHandler(Mapping):
         """Convert current attributes to JSON string if the attributes are JSON serializable otherwise throws error"""
         return dumps(dict(self._data))
 
-    def __getitem__(self, key):
+    def __getitem__(self, key: str) -> _TextHandlerType:
         return self._data[key]
 
     def __iter__(self):
scrapling/core/translator.py
CHANGED
@@ -139,6 +139,6 @@ class TranslatorMixin:
 
 
 class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-    @lru_cache(maxsize=
+    @lru_cache(maxsize=2048)
     def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
         return super().css_to_xpath(css, prefix)
scrapling/defaults.py
CHANGED
@@ -1,7 +1,10 @@
-from .fetchers import AsyncFetcher
+from .fetchers import AsyncFetcher as _AsyncFetcher
+from .fetchers import Fetcher as _Fetcher
+from .fetchers import PlayWrightFetcher as _PlayWrightFetcher
+from .fetchers import StealthyFetcher as _StealthyFetcher
 
 # If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
-Fetcher =
-AsyncFetcher =
-StealthyFetcher =
-PlayWrightFetcher =
+Fetcher = _Fetcher()
+AsyncFetcher = _AsyncFetcher()
+StealthyFetcher = _StealthyFetcher()
+PlayWrightFetcher = _PlayWrightFetcher()
scrapling/engines/camo.py
CHANGED
@@ -19,7 +19,7 @@ class CamoufoxEngine:
                  block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
                  timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
                  wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
-                 proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] =
+                 proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False,
                  geoip: Optional[bool] = False,
                  adaptor_arguments: Dict = None,
                  ):
@@ -36,7 +36,7 @@ class CamoufoxEngine:
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
-        :param disable_ads:
+        :param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
@@ -95,6 +95,8 @@ class CamoufoxEngine:
         with Camoufox(
             geoip=self.geoip,
             proxy=self.proxy,
+            disable_coop=True,
+            enable_cache=True,
             addons=self.addons,
             exclude_addons=addons,
             headless=self.headless,
@@ -174,6 +176,8 @@ class CamoufoxEngine:
         async with AsyncCamoufox(
             geoip=self.geoip,
             proxy=self.proxy,
+            disable_coop=True,
+            enable_cache=True,
             addons=self.addons,
             exclude_addons=addons,
             headless=self.headless,
scrapling/engines/pw.py
CHANGED
@@ -105,7 +105,7 @@ class PlaywrightEngine:
         """
         cdp_url = self.cdp_url
         if self.nstbrowser_mode:
-            if self.nstbrowser_config and
+            if self.nstbrowser_config and isinstance(self.nstbrowser_config, dict):
                 config = self.nstbrowser_config
             else:
                 query = NSTBROWSER_DEFAULT_QUERY.copy()
scrapling/fetchers.py
CHANGED
@@ -143,7 +143,7 @@ class AsyncFetcher(Fetcher):
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         adaptor_arguments = tuple(self.adaptor_arguments.items())
-        response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).
+        response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_put(**kwargs)
         return response_object
 
     async def delete(
@@ -177,7 +177,7 @@ class StealthyFetcher(BaseFetcher):
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
-            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] =
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False, geoip: Optional[bool] = False,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -191,7 +191,7 @@ class StealthyFetcher(BaseFetcher):
             This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
         :param block_webrtc: Blocks WebRTC entirely.
         :param addons: List of Firefox addons to use. Must be paths to extracted addons.
-        :param disable_ads:
+        :param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
         :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
@@ -235,7 +235,7 @@ class StealthyFetcher(BaseFetcher):
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
-            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] =
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False, geoip: Optional[bool] = False,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -249,7 +249,7 @@ class StealthyFetcher(BaseFetcher):
             This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
         :param block_webrtc: Blocks WebRTC entirely.
         :param addons: List of Firefox addons to use. Must be paths to extracted addons.
-        :param disable_ads:
+        :param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
         :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.