PyPI - scrapling - Versions diffs - 0.1__py3-none-any.whl - Mend

scrapling 0.1__py3-none-any.whl

Files changed (12) hide show

scrapling/__init__.py +10 -0
scrapling/custom_types.py +146 -0
scrapling/mixins.py +74 -0
scrapling/parser.py +908 -0
scrapling/storage_adaptors.py +149 -0
scrapling/translator.py +153 -0
scrapling/utils.py +164 -0
scrapling-0.1.dist-info/LICENSE +28 -0
scrapling-0.1.dist-info/METADATA +475 -0
scrapling-0.1.dist-info/RECORD +12 -0
scrapling-0.1.dist-info/WHEEL +5 -0
scrapling-0.1.dist-info/top_level.txt +1 -0

scrapling/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+# Declare top-level shortcuts
+# (re-exported so they can be imported straight from the `scrapling` package)
+from scrapling.parser import Adaptor, Adaptors
+from scrapling.custom_types import TextHandler, AttributesHandler
+# Package metadata
+__author__ = "Karim Shoair (karim.shoair@pm.me)"
+__version__ = "0.1"
+__copyright__ = "Copyright (c) 2024 Karim Shoair"
+# Names exported by `from scrapling import *`
+__all__ = ['Adaptor', 'Adaptors', 'TextHandler', 'AttributesHandler']

scrapling/custom_types.py ADDED Viewed

@@ -0,0 +1,146 @@
+import re
+from types import MappingProxyType
+from collections.abc import Mapping
+from typing import Dict, List, Union, Pattern
+from scrapling.utils import _is_iterable, flatten
+from orjson import loads, dumps
+from w3lib.html import replace_entities as _replace_entities
+class TextHandler(str):
+    """Extends standard Python string by adding more functionality"""
+    __slots__ = ()
+    def __new__(cls, string):
+        # Because str is immutable and we can't override __init__
+        if type(string) is str:
+            return super().__new__(cls, string)
+        else:
+            return super().__new__(cls, '')
+    def sort(self, reverse: bool = False) -> str:
+        """Return a sorted version of the string"""
+        return self.__class__("".join(sorted(self, reverse=reverse)))
+    def clean(self) -> str:
+        """Return a new version of the string after removing all white spaces and consecutive spaces"""
+        data = re.sub(r'[\t|\r|\n]', '', self)
+        data = re.sub(' +', ' ', data)
+        return self.__class__(data.strip())
+    def json(self) -> Dict:
+        """Return json response if the response is jsonable otherwise throw error"""
+        # Using __str__ function as a workaround for orjson issue with subclasses of str
+        # Check this out: https://github.com/ijl/orjson/issues/445
+        return loads(self.__str__())
+    def re(
+            self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
+            case_sensitive: bool = False, check_match: bool = False
+    ) -> Union[List[str], bool]:
+        """Apply the given regex to the current text and return a list of strings with the matches.
+        :param regex: Can be either a compiled regular expression or a string.
+        :param replace_entities: if enabled character entity references are replaced by their corresponding character
+        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
+        :param check_match: used to quickly check if this regex matches or not without any operations on the results
+        """
+        if isinstance(regex, str):
+            if not case_sensitive:
+                regex = re.compile(regex, re.UNICODE)
+            else:
+                regex = re.compile(regex, flags=re.UNICODE | re.IGNORECASE)
+        input_text = self.clean() if clean_match else self
+        results = regex.findall(input_text)
+        if check_match:
+            return bool(results)
+        if all(_is_iterable(res) for res in results):
+            results = flatten(results)
+        if not replace_entities:
+            return [TextHandler(string) for string in results]
+        return [TextHandler(_replace_entities(s)) for s in results]
+    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
+                 clean_match: bool = False, case_sensitive: bool = False,):
+        """Apply the given regex to text and return the first match if found, otherwise return the default value.
+        :param regex: Can be either a compiled regular expression or a string.
+        :param default: The default value to be returned if there is no match
+        :param replace_entities: if enabled character entity references are replaced by their corresponding character
+        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
+        """
+        result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
+        return result[0] if result else default
+class AttributesHandler(Mapping):
+    """A read-only mapping to use instead of the standard dictionary for the speed boost but
+     at the same time I use it to add more functionalities.
+    If standard dictionary is needed, just convert this class to dictionary with `dict` function
+    """
+    __slots__ = ('_data',)
+    def __init__(self, mapping=None, **kwargs):
+        mapping = {
+            key: TextHandler(value) if type(value) is str else value
+            for key, value in mapping.items()
+        } if mapping is not None else {}
+        if kwargs:
+            mapping.update({
+                key: TextHandler(value) if type(value) is str else value
+                for key, value in kwargs.items()
+            })
+        # Fastest read-only mapping type
+        self._data = MappingProxyType(mapping)
+    def get(self, key, default=None):
+        """Acts like standard dictionary `.get()` method"""
+        return self._data.get(key, default)
+    def search_values(self, keyword, partial=False):
+        """Search current attributes by values and return dictionary of each matching item
+        :param keyword: The keyword to search for in the attributes values
+        :param partial: If True, the function will search if keyword in each value instead of perfect match
+        """
+        for key, value in self._data.items():
+            if partial:
+                if keyword in value:
+                    yield AttributesHandler({key: value})
+            else:
+                if keyword == value:
+                    yield AttributesHandler({key: value})
+    @property
+    def json_string(self):
+        """Convert current attributes to JSON string if the attributes are JSON serializable otherwise throws error"""
+        return dumps(dict(self._data))
+    def __getitem__(self, key):
+        return self._data[key]
+    def __iter__(self):
+        return iter(self._data)
+    def __len__(self):
+        return len(self._data)
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self._data})"
+    def __str__(self):
+        return str(self._data)
+    def __contains__(self, key):
+        return key in self._data

scrapling/mixins.py ADDED Viewed

@@ -0,0 +1,74 @@
+class SelectorsGeneration:
+    """Selectors generation functions
+    Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
+    Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""
+    def __general_selection(self, selection: str = 'css') -> str:
+        """Generate a selector for the current element.
+        :return: A string of the generated selector.
+        """
+        selectorPath = []
+        target = self
+        css = selection.lower() == 'css'
+        while target is not None:
+            if target.parent:
+                if target.attrib.get('id'):
+                    # id is enough
+                    part = (
+                        f'#{target.attrib["id"]}' if css
+                        else f"[@id='{target.attrib['id']}']"
+                    )
+                    selectorPath.append(part)
+                    return (
+                        " > ".join(reversed(selectorPath)) if css
+                        else '//*' + "/".join(reversed(selectorPath))
+                    )
+                else:
+                    part = f'{target.tag}'
+                    # We won't use classes anymore because I some websites share exact classes between elements
+                    # classes = target.attrib.get('class', '').split()
+                    # if classes and css:
+                    #     part += f".{'.'.join(classes)}"
+                    # else:
+                    counter = {}
+                    for child in target.parent.children:
+                        counter.setdefault(child.tag, 0)
+                        counter[child.tag] += 1
+                        if child._root == target._root:
+                            break
+                    if counter[target.tag] > 1:
+                        part += (
+                            f":nth-of-type({counter[target.tag]})" if css
+                            else f"[{counter[target.tag]}]"
+                        )
+                selectorPath.append(part)
+                target = target.parent
+                if target is None or target.tag == 'html':
+                    return (
+                        " > ".join(reversed(selectorPath)) if css
+                        else '//' + "/".join(reversed(selectorPath))
+                    )
+            else:
+                break
+        return (
+            " > ".join(reversed(selectorPath)) if css
+            else '//' + "/".join(reversed(selectorPath))
+        )
+    @property
+    def css_selector(self) -> str:
+        """Generate a CSS selector for the current element
+        :return: A string of the generated selector.
+        """
+        return self.__general_selection()
+    @property
+    def xpath_selector(self) -> str:
+        """Generate a XPath selector for the current element
+        :return: A string of the generated selector.
+        """
+        return self.__general_selection('xpath')