scrapling 0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,149 @@
1
+ import orjson
2
+ import sqlite3
3
+ import logging
4
+ import threading
5
+ from hashlib import sha256
6
+ from abc import ABC, abstractmethod
7
+ from typing import Dict, Optional, Union
8
+
9
+ from scrapling.utils import _StorageTools, cache
10
+
11
+ from lxml import html
12
+ from tldextract import extract as tld
13
+
14
+
15
class StorageSystemMixin(ABC):
    """Base class for element-storage backends.

    If you want to make your own storage system, you have to inherit from this
    class and implement `save` and `retrieve`.
    """

    def __init__(self, url: Union[str, None] = None):
        """
        :param url: URL of the website we are working on to separate it from other websites data
        """
        self.url = url

    # NOTE(review): lru_cache on an instance method keeps the instance alive for
    # the cache's lifetime (ruff B019); acceptable here if instances are long-lived.
    @cache(None, typed=True)
    def _get_base_url(self, default_value: str = 'default') -> str:
        """Return the registered domain of `self.url`, or `default_value` when it cannot be determined."""
        if not self.url or not isinstance(self.url, str):
            return default_value

        try:
            extracted = tld(self.url)
            return extracted.registered_domain or extracted.domain or default_value
        except AttributeError:
            return default_value

    @abstractmethod
    def save(self, element: html.HtmlElement, identifier: str) -> None:
        """Saves the element's unique properties to the storage for retrieval and relocation later

        :param element: The element itself that we want to save to storage.
        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
            the docs for more info.
        """
        raise NotImplementedError('Storage system must implement `save` method')

    @abstractmethod
    def retrieve(self, identifier: str) -> Optional[Dict]:
        """Using the identifier, we search the storage and return the unique properties of the element

        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
            the docs for more info.
        :return: A dictionary of the unique properties
        """
        # Fixed: this message previously said `save` instead of `retrieve`.
        raise NotImplementedError('Storage system must implement `retrieve` method')

    @staticmethod
    @cache(None, typed=True)
    def _get_hash(identifier: str) -> str:
        """If you want to hash the identifier in your storage system, use this helper."""
        identifier = identifier.lower().strip()
        if isinstance(identifier, str):
            # Hash functions have to take bytes
            identifier = identifier.encode('utf-8')

        hash_value = sha256(identifier).hexdigest()
        return f"{hash_value}_{len(identifier)}"  # Length to reduce collision chance
65
+
66
+
67
@cache(None, typed=True)
class SQLiteStorageSystem(StorageSystemMixin):
    """The recommended system to use, it's race condition safe and thread safe.
    Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
    > It's optimized for threaded applications but running it without threads shouldn't make it slow.

    NOTE: the class itself is wrapped in `lru_cache`, so constructing it again with
    the same arguments returns the same cached instance (lightweight singleton).
    """
    def __init__(self, storage_file: str, url: Union[str, None] = None):
        """
        :param storage_file: File to be used to store elements
        :param url: URL of the website we are working on to separate it from other websites data
        """
        super().__init__(url)
        self.storage_file = storage_file
        # We use a threading.Lock to ensure thread-safety instead of relying on thread-local storage.
        self.lock = threading.Lock()
        # `check_same_thread=False` allows the connection to be used across different
        # threads; our own lock serializes all access to it.
        self.connection = sqlite3.connect(self.storage_file, check_same_thread=False)
        # WAL (Write-Ahead Logging) allows for better concurrency.
        self.connection.execute("PRAGMA journal_mode=WAL")
        self.cursor = self.connection.cursor()
        self._setup_database()
        logging.debug(
            f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
        )

    def _setup_database(self) -> None:
        """Create the storage table if needed; (url, identifier) pairs are unique."""
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS storage (
                id INTEGER PRIMARY KEY,
                url TEXT,
                identifier TEXT,
                element_data TEXT,
                UNIQUE (url, identifier)
            )
        """)
        self.connection.commit()

    def save(self, element: html.HtmlElement, identifier: str):
        """Saves the elements unique properties to the storage for retrieval and relocation later

        :param element: The element itself that we want to save to storage.
        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
            the docs for more info.
        """
        url = self._get_base_url()
        element_data = _StorageTools.element_to_dict(element)
        with self.lock:
            # `INSERT OR REPLACE` so saving the same (url, identifier) twice updates the row.
            self.cursor.execute("""
                INSERT OR REPLACE INTO storage (url, identifier, element_data)
                VALUES (?, ?, ?)
            """, (url, identifier, orjson.dumps(element_data)))
            # Fixed: removed a pointless `cursor.fetchall()` — INSERT returns no rows.
            self.connection.commit()

    def retrieve(self, identifier: str) -> Optional[Dict]:
        """Using the identifier, we search the storage and return the unique properties of the element

        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
            the docs for more info.
        :return: A dictionary of the unique properties, or None when nothing was stored under this identifier.
        """
        url = self._get_base_url()
        with self.lock:
            self.cursor.execute(
                "SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
                (url, identifier)
            )
            result = self.cursor.fetchone()
        if result:
            return orjson.loads(result[0])
        return None

    def close(self):
        """Close the cursor and connection; useful with things like scrapy's Spider.closed() function/signal."""
        with self.lock:
            # Idempotent: a second call (e.g. explicit close() followed by __del__)
            # is a no-op instead of raising sqlite3.ProgrammingError.
            if self.connection is not None:
                self.connection.commit()
                self.cursor.close()
                self.connection.close()
                self.connection = None

    def __del__(self):
        """To ensure all connections are closed when the object is destroyed."""
        try:
            self.close()
        except Exception:
            # `__init__` may have failed before `lock`/`connection` existed, or the
            # interpreter may be shutting down — never raise from a destructor.
            pass
@@ -0,0 +1,153 @@
1
+ """
2
+ Most of this file is adapted version of the translator of parsel library with some modifications simply for 1 important reason...
3
+ To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format
4
+ which will be important in future releases but most importantly...
5
+ so you don't have to learn a new selectors/API method like what bs4 did with soupsieve :)
6
+ > if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
7
+ """
8
+
9
+ import re
10
+
11
+ from w3lib.html import HTML5_WHITESPACE
12
+ from typing import TYPE_CHECKING, Any, Optional
13
+ try:
14
+ from typing import Protocol
15
+ except ImportError:
16
+ # Added in Python 3.8
17
+ Protocol = object
18
+
19
+ from scrapling.utils import cache
20
+
21
+ from cssselect.xpath import ExpressionError
22
+ from cssselect.xpath import XPathExpr as OriginalXPathExpr
23
+ from cssselect import HTMLTranslator as OriginalHTMLTranslator
24
+ from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
25
+
26
+ if TYPE_CHECKING:
27
+ # typing.Self requires Python 3.11
28
+ from typing_extensions import Self
29
+
30
+
31
+ regex = f"[{HTML5_WHITESPACE}]+"
32
+ replace_html5_whitespaces = re.compile(regex).sub
33
+
34
+
35
class XPathExpr(OriginalXPathExpr):
    """XPath expression that carries pseudo-element state.

    `textnode` marks a trailing ``::text`` selector and `attribute` holds the name
    given to ``::attr(...)``; `__str__` appends the matching XPath steps.
    """

    textnode: bool = False
    attribute: Optional[str] = None

    @classmethod
    def from_xpath(
        cls,
        xpath: OriginalXPathExpr,
        textnode: bool = False,
        attribute: Optional[str] = None,
    ) -> "Self":
        """Copy a plain cssselect expression and attach the pseudo-element flags."""
        x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
        x.textnode = textnode
        x.attribute = attribute
        return x

    def __str__(self) -> str:
        path = super().__str__()
        if self.textnode:
            if path == "*":
                path = "text()"
            elif path.endswith("::*/*"):
                path = path[:-3] + "text()"
            else:
                path += "/text()"

        if self.attribute is not None:
            if path.endswith("::*/*"):
                path = path[:-2]
            path += f"/@{self.attribute}"

        return path

    def join(
        self: "Self",
        combiner: str,
        other: OriginalXPathExpr,
        *args: Any,
        **kwargs: Any,
    ) -> "Self":
        if not isinstance(other, XPathExpr):
            # Fixed typo in the error message: "ony" -> "only".
            raise ValueError(
                f"Expressions of type {__name__}.XPathExpr can only join expressions"
                f" of the same type (or its descendants), got {type(other)}"
            )
        super().join(combiner, other, *args, **kwargs)
        # Propagate the right-hand side's pseudo-element state onto the joined expression.
        self.textnode = other.textnode
        self.attribute = other.attribute
        return self
85
+
86
+
87
# e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
class TranslatorProtocol(Protocol):
    """Structural type describing the cssselect translators this mixin is combined with."""

    def xpath_element(self, selector: Element) -> OriginalXPathExpr:
        ...

    def css_to_xpath(self, css: str, prefix: str = ...) -> str:
        ...
96
class TranslatorMixin:
    """This mixin adds support to CSS pseudo elements via dynamic dispatch.

    Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
    """

    def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:
        # Wrap the stock cssselect result in our pseudo-element-aware subclass.
        # https://github.com/python/mypy/issues/12344
        return XPathExpr.from_xpath(super().xpath_element(selector))  # type: ignore[safe-super]

    def xpath_pseudo_element(
        self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
    ) -> OriginalXPathExpr:
        """
        Dispatch method that transforms XPath to support pseudo-elements.
        """
        if isinstance(pseudo_element, FunctionalPseudoElement):
            # Functional form, e.g. ::attr(href) -> xpath_attr_functional_pseudo_element
            handler_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
            handler = getattr(self, handler_name, None)
            if handler is None:
                raise ExpressionError(
                    f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
                )
            return handler(xpath, pseudo_element)

        # Simple form, e.g. ::text -> xpath_text_simple_pseudo_element
        handler_name = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
        handler = getattr(self, handler_name, None)
        if handler is None:
            raise ExpressionError(
                f"The pseudo-element ::{pseudo_element} is unknown"
            )
        return handler(xpath)

    @staticmethod
    def xpath_attr_functional_pseudo_element(
        xpath: OriginalXPathExpr, function: FunctionalPseudoElement
    ) -> XPathExpr:
        """Support selecting attribute values using ::attr() pseudo-element"""
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                f"Expected a single string or ident for ::attr(), got {function.arguments!r}"
            )
        return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value)

    @staticmethod
    def xpath_text_simple_pseudo_element(xpath: OriginalXPathExpr) -> XPathExpr:
        """Support selecting text nodes using ::text pseudo-element"""
        return XPathExpr.from_xpath(xpath, textnode=True)
148
+
149
+
150
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
    # Cache translated selectors (bounded at 256 entries) so repeated CSS queries
    # skip re-translation. NOTE(review): lru_cache on an instance method keys on
    # `self` and keeps the translator alive for the cache's lifetime — fine if a
    # single shared translator instance is used; confirm against callers.
    @cache(maxsize=256)
    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
        """Translate a CSS selector to its XPath equivalent (cached)."""
        return super().css_to_xpath(css, prefix)
scrapling/utils.py ADDED
@@ -0,0 +1,164 @@
1
+ import re
2
+ import os
3
+ import logging
4
+ from itertools import chain
5
+ from logging import handlers
6
+ # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
7
+ from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
8
+
9
+ from typing import Dict, Iterable, Any
10
+
11
+ from lxml import html
12
# lxml node types to skip when walking an element's children (HTML comments are
# iterated by lxml like elements but are not real elements).
html_forbidden = {html.HtmlComment, }
# Default root-logger configuration at import time: errors-only to stderr.
# Call `setup_basic_logging` to switch to a different level/format.
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)
20
+
21
+
22
@cache(None, typed=True)  # Cached so repeated calls with the same level reconfigure only once
def setup_basic_logging(level: str = 'debug'):
    """Configure the root logger with a console handler at the given level.

    :param level: One of 'debug', 'info', 'warning', 'error' or 'critical'
        (case-insensitive).
    :raise ValueError: If `level` is not a known logging level name.
    """
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }
    normalized = level.lower()
    if normalized not in levels:
        # Fixed: previously an unknown level surfaced as a bare KeyError.
        raise ValueError(f"Unknown logging level {level!r}, expected one of {sorted(levels)}")

    formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    # Fixed: `logging.basicConfig` is a no-op once the root logger already has
    # handlers (this module installs one at import time), so the old call did
    # nothing. Configure the root logger directly instead.
    root = logging.getLogger()
    root.setLevel(levels[normalized])
    root.handlers = [handler]
37
+
38
+
39
def flatten(lst: Iterable):
    """Collapse one level of nesting: a list of iterables becomes one flat list."""
    return [item for sub in lst for item in sub]
41
+
42
+
43
def _is_iterable(s: Any):
    # Regex helpers use this to accept sequences of patterns while rejecting
    # str/bytes, which are technically iterable too.
    return isinstance(s, list) or isinstance(s, tuple)
46
+
47
+
48
@cache(None, typed=True)
class _Logger(object):
    # I will leave this class here for now in case I decide I want to come back to use it :)
    # NOTE: wrapping the class in lru_cache means calls with the same arguments
    # return the same cached instance — a lightweight singleton pattern.
    __slots__ = ('console_logger', 'logger_file_path',)
    # Accepted level names mapped to the stdlib logging level constants.
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }

    def __init__(self, filename: str = 'debug.log', level: str = 'debug', when: str = 'midnight', backcount: int = 1):
        """Create a console logger and, when level is debug, a timed rotating file logger.

        :param filename: Log file name, placed under a `logs` directory next to this module.
        :param level: Level name; must be a key of `levels` (a bad name raises KeyError).
        :param when: Rotation interval for the file handler (see TimedRotatingFileHandler).
        :param backcount: `backupCount` for the file handler — rotated files kept before deletion.
        """
        os.makedirs(os.path.join(os.path.dirname(__file__), 'logs'), exist_ok=True)
        format_str = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")

        # on-screen output
        lvl = self.levels[level.lower()]
        self.console_logger = logging.getLogger('Scrapling')
        self.console_logger.setLevel(lvl)
        console_handler = logging.StreamHandler()
        console_handler.setLevel(lvl)
        console_handler.setFormatter(format_str)
        self.console_logger.addHandler(console_handler)

        # File output is only enabled in debug mode.
        if lvl == logging.DEBUG:
            filename = os.path.join(os.path.dirname(__file__), 'logs', filename)
            self.logger_file_path = filename
            # Automatically generates the logging file at specified intervals
            file_handler = handlers.TimedRotatingFileHandler(
                # If more than (backcount+1) existed, oldest logs will be deleted
                filename=filename, when=when, backupCount=backcount, encoding='utf-8'
            )
            file_handler.setLevel(lvl)
            file_handler.setFormatter(format_str)
            # This for the logger when it appends the date to the new log
            file_handler.namer = lambda name: name.replace(".log", "") + ".log"
            self.console_logger.addHandler(file_handler)
            self.debug(f'Debug log path: {self.logger_file_path}')
        else:
            self.logger_file_path = None

    # Thin level-specific wrappers around the underlying console logger.
    def debug(self, message: str) -> None:
        self.console_logger.debug(message)

    def info(self, message: str) -> None:
        self.console_logger.info(message)

    def warning(self, message: str) -> None:
        self.console_logger.warning(message)

    def error(self, message: str) -> None:
        self.console_logger.error(message)

    def critical(self, message: str) -> None:
        self.console_logger.critical(message)
104
+
105
+
106
class _StorageTools:
    """Helpers that serialize an lxml element's identifying properties into a plain dict."""

    @staticmethod
    def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
        # Keep only attributes with a non-empty stripped value whose name is not
        # listed in `forbidden`.
        if not element.attrib:
            return {}
        return {k: v.strip() for k, v in element.attrib.items() if v and v.strip() and k not in forbidden}

    @classmethod
    def element_to_dict(cls, element: html.HtmlElement) -> Dict:
        """Collect the element's unique properties (tag, attributes, text, path,
        and — when available — parent/sibling/children tags) into a dictionary."""
        parent = element.getparent()
        result = {
            'tag': str(element.tag),
            'attributes': cls.__clean_attributes(element),
            'text': element.text.strip() if element.text else None,
            'path': cls._get_element_path(element)
        }
        # Root elements have no parent, so parent/sibling data is optional.
        if parent is not None:
            result.update({
                'parent_name': parent.tag,
                'parent_attribs': dict(parent.attrib),
                'parent_text': parent.text.strip() if parent.text else None
            })

            siblings = [child.tag for child in parent.iterchildren() if child != element]
            if siblings:
                result.update({'siblings': tuple(siblings)})

        # Skip comment nodes etc. (see `html_forbidden`) when listing children.
        children = [child.tag for child in element.iterchildren() if type(child) not in html_forbidden]
        if children:
            result.update({'children': tuple(children)})

        return result

    @classmethod
    def _get_element_path(cls, element: html.HtmlElement):
        """Return the tuple of tag names from the document root down to `element` (recursive)."""
        parent = element.getparent()
        return tuple(
            (element.tag,) if parent is None else (
                cls._get_element_path(parent) + (element.tag,)
            )
        )
147
+
148
+
149
+ # def _root_type_verifier(method):
150
+ # # Just to make sure we are safe
151
+ # @wraps(method)
152
+ # def _impl(self, *args, **kw):
153
+ # # All html types inherits from HtmlMixin so this to check for all at once
154
+ # if not issubclass(type(self._root), html.HtmlMixin):
155
+ # raise ValueError(f"Cannot use function on a Node of type {type(self._root)!r}")
156
+ # return method(self, *args, **kw)
157
+ # return _impl
158
+
159
+
160
# NOTE(review): unbounded cache on arbitrary strings grows forever — consider a bounded maxsize.
@cache(None, typed=True)
def clean_spaces(string):
    """Normalize whitespace: tabs become spaces, newlines/carriage returns are
    removed, and runs of spaces collapse to a single space."""
    string = string.replace('\t', ' ')
    # Fixed: the old pattern '[\n|\r]' was a character class containing a literal
    # '|', so it also stripped pipe characters from the text.
    string = re.sub(r'[\n\r]', '', string)
    return re.sub(r' +', ' ', string)
@@ -0,0 +1,28 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2024, Karim shoair
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ 3. Neither the name of the copyright holder nor the names of its
16
+ contributors may be used to endorse or promote products derived from
17
+ this software without specific prior written permission.
18
+
19
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.