scrapling 0.2.99__py3-none-any.whl → 0.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +745 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +630 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +150 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +227 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.dist-info/METADATA +409 -0
  30. scrapling-0.3.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/licenses/LICENSE +0 -0
scrapling/core/{storage_adaptors.py → storage.py} RENAMED
@@ -1,44 +1,49 @@
-import sqlite3
-import threading
-from abc import ABC, abstractmethod
 from hashlib import sha256
+from threading import RLock
+from functools import lru_cache
+from abc import ABC, abstractmethod
+from sqlite3 import connect as db_connect
 
-import orjson
-from lxml import html
+from orjson import dumps, loads
+from lxml.html import HtmlElement
 from tldextract import extract as tld
 
-from scrapling.core._types import Dict, Optional, Union
-from scrapling.core.utils import _StorageTools, log, lru_cache
+from scrapling.core.utils import _StorageTools, log
+from scrapling.core._types import Dict, Optional, Any
 
 
-class StorageSystemMixin(ABC):
+class StorageSystemMixin(ABC):  # pragma: no cover
     # If you want to make your own storage system, you have to inherit from this
-    def __init__(self, url: Union[str, None] = None):
+    def __init__(self, url: Optional[str] = None):
         """
         :param url: URL of the website we are working on to separate it from other websites data
         """
         self.url = url
 
     @lru_cache(64, typed=True)
-    def _get_base_url(self, default_value: str = 'default') -> str:
-        if not self.url or type(self.url) is not str:
+    def _get_base_url(self, default_value: str = "default") -> str:
+        if not self.url or not isinstance(self.url, str):
             return default_value
 
         try:
             extracted = tld(self.url)
-            return extracted.registered_domain or extracted.domain or default_value
+            return (
+                extracted.top_domain_under_public_suffix
+                or extracted.domain
+                or default_value
+            )
         except AttributeError:
             return default_value
 
     @abstractmethod
-    def save(self, element: html.HtmlElement, identifier: str) -> None:
+    def save(self, element: HtmlElement, identifier: str) -> None:
         """Saves the element's unique properties to the storage for retrieval and relocation later
 
-        :param element: The element itself that we want to save to storage.
+        :param element: The element itself which we want to save to storage.
         :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
             the docs for more info.
         """
-        raise NotImplementedError('Storage system must implement `save` method')
+        raise NotImplementedError("Storage system must implement `save` method")
 
     @abstractmethod
     def retrieve(self, identifier: str) -> Optional[Dict]:
@@ -48,7 +53,7 @@ class StorageSystemMixin(ABC):
             the docs for more info.
         :return: A dictionary of the unique properties
         """
-        raise NotImplementedError('Storage system must implement `save` method')
+        raise NotImplementedError("Storage system must implement `save` method")
 
     @staticmethod
     @lru_cache(128, typed=True)
@@ -57,7 +62,7 @@ class StorageSystemMixin(ABC):
         identifier = identifier.lower().strip()
         if isinstance(identifier, str):
             # Hash functions have to take bytes
-            identifier = identifier.encode('utf-8')
+            identifier = identifier.encode("utf-8")
 
         hash_value = sha256(identifier).hexdigest()
         return f"{hash_value}_{len(identifier)}"  # Length to reduce collision chance
@@ -66,21 +71,21 @@ class StorageSystemMixin(ABC):
 @lru_cache(1, typed=True)
 class SQLiteStorageSystem(StorageSystemMixin):
     """The recommended system to use, it's race condition safe and thread safe.
-    Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
-    > It's optimized for threaded applications but running it without threads shouldn't make it slow."""
-    def __init__(self, storage_file: str, url: Union[str, None] = None):
+    Mainly built, so the library can run in threaded frameworks like scrapy or threaded tools
+    > It's optimized for threaded applications, but running it without threads shouldn't make it slow."""
+
+    def __init__(self, storage_file: str, url: Optional[str] = None):
         """
-        :param storage_file: File to be used to store elements
+        :param storage_file: File to be used to store elements' data.
         :param url: URL of the website we are working on to separate it from other websites data
 
         """
         super().__init__(url)
         self.storage_file = storage_file
-        # We use a threading.Lock to ensure thread-safety instead of relying on thread-local storage.
-        self.lock = threading.Lock()
-        # >SQLite default mode in earlier version is 1 not 2 (1=thread-safe 2=serialized)
+        self.lock = RLock()  # Better than Lock for reentrancy
+        # >SQLite default mode in the earlier version is 1 not 2 (1=thread-safe 2=serialized)
         # `check_same_thread=False` to allow it to be used across different threads.
-        self.connection = sqlite3.connect(self.storage_file, check_same_thread=False)
+        self.connection = db_connect(self.storage_file, check_same_thread=False)
         # WAL (Write-Ahead Logging) allows for better concurrency.
         self.connection.execute("PRAGMA journal_mode=WAL")
         self.cursor = self.connection.cursor()
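The `__init__` above reduces to a small sqlite3 recipe: one connection shared across threads, guarded by a lock, with WAL journaling so readers don't block the writer. A sketch (the file name is illustrative):

    from sqlite3 import connect
    from threading import RLock

    lock = RLock()                                         # serializes access from worker threads
    conn = connect("storage.db", check_same_thread=False)  # one connection, usable from any thread
    conn.execute("PRAGMA journal_mode=WAL")                # write-ahead log for better concurrency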
@@ -101,24 +106,27 @@ class SQLiteStorageSystem(StorageSystemMixin):
         """)
         self.connection.commit()
 
-    def save(self, element: html.HtmlElement, identifier: str):
+    def save(self, element: HtmlElement, identifier: str) -> None:
         """Saves the elements unique properties to the storage for retrieval and relocation later
 
-        :param element: The element itself that we want to save to storage.
+        :param element: The element itself which we want to save to storage.
         :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
             the docs for more info.
         """
         url = self._get_base_url()
         element_data = _StorageTools.element_to_dict(element)
         with self.lock:
-            self.cursor.execute("""
+            self.cursor.execute(
+                """
                 INSERT OR REPLACE INTO storage (url, identifier, element_data)
                 VALUES (?, ?, ?)
-            """, (url, identifier, orjson.dumps(element_data)))
+                """,
+                (url, identifier, dumps(element_data)),
+            )
             self.cursor.fetchall()
             self.connection.commit()
 
-    def retrieve(self, identifier: str) -> Optional[Dict]:
+    def retrieve(self, identifier: str) -> Optional[Dict[str, Any]]:
         """Using the identifier, we search the storage and return the unique properties of the element
 
         :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
@@ -129,15 +137,15 @@ class SQLiteStorageSystem(StorageSystemMixin):
         with self.lock:
             self.cursor.execute(
                 "SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
-                (url, identifier)
+                (url, identifier),
             )
             result = self.cursor.fetchone()
             if result:
-                return orjson.loads(result[0])
+                return loads(result[0])
             return None
 
     def close(self):
-        """Close all connections, will be useful when with some things like scrapy Spider.closed() function/signal"""
+        """Close all connections. It will be useful when with some things like scrapy Spider.closed() function/signal"""
         with self.lock:
             self.connection.commit()
             self.cursor.close()
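Taken together, the class exposes a small save/retrieve/close API. A minimal usage sketch based only on what this diff shows (the file name, URL, and HTML snippet are illustrative):

    from lxml.html import fromstring
    from scrapling.core.storage import SQLiteStorageSystem

    db = SQLiteStorageSystem(storage_file="elements.db", url="https://example.com")
    element = fromstring("<div id='price'>42</div>")
    db.save(element, identifier="price")  # persists tag/attributes/text/path as JSON
    print(db.retrieve("price"))           # -> dict of the element's unique properties
    db.close()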
scrapling/core/translator.py CHANGED
@@ -1,30 +1,24 @@
 """
-Most of this file is adapted version of the translator of parsel library with some modifications simply for 1 important reason...
+Most of this file is an adapted version of the parsel library's translator with some modifications simply for 1 important reason...
 
-To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format which will be important in future releases but most importantly...
+To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match the Parsel/Scrapy selectors format which will be important in future releases but most importantly...
 
 So you don't have to learn a new selectors/api method like what bs4 done with soupsieve :)
 
-if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
+If you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
 """
 
-import re
+from functools import lru_cache
 
 from cssselect import HTMLTranslator as OriginalHTMLTranslator
 from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
 from cssselect.xpath import ExpressionError
 from cssselect.xpath import XPathExpr as OriginalXPathExpr
-from w3lib.html import HTML5_WHITESPACE
 
 from scrapling.core._types import Any, Optional, Protocol, Self
-from scrapling.core.utils import lru_cache
-
-regex = f"[{HTML5_WHITESPACE}]+"
-replace_html5_whitespaces = re.compile(regex).sub
 
 
 class XPathExpr(OriginalXPathExpr):
-
     textnode: bool = False
     attribute: Optional[str] = None
 
@@ -34,7 +28,7 @@ class XPathExpr(OriginalXPathExpr):
         xpath: OriginalXPathExpr,
         textnode: bool = False,
         attribute: Optional[str] = None,
-    ) -> "Self":
+    ) -> Self:
         x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
         x.textnode = textnode
         x.attribute = attribute
@@ -43,29 +37,29 @@ class XPathExpr(OriginalXPathExpr):
     def __str__(self) -> str:
         path = super().__str__()
         if self.textnode:
-            if path == "*":
+            if path == "*":  # pragma: no cover
                 path = "text()"
-            elif path.endswith("::*/*"):
+            elif path.endswith("::*/*"):  # pragma: no cover
                 path = path[:-3] + "text()"
             else:
                 path += "/text()"
 
         if self.attribute is not None:
-            if path.endswith("::*/*"):
+            if path.endswith("::*/*"):  # pragma: no cover
                 path = path[:-2]
             path += f"/@{self.attribute}"
 
         return path
 
     def join(
-        self: "Self",
+        self: Self,
         combiner: str,
         other: OriginalXPathExpr,
         *args: Any,
         **kwargs: Any,
-    ) -> "Self":
+    ) -> Self:
         if not isinstance(other, XPathExpr):
-            raise ValueError(
+            raise ValueError(  # pragma: no cover
                 f"Expressions of type {__name__}.XPathExpr can ony join expressions"
                 f" of the same type (or its descendants), got {type(other)}"
             )
@@ -77,10 +71,10 @@ class XPathExpr(OriginalXPathExpr):
 
 # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
 class TranslatorProtocol(Protocol):
-    def xpath_element(self, selector: Element) -> OriginalXPathExpr:
+    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pragma: no cover
         pass
 
-    def css_to_xpath(self, css: str, prefix: str = ...) -> str:
+    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pragma: no cover
         pass
 
 
@@ -91,7 +85,7 @@ class TranslatorMixin:
     """
 
     def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:
-        # https://github.com/python/mypy/issues/12344
+        # https://github.com/python/mypy/issues/14757
         xpath = super().xpath_element(selector)  # type: ignore[safe-super]
         return XPathExpr.from_xpath(xpath)
 
@@ -99,12 +93,12 @@ class TranslatorMixin:
         self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
     ) -> OriginalXPathExpr:
         """
-        Dispatch method that transforms XPath to support pseudo-elements.
+        Dispatch method that transforms XPath to support the pseudo-element.
         """
         if isinstance(pseudo_element, FunctionalPseudoElement):
             method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
             method = getattr(self, method_name, None)
-            if not method:
+            if not method:  # pragma: no cover
                 raise ExpressionError(
                     f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
                 )
@@ -114,7 +108,7 @@ class TranslatorMixin:
             f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
         )
         method = getattr(self, method_name, None)
-        if not method:
+        if not method:  # pragma: no cover
             raise ExpressionError(
                 f"The pseudo-element ::{pseudo_element} is unknown"
             )
@@ -123,10 +117,10 @@ class TranslatorMixin:
 
     @staticmethod
     def xpath_attr_functional_pseudo_element(
-            xpath: OriginalXPathExpr, function: FunctionalPseudoElement
+        xpath: OriginalXPathExpr, function: FunctionalPseudoElement
     ) -> XPathExpr:
         """Support selecting attribute values using ::attr() pseudo-element"""
-        if function.argument_types() not in (["STRING"], ["IDENT"]):
+        if function.argument_types() not in (["STRING"], ["IDENT"]):  # pragma: no cover
             raise ExpressionError(
                 f"Expected a single string or ident for ::attr(), got {function.arguments!r}"
             )
@@ -144,4 +138,4 @@ class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
         return super().css_to_xpath(css, prefix)
 
 
-translator_instance = HTMLTranslator()
+translator = HTMLTranslator()
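A quick sketch of what the two pseudo-elements buy you, using the renamed module-level `translator` instance; the XPath in the comments is the expected expansion under cssselect's default `descendant-or-self::` prefix:

    from scrapling.core.translator import translator

    print(translator.css_to_xpath("a::attr(href)"))  # descendant-or-self::a/@href
    print(translator.css_to_xpath("p::text"))        # descendant-or-self::p/text()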
scrapling/core/utils.py CHANGED
@@ -1,17 +1,18 @@
 import logging
-import re
 from itertools import chain
+from re import compile as re_compile
 
-import orjson
 from lxml import html
 
-from scrapling.core._types import Any, Dict, Iterable, Union
+from scrapling.core._types import Any, Dict, Iterable, List
 
-# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
-# functools.cache is available on Python 3.9+ only so let's keep lru_cache
+# Using cache on top of a class is a brilliant way to achieve a Singleton design pattern without much code
 from functools import lru_cache  # isort:skip
 
-html_forbidden = {html.HtmlComment, }
+html_forbidden = (html.HtmlComment,)
+
+__CLEANING_TABLE__ = str.maketrans({"\t": " ", "\n": None, "\r": None})
+__CONSECUTIVE_SPACES_REGEX__ = re_compile(r" +")
 
 
 @lru_cache(1, typed=True)
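The "cache on top of a class" comment refers to this trick: wrapping a class in `lru_cache(1)` makes repeated construction with the same arguments return the same instance. A tiny sketch with a hypothetical class:

    from functools import lru_cache

    @lru_cache(1, typed=True)
    class Settings:  # hypothetical example class
        def __init__(self):
            self.loaded = True

    assert Settings() is Settings()  # the second call returns the cached instance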
@@ -20,12 +21,11 @@ def setup_logger():
 
     :returns: logging.Logger: Configured logger instance
     """
-    logger = logging.getLogger('scrapling')
+    logger = logging.getLogger("scrapling")
     logger.setLevel(logging.INFO)
 
     formatter = logging.Formatter(
-        fmt="[%(asctime)s] %(levelname)s: %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S"
+        fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
     )
 
     console_handler = logging.StreamHandler()
@@ -41,24 +41,19 @@ def setup_logger():
 log = setup_logger()
 
 
-def is_jsonable(content: Union[bytes, str]) -> bool:
-    if type(content) is bytes:
-        content = content.decode()
-
-    try:
-        _ = orjson.loads(content)
-        return True
-    except orjson.JSONDecodeError:
-        return False
-
-
-def flatten(lst: Iterable):
+def flatten(lst: Iterable[Any]) -> List[Any]:
     return list(chain.from_iterable(lst))
 
 
-def _is_iterable(s: Any):
+def _is_iterable(obj: Any) -> bool:
     # This will be used only in regex functions to make sure it's iterable but not string/bytes
-    return isinstance(s, (list, tuple,))
+    return isinstance(
+        obj,
+        (
+            list,
+            tuple,
+        ),
+    )
 
 
 class _StorageTools:
@@ -66,31 +61,43 @@ class _StorageTools:
     def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
         if not element.attrib:
             return {}
-        return {k: v.strip() for k, v in element.attrib.items() if v and v.strip() and k not in forbidden}
+        return {
+            k: v.strip()
+            for k, v in element.attrib.items()
+            if v and v.strip() and k not in forbidden
+        }
 
     @classmethod
     def element_to_dict(cls, element: html.HtmlElement) -> Dict:
         parent = element.getparent()
         result = {
-            'tag': str(element.tag),
-            'attributes': cls.__clean_attributes(element),
-            'text': element.text.strip() if element.text else None,
-            'path': cls._get_element_path(element)
+            "tag": str(element.tag),
+            "attributes": cls.__clean_attributes(element),
+            "text": element.text.strip() if element.text else None,
+            "path": cls._get_element_path(element),
         }
         if parent is not None:
-            result.update({
-                'parent_name': parent.tag,
-                'parent_attribs': dict(parent.attrib),
-                'parent_text': parent.text.strip() if parent.text else None
-            })
+            result.update(
+                {
+                    "parent_name": parent.tag,
+                    "parent_attribs": dict(parent.attrib),
+                    "parent_text": parent.text.strip() if parent.text else None,
+                }
+            )
 
-            siblings = [child.tag for child in parent.iterchildren() if child != element]
+            siblings = [
+                child.tag for child in parent.iterchildren() if child != element
+            ]
             if siblings:
-                result.update({'siblings': tuple(siblings)})
+                result.update({"siblings": tuple(siblings)})
 
-        children = [child.tag for child in element.iterchildren() if type(child) not in html_forbidden]
+        children = [
+            child.tag
+            for child in element.iterchildren()
+            if not isinstance(child, html_forbidden)
+        ]
         if children:
-            result.update({'children': tuple(children)})
+            result.update({"children": tuple(children)})
 
         return result
 
@@ -98,25 +105,13 @@ class _StorageTools:
     def _get_element_path(cls, element: html.HtmlElement):
         parent = element.getparent()
         return tuple(
-            (element.tag,) if parent is None else (
-                cls._get_element_path(parent) + (element.tag,)
-            )
+            (element.tag,)
+            if parent is None
+            else (cls._get_element_path(parent) + (element.tag,))
         )
 
 
-# def _root_type_verifier(method):
-#     # Just to make sure we are safe
-#     @wraps(method)
-#     def _impl(self, *args, **kw):
-#         # All html types inherits from HtmlMixin so this to check for all at once
-#         if not issubclass(type(self._root), html.HtmlMixin):
-#             raise ValueError(f"Cannot use function on a Node of type {type(self._root)!r}")
-#         return method(self, *args, **kw)
-#     return _impl
-
-
 @lru_cache(128, typed=True)
 def clean_spaces(string):
-    string = string.replace('\t', ' ')
-    string = re.sub('[\n|\r]', '', string)
-    return re.sub(' +', ' ', string)
+    string = string.translate(__CLEANING_TABLE__)
+    return __CONSECUTIVE_SPACES_REGEX__.sub(" ", string)
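The rewritten `clean_spaces` swaps two `re.sub` passes for a translation table plus one precompiled regex: tabs become spaces, newlines and carriage returns are dropped, then runs of spaces collapse to one. A self-contained sketch of the same behavior:

    from re import compile as re_compile

    table = str.maketrans({"\t": " ", "\n": None, "\r": None})
    collapse = re_compile(r" +")

    def clean_spaces(string: str) -> str:
        return collapse.sub(" ", string.translate(table))

    assert clean_spaces("a\tb\nc  d") == "a bc d"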
scrapling/engines/__init__.py CHANGED
@@ -1,7 +1,16 @@
-from .camo import CamoufoxEngine
-from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
-from .pw import PlaywrightEngine
-from .static import StaticEngine
-from .toolbelt import check_if_engine_usable
+from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS, DEFAULT_FLAGS
+from .static import FetcherSession, FetcherClient, AsyncFetcherClient
+from ._browsers import (
+    DynamicSession,
+    AsyncDynamicSession,
+    StealthySession,
+    AsyncStealthySession,
+)
 
-__all__ = ['CamoufoxEngine', 'PlaywrightEngine']
+__all__ = [
+    "FetcherSession",
+    "DynamicSession",
+    "AsyncDynamicSession",
+    "StealthySession",
+    "AsyncStealthySession",
+]
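For orientation, the old engine classes are gone and every name in the new `__all__` is importable straight from the package, a sketch based only on the exports above:

    from scrapling.engines import (
        FetcherSession,
        DynamicSession,
        AsyncDynamicSession,
        StealthySession,
        AsyncStealthySession,
    )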
scrapling/engines/_browsers/__init__.py ADDED
@@ -0,0 +1,2 @@
+from ._controllers import DynamicSession, AsyncDynamicSession
+from ._camoufox import StealthySession, AsyncStealthySession