scrapling 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +49 -127
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +219 -296
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +201 -281
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +9 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +29 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/METADATA +54 -46
  30. scrapling-0.3.3.dist-info/RECORD +44 -0
  31. scrapling-0.3.1.dist-info/RECORD +0 -41
  32. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/top_level.txt +0 -0
scrapling/engines/toolbelt/convertor.py
@@ -1,10 +1,15 @@
+ from functools import lru_cache
+ from re import compile as re_compile
+
  from curl_cffi.requests import Response as CurlResponse
  from playwright.sync_api import Page as SyncPage, Response as SyncResponse
  from playwright.async_api import Page as AsyncPage, Response as AsyncResponse

  from scrapling.core.utils import log
- from scrapling.core._types import Dict, Optional
  from .custom import Response, StatusText
+ from scrapling.core._types import Dict, Optional
+
+ __CHARSET_RE__ = re_compile(r"charset=([\w-]+)")


  class ResponseFactory:
@@ -18,9 +23,19 @@ class ResponseFactory:
  """

  @classmethod
- def _process_response_history(
- cls, first_response: SyncResponse, parser_arguments: Dict
- ) -> list[Response]:
+ @lru_cache(maxsize=16)
+ def __extract_browser_encoding(cls, content_type: str | None) -> Optional[str]:
+ """Extract browser encoding from headers.
+ Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8
+ """
+ if content_type:
+ # Because Playwright can't do that by themselves like all libraries for some reason :3
+ match = __CHARSET_RE__.search(content_type)
+ return match.group(1) if match else None
+ return None
+
+ @classmethod
+ def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
  """Process response history to build a list of `Response` objects"""
  history = []
  current_request = first_response.request.redirected_from
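Note: the new `__extract_browser_encoding` helper (cached with `lru_cache`) replaces the heavier `ResponseEncoding` class removed from custom.py below. A minimal standalone sketch of the same regex technique, for illustration only (names outside the class are assumptions):

```python
from functools import lru_cache
from re import compile as re_compile

CHARSET_RE = re_compile(r"charset=([\w-]+)")  # same pattern as __CHARSET_RE__ above


@lru_cache(maxsize=16)
def extract_charset(content_type: str | None) -> str | None:
    """Pull the charset parameter out of a Content-Type header, if present."""
    if content_type:
        match = CHARSET_RE.search(content_type)
        return match.group(1) if match else None
    return None


print(extract_charset("text/html; charset=utf-8"))     # -> utf-8
print(extract_charset("application/json") or "utf-8")  # no charset -> falls back to utf-8
```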
@@ -32,24 +47,23 @@ class ResponseFactory:
  history.insert(
  0,
  Response(
- url=current_request.url,
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
- content="",
- status=current_response.status if current_response else 301,
- reason=(
- current_response.status_text
- or StatusText.get(current_response.status)
- )
- if current_response
- else StatusText.get(301),
- encoding=current_response.headers.get("content-type", "")
- or "utf-8",
- cookies=tuple(),
- headers=current_response.all_headers()
- if current_response
- else {},
- request_headers=current_request.all_headers(),
- **parser_arguments,
+ **{
+ "url": current_request.url,
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+ "content": "",
+ "status": current_response.status if current_response else 301,
+ "reason": (current_response.status_text or StatusText.get(current_response.status))
+ if current_response
+ else StatusText.get(301),
+ "encoding": cls.__extract_browser_encoding(
+ current_response.headers.get("content-type", "")
+ )
+ or "utf-8",
+ "cookies": tuple(),
+ "headers": current_response.all_headers() if current_response else {},
+ "request_headers": current_request.all_headers(),
+ **parser_arguments,
+ }
  ),
  )
  except Exception as e: # pragma: no cover
@@ -93,14 +107,11 @@ class ResponseFactory:
  if not final_response:
  raise ValueError("Failed to get a response from the page")

- # This will be parsed inside `Response`
  encoding = (
- final_response.headers.get("content-type", "") or "utf-8"
+ cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
  ) # default encoding
  # PlayWright API sometimes give empty status text for some reason!
- status_text = final_response.status_text or StatusText.get(
- final_response.status
- )
+ status_text = final_response.status_text or StatusText.get(final_response.status)

  history = cls._process_response_history(first_response, parser_arguments)
  try:
@@ -110,16 +121,18 @@ class ResponseFactory:
  page_content = ""

  return Response(
- url=page.url,
- content=page_content,
- status=final_response.status,
- reason=status_text,
- encoding=encoding,
- cookies=tuple(dict(cookie) for cookie in page.context.cookies()),
- headers=first_response.all_headers(),
- request_headers=first_response.request.all_headers(),
- history=history,
- **parser_arguments,
+ **{
+ "url": page.url,
+ "content": page_content,
+ "status": final_response.status,
+ "reason": status_text,
+ "encoding": encoding,
+ "cookies": tuple(dict(cookie) for cookie in page.context.cookies()),
+ "headers": first_response.all_headers(),
+ "request_headers": first_response.request.all_headers(),
+ "history": history,
+ **parser_arguments,
+ }
  )

  @classmethod
@@ -137,24 +150,23 @@ class ResponseFactory:
  history.insert(
  0,
  Response(
- url=current_request.url,
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
- content="",
- status=current_response.status if current_response else 301,
- reason=(
- current_response.status_text
- or StatusText.get(current_response.status)
- )
- if current_response
- else StatusText.get(301),
- encoding=current_response.headers.get("content-type", "")
- or "utf-8",
- cookies=tuple(),
- headers=await current_response.all_headers()
- if current_response
- else {},
- request_headers=await current_request.all_headers(),
- **parser_arguments,
+ **{
+ "url": current_request.url,
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+ "content": "",
+ "status": current_response.status if current_response else 301,
+ "reason": (current_response.status_text or StatusText.get(current_response.status))
+ if current_response
+ else StatusText.get(301),
+ "encoding": cls.__extract_browser_encoding(
+ current_response.headers.get("content-type", "")
+ )
+ or "utf-8",
+ "cookies": tuple(),
+ "headers": await current_response.all_headers() if current_response else {},
+ "request_headers": await current_request.all_headers(),
+ **parser_arguments,
+ }
  ),
  )
  except Exception as e: # pragma: no cover
@@ -198,18 +210,13 @@ class ResponseFactory:
  if not final_response:
  raise ValueError("Failed to get a response from the page")

- # This will be parsed inside `Response`
  encoding = (
- final_response.headers.get("content-type", "") or "utf-8"
+ cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
  ) # default encoding
  # PlayWright API sometimes give empty status text for some reason!
- status_text = final_response.status_text or StatusText.get(
- final_response.status
- )
+ status_text = final_response.status_text or StatusText.get(final_response.status)

- history = await cls._async_process_response_history(
- first_response, parser_arguments
- )
+ history = await cls._async_process_response_history(first_response, parser_arguments)
  try:
  page_content = await page.content()
  except Exception as e: # pragma: no cover
@@ -217,16 +224,18 @@ class ResponseFactory:
  page_content = ""

  return Response(
- url=page.url,
- content=page_content,
- status=final_response.status,
- reason=status_text,
- encoding=encoding,
- cookies=tuple(dict(cookie) for cookie in await page.context.cookies()),
- headers=await first_response.all_headers(),
- request_headers=await first_response.request.all_headers(),
- history=history,
- **parser_arguments,
+ **{
+ "url": page.url,
+ "content": page_content,
+ "status": final_response.status,
+ "reason": status_text,
+ "encoding": encoding,
+ "cookies": tuple(dict(cookie) for cookie in await page.context.cookies()),
+ "headers": await first_response.all_headers(),
+ "request_headers": await first_response.request.all_headers(),
+ "history": history,
+ **parser_arguments,
+ }
  )

  @staticmethod
@@ -238,17 +247,17 @@ class ResponseFactory:
  :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
  return Response(
- url=response.url,
- content=response.content
- if isinstance(response.content, bytes)
- else response.content.encode(),
- status=response.status_code,
- reason=response.reason,
- encoding=response.encoding or "utf-8",
- cookies=dict(response.cookies),
- headers=dict(response.headers),
- request_headers=dict(response.request.headers),
- method=response.request.method,
- history=response.history, # https://github.com/lexiforest/curl_cffi/issues/82
- **parser_arguments,
+ **{
+ "url": response.url,
+ "content": response.content,
+ "status": response.status_code,
+ "reason": response.reason,
+ "encoding": response.encoding or "utf-8",
+ "cookies": dict(response.cookies),
+ "headers": dict(response.headers),
+ "request_headers": dict(response.request.headers),
+ "method": response.request.method,
+ "history": response.history, # https://github.com/lexiforest/curl_cffi/issues/82
+ **parser_arguments,
+ }
  )
scrapling/engines/toolbelt/custom.py
@@ -2,8 +2,9 @@
  Functions related to custom types or type checking
  """

- from email.message import Message
+ from functools import lru_cache

+ from scrapling.core.utils import log
  from scrapling.core._types import (
  Any,
  Dict,
@@ -12,89 +13,9 @@ from scrapling.core._types import (
  Tuple,
  )
  from scrapling.core.custom_types import MappingProxyType
- from scrapling.core.utils import log, lru_cache
  from scrapling.parser import Selector, SQLiteStorageSystem


- class ResponseEncoding:
- __DEFAULT_ENCODING = "utf-8"
- __ISO_8859_1_CONTENT_TYPES = {
- "text/plain",
- "text/html",
- "text/css",
- "text/javascript",
- }
-
- @classmethod
- @lru_cache(maxsize=128)
- def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
- """Parse content type and parameters from a content-type header value.
-
- Uses `email.message.Message` for robust header parsing according to RFC 2045.
-
- :param header_value: Raw content-type header string
- :return: Tuple of (content_type, parameters_dict)
- """
- # Create a Message object and set the Content-Type header then get the content type and parameters
- msg = Message()
- msg["content-type"] = header_value
-
- content_type = msg.get_content_type()
- params = dict(msg.get_params(failobj=[]))
-
- # Remove the content-type from params if present somehow
- params.pop("content-type", None)
-
- return content_type, params
-
- @classmethod
- @lru_cache(maxsize=128)
- def get_value(
- cls, content_type: Optional[str], text: Optional[str] = "test"
- ) -> str:
- """Determine the appropriate character encoding from a content-type header.
-
- The encoding is determined by these rules in order:
- 1. If no content-type is provided, use UTF-8
- 2. If charset parameter is present, use that encoding
- 3. If content-type is `text/*`, use ISO-8859-1 per HTTP/1.1 spec
- 4. If content-type is application/json, use UTF-8 per RFC 4627
- 5. Default to UTF-8 if nothing else matches
-
- :param content_type: Content-Type header value or None
- :param text: A text to test the encoding on it
- :return: String naming the character encoding
- """
- if not content_type:
- return cls.__DEFAULT_ENCODING
-
- try:
- encoding = None
- content_type, params = cls.__parse_content_type(content_type)
-
- # First check for explicit charset parameter
- if "charset" in params:
- encoding = params["charset"].strip("'\"")
-
- # Apply content-type specific rules
- elif content_type in cls.__ISO_8859_1_CONTENT_TYPES:
- encoding = "ISO-8859-1"
-
- elif content_type == "application/json":
- encoding = cls.__DEFAULT_ENCODING
-
- if encoding:
- _ = text.encode(
- encoding
- ) # Validate encoding and validate it can encode the given text
- return encoding
-
- return cls.__DEFAULT_ENCODING
-
- except (ValueError, LookupError, UnicodeEncodeError):
- return cls.__DEFAULT_ENCODING
-
-
  class Response(Selector):
  """This class is returned by all engines as a way to unify response type between different libraries."""

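For context, the deleted `ResponseEncoding` class relied on the standard library's `email.message.Message` for RFC 2045 header parsing. A condensed sketch of that (now removed) logic, kept here only to show what the regex-based helper in convertor.py supersedes:

```python
from email.message import Message


def legacy_encoding(content_type: str, default: str = "utf-8") -> str:
    """Rough equivalent of the removed ResponseEncoding.get_value()."""
    msg = Message()
    msg["content-type"] = content_type
    params = dict(msg.get_params(failobj=[]))
    if "charset" in params:  # an explicit charset parameter wins
        return params["charset"].strip("'\"")
    if msg.get_content_type() in ("text/plain", "text/html", "text/css", "text/javascript"):
        return "ISO-8859-1"  # HTTP/1.1 default for these text/* types
    return default


print(legacy_encoding("text/html; charset=utf-8"))  # -> utf-8
print(legacy_encoding("text/html"))                 # -> ISO-8859-1
```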
@@ -119,9 +40,6 @@ class Response(Selector):
  self.headers = headers
  self.request_headers = request_headers
  self.history = history or []
- encoding = ResponseEncoding.get_value(
- encoding, content.decode("utf-8") if isinstance(content, bytes) else content
- )
  super().__init__(
  content=content,
  url=adaptive_domain or url,
@@ -129,9 +47,7 @@ class Response(Selector):
  **selector_config,
  )
  # For easier debugging while working from a Python shell
- log.info(
- f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})"
- )
+ log.info(f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})")


  class BaseFetcher:
@@ -190,18 +106,12 @@ class BaseFetcher:
  setattr(cls, key, value)
  else:
  # Yup, no fun allowed LOL
- raise AttributeError(
- f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?'
- )
+ raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
  else:
- raise ValueError(
- f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?'
- )
+ raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')

  if not kwargs:
- raise AttributeError(
- f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?"
- )
+ raise AttributeError(f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?")

  @classmethod
  def _generate_parser_arguments(cls) -> Dict:
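The error strings above belong to BaseFetcher's class-level parser configuration. Assuming the surrounding classmethod is the `configure` entry point referenced in the last message (the method name and accepted keywords are not visible in this hunk), usage would look roughly like:

```python
from scrapling.fetchers import Fetcher  # a BaseFetcher subclass

# Hypothetical call: sets a parser keyword as a class attribute for later fetches;
# an unrecognized keyword raises the AttributeError/ValueError shown in the diff.
Fetcher.configure(adaptive_domain="example.com")

# Calling it with no keywords raises:
# AttributeError: You must pass a keyword to configure, current keywords: ...
```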
@@ -217,9 +127,7 @@ class BaseFetcher:
  )
  if cls.adaptive_domain:
  if not isinstance(cls.adaptive_domain, str):
- log.warning(
- '[Ignored] The argument "adaptive_domain" must be of string type'
- )
+ log.warning('[Ignored] The argument "adaptive_domain" must be of string type')
  else:
  parser_arguments.update({"adaptive_domain": cls.adaptive_domain})

scrapling/engines/toolbelt/fingerprints.py
@@ -2,13 +2,13 @@
  Functions related to generating headers and fingerprints generally
  """

+ from functools import lru_cache
  from platform import system as platform_system

  from tldextract import extract
  from browserforge.headers import Browser, HeaderGenerator

  from scrapling.core._types import Dict, Optional
- from scrapling.core.utils import lru_cache

  __OS_NAME__ = platform_system()

@@ -37,8 +37,6 @@ def get_os_name() -> Optional[str]:
  "Linux": "linux",
  "Darwin": "macos",
  "Windows": "windows",
- # For the future? because why not?
- "iOS": "ios",
  }.get(__OS_NAME__)

scrapling/engines/toolbelt/navigation.py
@@ -30,9 +30,7 @@ def intercept_route(route: Route):
  :return: PlayWright `Route` object
  """
  if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
- log.debug(
- f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
- )
+ log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
  route.abort()
  else:
  route.continue_()
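`intercept_route` and its async twin below are standard Playwright route handlers, so they are registered with Playwright's `page.route()` API. A minimal sketch (URL pattern and import path are assumptions based on this diff):

```python
from playwright.sync_api import sync_playwright

from scrapling.engines.toolbelt.navigation import intercept_route  # path per the file list above

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    # Every request is routed through intercept_route, which aborts requests whose
    # resource_type is in DEFAULT_DISABLED_RESOURCES and lets everything else continue.
    page.route("**/*", intercept_route)
    page.goto("https://example.com")
    browser.close()
```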
@@ -45,17 +43,13 @@ async def async_intercept_route(route: async_Route):
  :return: PlayWright `Route` object
  """
  if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
- log.debug(
- f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
- )
+ log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
  await route.abort()
  else:
  await route.continue_()


- def construct_proxy_dict(
- proxy_string: str | Dict[str, str], as_tuple=False
- ) -> Optional[Dict | Tuple]:
+ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) -> Optional[Dict | Tuple]:
  """Validate a proxy and return it in the acceptable format for Playwright
  Reference: https://playwright.dev/python/docs/network#http-proxy

@@ -65,10 +59,7 @@ def construct_proxy_dict(
  """
  if isinstance(proxy_string, str):
  proxy = urlparse(proxy_string)
- if (
- proxy.scheme not in ("http", "https", "socks4", "socks5")
- or not proxy.hostname
- ):
+ if proxy.scheme not in ("http", "https", "socks4", "socks5") or not proxy.hostname:
  raise ValueError("Invalid proxy string!")

  try:
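`construct_proxy_dict` validates a proxy string (or dict) and returns it in the format Playwright expects (see the referenced Playwright proxy docs). A rough usage sketch; the exact return shape is inferred from Playwright's proxy settings rather than shown in this hunk:

```python
from scrapling.engines.toolbelt.navigation import construct_proxy_dict  # path per the file list above

proxy = construct_proxy_dict("http://user:pass@127.0.0.1:8080")
# Expected to resemble Playwright's proxy settings, e.g.:
# {"server": "http://127.0.0.1:8080", "username": "user", "password": "pass"}

construct_proxy_dict("ftp://bad-proxy")  # raises ValueError("Invalid proxy string!")
```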
@@ -95,51 +86,6 @@ def construct_proxy_dict(
  return None


- def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
- """Takes a CDP URL, reconstruct it to check it's valid, then adds encoded parameters if exists
-
- :param cdp_url: The target URL.
- :param query_params: A dictionary of the parameters to add.
- :return: The new CDP URL.
- """
- try:
- # Validate the base URL structure
- parsed = urlparse(cdp_url)
-
- # Check scheme
- if parsed.scheme not in ("ws", "wss"):
- raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
-
- # Validate hostname and port
- if not parsed.netloc:
- raise ValueError("Invalid hostname for the CDP URL")
-
- try:
- # Checking if the port is valid (if available)
- _ = parsed.port
- except ValueError:
- # urlparse will raise `ValueError` if the port can't be casted to integer
- raise ValueError("Invalid port for the CDP URL")
-
- # Ensure the path starts with /
- path = parsed.path
- if not path.startswith("/"):
- path = "/" + path
-
- # Reconstruct the base URL with validated parts
- validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"
-
- # Add query parameters
- if query_params:
- query_string = urlencode(query_params)
- return f"{validated_base}?{query_string}"
-
- return validated_base
-
- except Exception as e:
- raise ValueError(f"Invalid CDP URL: {str(e)}")
-
-
  @lru_cache(10, typed=True)
  def js_bypass_path(filename: str) -> str:
  """Takes the base filename of a JS file inside the `bypasses` folder, then return the full path of it