scrapling 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ from functools import lru_cache
 from re import compile as re_compile

 from curl_cffi.requests import Response as CurlResponse
+from playwright._impl._errors import Error as PlaywrightError
 from playwright.sync_api import Page as SyncPage, Response as SyncResponse
 from playwright.async_api import Page as AsyncPage, Response as AsyncResponse

@@ -24,15 +25,15 @@ class ResponseFactory:

     @classmethod
     @lru_cache(maxsize=16)
-    def __extract_browser_encoding(cls, content_type: str | None) -> Optional[str]:
+    def __extract_browser_encoding(cls, content_type: str | None, default: str = "utf-8") -> str:
         """Extract browser encoding from headers.

         Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8"
         """
         if content_type:
             # Because Playwright can't do that by themselves like all libraries for some reason :3
             match = __CHARSET_RE__.search(content_type)
-            return match.group(1) if match else None
-        return None
+            return match.group(1) if match else default
+        return default

     @classmethod
     def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
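Note: with the new `default` parameter, `__extract_browser_encoding` always returns a string, so call sites no longer need to append `or "utf-8"`. A minimal standalone sketch of the same pattern (the regex and names here are illustrative stand-ins, not scrapling's actual `__CHARSET_RE__`):

    from functools import lru_cache
    from re import compile as re_compile

    CHARSET_RE = re_compile(r"charset=([\w-]+)")  # stand-in pattern

    @lru_cache(maxsize=16)
    def extract_encoding(content_type: str | None, default: str = "utf-8") -> str:
        """Return the charset from a Content-Type header, or `default`."""
        if content_type:
            match = CHARSET_RE.search(content_type)
            return match.group(1) if match else default
        return default

    assert extract_encoding("text/html; charset=ISO-8859-1") == "ISO-8859-1"
    assert extract_encoding("text/html") == "utf-8"  # no charset -> default
    assert extract_encoding(None) == "utf-8"         # missing header -> default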
@@ -58,7 +59,8 @@ class ResponseFactory:
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                or "utf-8",
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": current_response.all_headers() if current_response else {},
                 "request_headers": current_request.all_headers(),
@@ -83,6 +85,7 @@ class ResponseFactory:
         first_response: SyncResponse,
         final_response: Optional[SyncResponse],
         parser_arguments: Dict,
+        automated_page: bool = False,
     ) -> Response:
         """
         Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -98,6 +101,7 @@ class ResponseFactory:
         :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
         :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
             the `Response` object.
+        :param automated_page: If True, the `page_action` argument was used, so content is retrieved through Playwright's page instead of the final response.

         :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
         :rtype: Response
@@ -107,15 +111,13 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")

-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        )  # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)

         history = cls._process_response_history(first_response, parser_arguments)
         try:
-            page_content = page.content()
+            page_content = final_response.text() if not automated_page else cls._get_page_content(page)
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content: {e}")
             page_content = ""
@@ -161,7 +163,8 @@ class ResponseFactory:
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                or "utf-8",
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": await current_response.all_headers() if current_response else {},
                 "request_headers": await current_request.all_headers(),
@@ -179,6 +182,36 @@ class ResponseFactory:

         return history

+    @classmethod
+    def _get_page_content(cls, page: SyncPage) -> str:
+        """
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return page.content() or ""
+            except PlaywrightError:
+                page.wait_for_timeout(500)
+                continue
+        return ""  # pyright: ignore
+
+    @classmethod
+    async def _get_async_page_content(cls, page: AsyncPage) -> str:
+        """
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return (await page.content()) or ""
+            except PlaywrightError:
+                await page.wait_for_timeout(500)
+                continue
+        return ""  # pyright: ignore
+
     @classmethod
     async def from_async_playwright_response(
         cls,
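Note that these helpers retry indefinitely: if `page.content()` keeps raising (for example, the page never stops navigating), the loop never exits, which is also why the trailing `return ""` is unreachable and carries the `# pyright: ignore`. A bounded variant of the same workaround is easy to sketch; `max_attempts` is an illustrative addition, not part of scrapling's API:

    from playwright.sync_api import Page
    from playwright._impl._errors import Error as PlaywrightError

    def get_page_content(page: Page, max_attempts: int = 10) -> str:
        """Retry page.content() around transient 'page is navigating'
        errors (see microsoft/playwright#16108)."""
        for _ in range(max_attempts):
            try:
                return page.content() or ""
            except PlaywrightError:
                page.wait_for_timeout(500)  # let the in-flight navigation settle
        return ""  # give up after max_attempts instead of looping forever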
@@ -186,6 +219,7 @@ class ResponseFactory:
         first_response: AsyncResponse,
         final_response: Optional[AsyncResponse],
         parser_arguments: Dict,
+        automated_page: bool = False,
     ) -> Response:
         """
         Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -201,6 +235,7 @@ class ResponseFactory:
         :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
         :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
             the `Response` object.
+        :param automated_page: If True, the `page_action` argument was used, so content is retrieved through Playwright's page instead of the final response.

         :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
         :rtype: Response
@@ -210,15 +245,13 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")

-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        )  # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)

         history = await cls._async_process_response_history(first_response, parser_arguments)
         try:
-            page_content = await page.content()
+            page_content = await (final_response.text() if not automated_page else cls._get_async_page_content(page))
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content in async: {e}")
             page_content = ""
@@ -255,8 +288,8 @@ class ResponseFactory:
             "encoding": response.encoding or "utf-8",
             "cookies": dict(response.cookies),
             "headers": dict(response.headers),
-            "request_headers": dict(response.request.headers),
-            "method": response.request.method,
+            "request_headers": dict(response.request.headers) if response.request else {},
+            "method": response.request.method if response.request else "GET",
             "history": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
             **parser_arguments,
         }
@@ -8,6 +8,7 @@ from scrapling.core.utils import log
 from scrapling.core._types import (
     Any,
     Dict,
+    cast,
     List,
     Optional,
     Tuple,
@@ -30,10 +31,10 @@ class Response(Selector):
         request_headers: Dict,
         encoding: str = "utf-8",
         method: str = "GET",
-        history: List = None,
-        **selector_config: Dict,
+        history: List | None = None,
+        **selector_config: Any,
     ):
-        adaptive_domain = selector_config.pop("adaptive_domain", None)
+        adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
         self.status = status
         self.reason = reason
         self.cookies = cookies
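`cast` performs no runtime conversion or check; it only tells the type checker what to assume about the value popped from the now-untyped `**selector_config`, and the default becomes `""` instead of `None` to match the narrowed `str` type. A minimal sketch of the pattern (names are illustrative):

    from typing import Any, cast

    def build(**config: Any) -> None:
        # pop() on **config returns Any; cast() narrows it for the type
        # checker without any runtime effect.
        domain: str = cast(str, config.pop("adaptive_domain", ""))
        print(domain.upper())  # str methods now type-check

    build(adaptive_domain="example.com", other_option=True)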
@@ -58,7 +59,7 @@ class BaseFetcher:
     keep_cdata: Optional[bool] = False
     storage_args: Optional[Dict] = None
     keep_comments: Optional[bool] = False
-    adaptive_domain: Optional[str] = None
+    adaptive_domain: str = ""
     parser_keywords: Tuple = (
         "huge_tree",
         "adaptive",
@@ -124,12 +125,8 @@ class BaseFetcher:
             adaptive=cls.adaptive,
             storage=cls.storage,
             storage_args=cls.storage_args,
+            adaptive_domain=cls.adaptive_domain,
         )
-        if cls.adaptive_domain:
-            if not isinstance(cls.adaptive_domain, str):
-                log.warning('[Ignored] The argument "adaptive_domain" must be of string type')
-            else:
-                parser_arguments.update({"adaptive_domain": cls.adaptive_domain})

         return parser_arguments

@@ -212,15 +209,3 @@ class StatusText:
     def get(cls, status_code: int) -> str:
         """Get the phrase for a given HTTP status code."""
         return cls._phrases.get(status_code, "Unknown Status Code")
-
-
-def get_variable_name(var: Any) -> Optional[str]:
-    """Get the name of a variable using global and local scopes.
-    :param var: The variable to find the name for
-    :return: The name of the variable if found, None otherwise
-    """
-    for scope in [globals(), locals()]:
-        for name, value in scope.items():
-            if value is var:
-                return name
-    return None
@@ -7,10 +7,12 @@ from platform import system as platform_system

 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator
+from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS

-from scrapling.core._types import Dict, Optional
+from scrapling.core._types import Dict, Literal, Tuple

 __OS_NAME__ = platform_system()
+OSName = Literal["linux", "macos", "windows"]


 @lru_cache(10, typed=True)
@@ -28,16 +30,20 @@ def generate_convincing_referer(url: str) -> str:


 @lru_cache(1, typed=True)
-def get_os_name() -> Optional[str]:
-    """Get the current OS name in the same format needed for browserforge
+def get_os_name() -> OSName | Tuple:
+    """Get the current OS name in the format browserforge needs; if the OS is unknown, return all supported OSes so browserforge samples from all of them.

     :return: Current OS name or `None` otherwise
     """
-    return {
-        "Linux": "linux",
-        "Darwin": "macos",
-        "Windows": "windows",
-    }.get(__OS_NAME__)
+    match __OS_NAME__:  # pragma: no cover
+        case "Linux":
+            return "linux"
+        case "Darwin":
+            return "macos"
+        case "Windows":
+            return "windows"
+        case _:
+            return SUPPORTED_OPERATING_SYSTEMS


 def generate_headers(browser_mode: bool = False) -> Dict:
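The fallback change matters on platforms `platform_system()` reports as something unexpected (for example, "Java" on Jython, or an empty string in some containers): the old code handed browserforge `None`, while the new code hands it the full `SUPPORTED_OPERATING_SYSTEMS` tuple so the fingerprint OS is sampled from all of them. A hedged usage sketch with browserforge's public API:

    from browserforge.headers import HeaderGenerator
    from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS

    # A known platform pins the fingerprint to one OS...
    print(HeaderGenerator(os="linux", device="desktop").generate().get("User-Agent"))

    # ...while the unknown-platform fallback passes every supported OS,
    # letting browserforge pick one at random per generation.
    print(HeaderGenerator(os=SUPPORTED_OPERATING_SYSTEMS, device="desktop").generate().get("User-Agent"))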
@@ -58,7 +64,6 @@ def generate_headers(browser_mode: bool = False) -> Dict:
             Browser(name="edge", min_version=130),
         ]
     )
-
     return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()

@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
 from playwright.sync_api import Route

 from scrapling.core.utils import log
-from scrapling.core._types import Dict, Optional, Tuple
+from scrapling.core._types import Dict, Tuple, overload, Literal

 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES

 __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
@@ -49,7 +49,15 @@ async def async_intercept_route(route: async_Route):
         await route.continue_()


-def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) -> Optional[Dict | Tuple]:
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[True]) -> Tuple: ...
+
+
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[False] = False) -> Dict: ...
+
+
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: bool = False) -> Dict | Tuple:
     """Validate a proxy and return it in the acceptable format for Playwright
     Reference: https://playwright.dev/python/docs/network#http-proxy

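The two `@overload` stubs let a type checker resolve the return type from the literal value of `as_tuple` at each call site, so callers no longer have to narrow `Dict | Tuple` (or the old `Optional`) themselves. A self-contained sketch of the technique with a toy parser, not scrapling's implementation:

    from typing import Dict, Literal, Tuple, overload

    @overload
    def parse(raw: str, as_tuple: Literal[True]) -> Tuple: ...
    @overload
    def parse(raw: str, as_tuple: Literal[False] = False) -> Dict: ...

    def parse(raw: str, as_tuple: bool = False) -> Dict | Tuple:
        # One runtime implementation; the overloads above only describe
        # its behavior more precisely to the type checker.
        pairs = dict(item.split("=", 1) for item in raw.split(";") if "=" in item)
        return tuple(pairs.items()) if as_tuple else pairs

    d = parse("server=http://1.2.3.4:8080")                 # inferred as Dict
    t = parse("server=http://1.2.3.4:8080", as_tuple=True)  # inferred as Tuple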
@@ -83,7 +91,7 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) ->
         except ValidationError as e:
             raise TypeError(f"Invalid proxy dictionary: {e}")

-    return None
+    raise TypeError(f"Invalid proxy string: {proxy_string}")


 @lru_cache(10, typed=True)
@@ -19,7 +19,17 @@ _LAZY_IMPORTS = {
     "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
 }

-__all__ = ["Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
+__all__ = [
+    "Fetcher",
+    "AsyncFetcher",
+    "FetcherSession",
+    "DynamicFetcher",
+    "DynamicSession",
+    "AsyncDynamicSession",
+    "StealthyFetcher",
+    "StealthySession",
+    "AsyncStealthySession",
+]


 def __getattr__(name: str) -> Any:
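The widened `__all__` now matches the keys of `_LAZY_IMPORTS`, which together with the module-level `__getattr__` implements PEP 562 lazy imports: heavy submodules load on first attribute access, not at `import scrapling.fetchers`. A minimal sketch of how such an `__init__.py` typically works (the module path is illustrative; scrapling's actual body may differ):

    from importlib import import_module
    from typing import Any

    _LAZY_IMPORTS = {
        "Fetcher": ("scrapling.fetchers.requests", "Fetcher"),  # illustrative path
    }
    __all__ = list(_LAZY_IMPORTS)

    def __getattr__(name: str) -> Any:
        # Called only when normal lookup fails, so the submodule is
        # imported on first access instead of at package import time.
        if name in _LAZY_IMPORTS:
            module_path, attr = _LAZY_IMPORTS[name]
            return getattr(import_module(module_path), attr)
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")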
@@ -1,10 +1,9 @@
 from scrapling.core._types import (
     Callable,
-    Dict,
     List,
+    Dict,
     Optional,
     SelectorWaitStates,
-    Iterable,
 )
 from scrapling.engines.toolbelt.custom import BaseFetcher, Response
 from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
@@ -47,10 +46,12 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[Iterable[Dict]] = None,
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        extra_flags: Optional[List[str]] = None,
+        additional_args: Optional[Dict] = None,
         custom_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
@@ -79,7 +80,9 @@ class DynamicFetcher(BaseFetcher):
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
         :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional settings to pass to Playwright's context; they take priority over Scrapling's own settings.
         :return: A `Response` object.
         """
         if not custom_config:
@@ -107,6 +110,8 @@ class DynamicFetcher(BaseFetcher):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            extra_flags=extra_flags,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
             selector_config={**cls._generate_parser_arguments(), **custom_config},
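Together, the two new parameters let a caller tune both the browser launch and the Playwright context per request. A hedged usage sketch (flag and option values are examples only, and `DynamicFetcher.fetch` is assumed as the public entry point):

    from scrapling.fetchers import DynamicFetcher

    page = DynamicFetcher.fetch(
        "https://example.com",
        extra_flags=["--disable-gpu"],        # passed to the browser on launch
        additional_args={"locale": "de-DE"},  # merged into Playwright's context,
                                              # overriding Scrapling's own settings
    )
    print(page.status)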
@@ -134,10 +139,12 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[Iterable[Dict]] = None,
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        extra_flags: Optional[List[str]] = None,
+        additional_args: Optional[Dict] = None,
         custom_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
@@ -166,7 +173,9 @@ class DynamicFetcher(BaseFetcher):
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
         :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional settings to pass to Playwright's context; they take priority over Scrapling's own settings.
         :return: A `Response` object.
         """
         if not custom_config:
@@ -195,6 +204,8 @@ class DynamicFetcher(BaseFetcher):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            extra_flags=extra_flags,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
             selector_config={**cls._generate_parser_arguments(), **custom_config},
@@ -83,8 +83,6 @@ class StealthyFetcher(BaseFetcher):
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")

         with StealthySession(
             wait=wait,
@@ -182,8 +180,6 @@
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")

         async with AsyncStealthySession(
             wait=wait,
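Worth noting about the removed branches: they were dead code, since `ValueError(...)` constructed the exception without `raise` (a silent no-op), and the message interpolated `cls.__class__` (the metaclass) rather than the offending value's type. Had the validation been kept, a working form would look like this sketch:

    if not custom_config:
        custom_config = {}
    elif not isinstance(custom_config, dict):
        # `raise` was missing in the removed code; type(custom_config)
        # names the actual bad type, unlike cls.__class__.
        raise TypeError(
            f"The custom parser config must be a dictionary, got {type(custom_config)}"
        )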