scrapling 0.3.7__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/engines/_browsers/_base.py +140 -9
- scrapling/engines/_browsers/_camoufox.py +47 -164
- scrapling/engines/_browsers/_config_tools.py +8 -2
- scrapling/engines/_browsers/_controllers.py +25 -96
- scrapling/engines/_browsers/_validators.py +72 -61
- scrapling/engines/toolbelt/convertor.py +37 -2
- scrapling/engines/toolbelt/custom.py +0 -12
- scrapling/engines/toolbelt/fingerprints.py +6 -8
- scrapling/fetchers/chrome.py +6 -0
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/METADATA +6 -4
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/RECORD +16 -16
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/WHEEL +0 -0
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/top_level.txt +0 -0
scrapling/engines/_browsers/_controllers.py

```diff
@@ -1,23 +1,20 @@
 from playwright.sync_api import (
-    Response as SyncPlaywrightResponse,
-    sync_playwright,
-    Playwright,
     Locator,
+    Playwright,
+    sync_playwright,
 )
 from playwright.async_api import (
     async_playwright,
-    Response as AsyncPlaywrightResponse,
-    BrowserContext as AsyncBrowserContext,
-    Playwright as AsyncPlaywright,
     Locator as AsyncLocator,
-
+    Playwright as AsyncPlaywright,
+    BrowserContext as AsyncBrowserContext,
 )
 from patchright.sync_api import sync_playwright as sync_patchright
 from patchright.async_api import async_playwright as async_patchright

 from scrapling.core.utils import log
 from ._base import SyncSession, AsyncSession, DynamicSessionMixin
-from ._validators import validate_fetch as _validate
+from ._validators import validate_fetch as _validate, PlaywrightConfig
 from scrapling.core._types import (
     Any,
     Dict,
```
```diff
@@ -98,6 +95,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
         user_data_dir: str = "",
+        extra_flags: Optional[List[str]] = None,
         selector_config: Optional[Dict] = None,
         additional_args: Optional[Dict] = None,
     ):
@@ -127,6 +125,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         """
@@ -152,6 +151,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            extra_flags=extra_flags,
             selector_config=selector_config,
             additional_args=additional_args,
             disable_resources=disable_resources,
```
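Both constructors gain an `extra_flags` parameter that is stored on the session config and, per its docstring, passed to the browser at launch. A hypothetical usage sketch — the `scrapling.fetchers` import path and the Chromium flag values are assumptions for illustration, not taken from this diff:

```python
from scrapling.fetchers import DynamicSession  # assumed public export

# extra_flags (new in 0.3.8) appends extra CLI switches to the browser launch,
# on top of the flags Scrapling already sets itself.
with DynamicSession(extra_flags=["--disable-gpu", "--mute-audio"]) as session:
    page = session.fetch("https://example.com")
```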
```diff
@@ -178,28 +178,6 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         if self.cookies: # pragma: no cover
             self.context.add_cookies(self.cookies)

-    def __enter__(self):
-        self.__create__()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.close()
-
-    def close(self): # pragma: no cover
-        """Close all resources"""
-        if self._closed:
-            return
-
-        if self.context:
-            self.context.close()
-            self.context = None
-
-        if self.playwright:
-            self.playwright.stop()
-            self.playwright = None # pyright: ignore
-
-        self._closed = True
-
     def fetch(
         self,
         url: str,
```
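The `__enter__`/`__exit__`/`close` plumbing disappears from `DynamicSession` here (and from `AsyncDynamicSession` below); since `_base.py` grows by 140 lines in the same release, the lifecycle code was presumably hoisted into the shared `SyncSession`/`AsyncSession` bases. A sketch of the sync side, reconstructed from the deleted lines — its placement in `_base.py` is an inference, not confirmed by this diff:

```python
class SyncSession:  # assumed new home of the deleted methods
    def __enter__(self):
        self.__create__()  # builds self.playwright / self.context (defined elsewhere)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Close all resources, idempotently."""
        if self._closed:
            return
        if self.context:
            self.context.close()
            self.context = None
        if self.playwright:
            self.playwright.stop()
            self.playwright = None
        self._closed = True
```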
```diff
@@ -247,38 +225,26 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
                 ("load_dom", load_dom, self.load_dom),
                 ("selector_config", selector_config, self.selector_config),
             ],
+            PlaywrightConfig,
            _UNSET,
        )

        if self._closed: # pragma: no cover
            raise RuntimeError("Context manager has been closed")

-        final_response = None
        referer = (
            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
        )

-        def handle_response(finished_response: SyncPlaywrightResponse):
-            nonlocal final_response
-            if (
-                finished_response.request.resource_type == "document"
-                and finished_response.request.is_navigation_request()
-                and finished_response.request.frame == page_info.page.main_frame
-            ):
-                final_response = finished_response
-
        page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
-
+        final_response = [None]
+        handle_response = self._create_response_handler(page_info, final_response)

        try: # pragma: no cover
            # Navigate to URL and wait for a specified state
            page_info.page.on("response", handle_response)
            first_response = page_info.page.goto(url, referer=referer)
-
-            page_info.page.wait_for_load_state(state="domcontentloaded")
-
-            if params.network_idle:
-                page_info.page.wait_for_load_state("networkidle")
+            self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)

            if not first_response:
                raise RuntimeError(f"Failed to get response for {url}")
```
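The `handle_response` closure, which needed `nonlocal` to rebind `final_response`, is replaced by a one-element list that a handler built by the new `_create_response_handler` mutates in place. The factory body below is inferred from the deleted closure and the `final_response[0]` read at the call site; it is not copied from the 0.3.8 source of `_base.py`:

```python
def _create_response_handler(self, page_info, final_response):
    """Build a "response" event callback that records the main document response."""

    def handle_response(finished_response):
        if (
            finished_response.request.resource_type == "document"
            and finished_response.request.is_navigation_request()
            and finished_response.request.frame == page_info.page.main_frame
        ):
            final_response[0] = finished_response  # mutate the holder; no nonlocal

    return handle_response
```

Because the callback only assigns into the list, the same synchronous factory can presumably serve both the sync and async sessions (the old async path had to define a separate `async def` closure).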
```diff
@@ -294,11 +260,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
                waiter: Locator = page_info.page.locator(params.wait_selector)
                waiter.first.wait_for(state=params.wait_selector_state)
                # Wait again after waiting for the selector, helpful with protections like Cloudflare
-                page_info.page.
-                if params.load_dom:
-                    page_info.page.wait_for_load_state(state="domcontentloaded")
-                if params.network_idle:
-                    page_info.page.wait_for_load_state("networkidle")
+                self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
            except Exception as e: # pragma: no cover
                log.error(f"Error waiting for selector {params.wait_selector}: {e}")

```
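`_wait_for_page_stability` now stands in for the same wait sequence in four places: after `goto` and after the selector wait, in both the sync and async sessions. Reconstructed from the deleted inline waits, the sync helper plausibly reduces to the following (a sketch; the real helper lives in `_base.py` and may differ in details):

```python
def _wait_for_page_stability(self, page, load_dom: bool, network_idle: bool) -> None:
    # One home for the repeated post-navigation waits.
    if load_dom:
        page.wait_for_load_state(state="domcontentloaded")
    if network_idle:
        page.wait_for_load_state("networkidle")
```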
```diff
@@ -306,7 +268,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):

        # Create response object
        response = ResponseFactory.from_playwright_response(
-            page_info.page, first_response, final_response, params.selector_config
+            page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
        )

        # Close the page to free up resources
```
```diff
@@ -348,6 +310,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
         user_data_dir: str = "",
+        extra_flags: Optional[List[str]] = None,
         selector_config: Optional[Dict] = None,
         additional_args: Optional[Dict] = None,
     ):
@@ -378,6 +341,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
         :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         """
@@ -404,6 +368,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            extra_flags=extra_flags,
             selector_config=selector_config,
             additional_args=additional_args,
             disable_resources=disable_resources,
@@ -431,28 +396,6 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         if self.cookies:
             await self.context.add_cookies(self.cookies) # pyright: ignore

-    async def __aenter__(self):
-        await self.__create__()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        await self.close()
-
-    async def close(self):
-        """Close all resources"""
-        if self._closed: # pragma: no cover
-            return
-
-        if self.context:
-            await self.context.close()
-            self.context = None # pyright: ignore
-
-        if self.playwright:
-            await self.playwright.stop()
-            self.playwright = None # pyright: ignore
-
-        self._closed = True
-
     async def fetch(
         self,
         url: str,
@@ -500,30 +443,24 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
                ("load_dom", load_dom, self.load_dom),
                ("selector_config", selector_config, self.selector_config),
            ],
+            PlaywrightConfig,
            _UNSET,
        )

        if self._closed: # pragma: no cover
            raise RuntimeError("Context manager has been closed")

-        final_response = None
        referer = (
            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
        )

-        async def handle_response(finished_response: AsyncPlaywrightResponse):
-            nonlocal final_response
-            if (
-                finished_response.request.resource_type == "document"
-                and finished_response.request.is_navigation_request()
-                and finished_response.request.frame == page_info.page.main_frame
-            ):
-                final_response = finished_response
-
        page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
-
+        final_response = [None]
+        handle_response = self._create_response_handler(page_info, final_response)

        if TYPE_CHECKING:
+            from playwright.async_api import Page as async_Page
+
            if not isinstance(page_info.page, async_Page):
                raise TypeError
@@ -531,11 +468,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
            # Navigate to URL and wait for a specified state
            page_info.page.on("response", handle_response)
            first_response = await page_info.page.goto(url, referer=referer)
-
-            await page_info.page.wait_for_load_state(state="domcontentloaded")
-
-            if params.network_idle:
-                await page_info.page.wait_for_load_state("networkidle")
+            await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)

            if not first_response:
                raise RuntimeError(f"Failed to get response for {url}")
@@ -551,11 +484,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
                waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
                await waiter.first.wait_for(state=params.wait_selector_state)
                # Wait again after waiting for the selector, helpful with protections like Cloudflare
-                await page_info.page.
-                if self.load_dom:
-                    await page_info.page.wait_for_load_state(state="domcontentloaded")
-                if params.network_idle:
-                    await page_info.page.wait_for_load_state("networkidle")
+                await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
            except Exception as e:
                log.error(f"Error waiting for selector {params.wait_selector}: {e}")

@@ -563,7 +492,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):

        # Create response object
        response = await ResponseFactory.from_async_playwright_response(
-            page_info.page, first_response, final_response, params.selector_config
+            page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
        )

        # Close the page to free up resources
```
scrapling/engines/_browsers/_validators.py

```diff
@@ -1,7 +1,8 @@
 from pathlib import Path
 from typing import Annotated
-from
+from functools import lru_cache
 from urllib.parse import urlparse
+from dataclasses import dataclass, fields

 from msgspec import Struct, Meta, convert, ValidationError

```
```diff
@@ -19,18 +20,20 @@ from scrapling.engines.toolbelt.navigation import construct_proxy_dict


 # Custom validators for msgspec
-
+@lru_cache(8)
+def _is_invalid_file_path(value: str) -> bool | str:
     """Fast file path validation"""
     path = Path(value)
     if not path.exists():
-
+        return f"Init script path not found: {value}"
     if not path.is_file():
-
+        return f"Init script is not a file: {value}"
     if not path.is_absolute():
-
+        return f"Init script is not a absolute path: {value}"
+    return False


-def _validate_addon_path(value: str):
+def _validate_addon_path(value: str) -> None:
     """Fast addon path validation"""
     path = Path(value)
     if not path.exists():
@@ -39,22 +42,16 @@ def _validate_addon_path(value: str):
        raise ValueError(f"Addon path must be a directory of the extracted addon: {value}")


-
+@lru_cache(2)
+def _is_invalid_cdp_url(cdp_url: str) -> bool | str:
     """Fast CDP URL validation"""
-
-
-    if not cdp_url.startswith(("ws://", "wss://")):
-        raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
-
-    # Validate hostname and port
-    if not urlparse(cdp_url).netloc:
-        raise ValueError("Invalid hostname for the CDP URL")
+    if not cdp_url.startswith(("ws://", "wss://")):
+        return "CDP URL must use 'ws://' or 'wss://' scheme"

-
-
-
-
-        raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
+    netloc = urlparse(cdp_url).netloc
+    if not netloc:
+        return "Invalid hostname for the CDP URL"
+    return False


 # Type aliases for cleaner annotations
```
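The path and CDP validators flip from raising `ValueError` to returning an error message (or `False`), which is what lets the new `@lru_cache` decorators help: `functools.lru_cache` caches only values a function returns, never exceptions it raises, so a raising validator would redo its filesystem or URL checks on every repeated call. A standalone demonstration of the pattern with a toy validator (not the package's):

```python
from functools import lru_cache

calls = 0

@lru_cache(8)
def is_invalid(value: str) -> bool | str:
    global calls
    calls += 1
    return f"empty value: {value!r}" if not value else False

msg = is_invalid("")
if msg:  # the caller raises at the call site, as __post_init__ now does
    print(msg)
is_invalid("")  # second check is served from the cache
assert calls == 1
```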
```diff
@@ -62,7 +59,7 @@ PagesCount = Annotated[int, Meta(ge=1, le=50)]
 Seconds = Annotated[int, float, Meta(ge=0)]


-class PlaywrightConfig(Struct, kw_only=True, frozen=False):
+class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
     """Configuration struct for validation"""

     max_pages: PagesCount = 1
@@ -88,6 +85,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     load_dom: bool = True
     wait_selector_state: SelectorWaitStates = "attached"
     user_data_dir: str = ""
+    extra_flags: Optional[List[str]] = None
     selector_config: Optional[Dict] = {}
     additional_args: Optional[Dict] = {}

```
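Both config Structs also gain `weakref=True`. msgspec Structs are slots-based and do not support weak references unless this is opted into, so the change allows instances to be targets of `weakref.ref` (useful, for example, for caches that should not keep configs alive). A minimal check:

```python
import weakref

from msgspec import Struct


class Cfg(Struct, weakref=True):  # without weakref=True, weakref.ref raises TypeError
    max_pages: int = 1


cfg = Cfg()
assert weakref.ref(cfg)() is cfg
```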
```diff
@@ -98,20 +96,26 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
        if self.proxy:
            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
        if self.cdp_url:
-
+            cdp_msg = _is_invalid_cdp_url(self.cdp_url)
+            if cdp_msg:
+                raise ValueError(cdp_msg)

        if not self.cookies:
            self.cookies = []
+        if not self.extra_flags:
+            self.extra_flags = []
        if not self.selector_config:
            self.selector_config = {}
        if not self.additional_args:
            self.additional_args = {}

        if self.init_script is not None:
-
+            validation_msg = _is_invalid_file_path(self.init_script)
+            if validation_msg:
+                raise ValueError(validation_msg)


-class CamoufoxConfig(Struct, kw_only=True, frozen=False):
+class CamoufoxConfig(Struct, kw_only=True, frozen=False, weakref=True):
    """Configuration struct for validation"""

    max_pages: PagesCount = 1
@@ -149,14 +153,16 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
        if self.proxy:
            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)

-        if self.addons
+        if self.addons:
            for addon in self.addons:
                _validate_addon_path(addon)
        else:
            self.addons = []

        if self.init_script is not None:
-
+            validation_msg = _is_invalid_file_path(self.init_script)
+            if validation_msg:
+                raise ValueError(validation_msg)

        if not self.cookies:
            self.cookies = []
@@ -169,27 +175,6 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
            self.additional_args = {}


-# Code parts to validate `fetch` in the least possible numbers of lines overall
-class FetchConfig(Struct, kw_only=True):
-    """Configuration struct for `fetch` calls validation"""
-
-    google_search: bool = True
-    timeout: Seconds = 30000
-    wait: Seconds = 0
-    page_action: Optional[Callable] = None
-    extra_headers: Optional[Dict[str, str]] = None
-    disable_resources: bool = False
-    wait_selector: Optional[str] = None
-    wait_selector_state: SelectorWaitStates = "attached"
-    network_idle: bool = False
-    load_dom: bool = True
-    solve_cloudflare: bool = False
-    selector_config: Dict = {}
-
-    def to_dict(self):
-        return {f: getattr(self, f) for f in self.__struct_fields__}
-
-
 @dataclass
 class _fetch_params:
     """A dataclass of all parameters used by `fetch` calls"""
```
```diff
@@ -208,7 +193,9 @@ class _fetch_params:
     selector_config: Dict


-def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
+def validate_fetch(
+    params: List[Tuple], model: type[PlaywrightConfig] | type[CamoufoxConfig], sentinel=None
+) -> _fetch_params:
     result = {}
     overrides = {}

@@ -219,32 +206,56 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
            result[arg] = session_value

    if overrides:
-
-
-
+        validated_config = validate(overrides, model)
+        # Extract only the fields that _fetch_params needs from validated_config
+        validated_dict = {
+            f.name: getattr(validated_config, f.name)
+            for f in fields(_fetch_params)
+            if hasattr(validated_config, f.name)
+        }
+        # solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig)
+        validated_dict.setdefault("solve_cloudflare", False)

-
-
+        validated_dict.update(result)
+        return _fetch_params(**validated_dict)
+
+    result.setdefault("solve_cloudflare", False)

    return _fetch_params(**result)


-
-
+# Cache default values for each model to reduce validation overhead
+models_default_values = {}
+
+for _model in (CamoufoxConfig, PlaywrightConfig):
+    _defaults = {}
+    if hasattr(_model, "__struct_defaults__") and hasattr(_model, "__struct_fields__"):
+        for field_name, default_value in zip(_model.__struct_fields__, _model.__struct_defaults__):  # type: ignore
+            # Skip factory defaults - these are msgspec._core.Factory instances
+            if type(default_value).__name__ != "Factory":
+                _defaults[field_name] = default_value
+
+    models_default_values[_model.__name__] = _defaults.copy()
+
+
+def _filter_defaults(params: Dict, model: str) -> Dict:
+    """Filter out parameters that match their default values to reduce validation overhead."""
+    defaults = models_default_values[model]
+    return {k: v for k, v in params.items() if k not in defaults or v != defaults[k]}


 @overload
-def validate(params: Dict, model: type[
+def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...


 @overload
-def validate(params: Dict, model: type[
+def validate(params: Dict, model: type[CamoufoxConfig]) -> CamoufoxConfig: ...


-def validate(
-    params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig] | type[FetchConfig]
-) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
+def validate(params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig]) -> PlaywrightConfig | CamoufoxConfig:
    try:
-
+        # Filter out params with the default values (no need to validate them) to speed up validation
+        filtered = _filter_defaults(params, model.__name__)
+        return convert(filtered, model)
    except ValidationError as e:
        raise TypeError(f"Invalid argument type: {e}") from e
```
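The module now precomputes each config Struct's non-factory defaults once at import time so that `validate` can drop every parameter still equal to its default before calling `msgspec.convert`, shrinking the validation work per `fetch` call. The mechanism in isolation, using a toy Struct where every field has a default (the zip of `__struct_fields__` with `__struct_defaults__` relies on that):

```python
from msgspec import Struct, convert


class Toy(Struct, kw_only=True):
    retries: int = 3
    timeout: float = 30.0


# Align field names with their defaults (valid here because all fields have one).
defaults = dict(zip(Toy.__struct_fields__, Toy.__struct_defaults__))

params = {"retries": 3, "timeout": 5.0}  # retries matches its default
filtered = {k: v for k, v in params.items() if k not in defaults or v != defaults[k]}
assert filtered == {"timeout": 5.0}
print(convert(filtered, Toy))  # Toy(retries=3, timeout=5.0) -- defaults fill the gaps
```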
scrapling/engines/toolbelt/convertor.py

```diff
@@ -2,6 +2,7 @@ from functools import lru_cache
 from re import compile as re_compile

 from curl_cffi.requests import Response as CurlResponse
+from playwright._impl._errors import Error as PlaywrightError
 from playwright.sync_api import Page as SyncPage, Response as SyncResponse
 from playwright.async_api import Page as AsyncPage, Response as AsyncResponse

@@ -84,6 +85,7 @@ class ResponseFactory:
         first_response: SyncResponse,
         final_response: Optional[SyncResponse],
         parser_arguments: Dict,
+        automated_page: bool = False,
     ) -> Response:
         """
         Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -99,6 +101,7 @@ class ResponseFactory:
         :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
         :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
             the `Response` object.
+        :param automated_page: If True, it means the `page_action` argument was being used, so the response retrieving method changes to use Playwright's page instead of the final response.

         :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
         :rtype: Response
@@ -114,7 +117,7 @@ class ResponseFactory:

        history = cls._process_response_history(first_response, parser_arguments)
        try:
-            page_content = final_response.text()
+            page_content = final_response.text() if not automated_page else cls._get_page_content(page)
        except Exception as e:  # pragma: no cover
            log.error(f"Error getting page content: {e}")
            page_content = ""
@@ -179,6 +182,36 @@ class ResponseFactory:

        return history

+    @classmethod
+    def _get_page_content(cls, page: SyncPage) -> str:
+        """
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return page.content() or ""
+            except PlaywrightError:
+                page.wait_for_timeout(500)
+                continue
+        return ""  # pyright: ignore
+
+    @classmethod
+    async def _get_async_page_content(cls, page: AsyncPage) -> str:
+        """
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return (await page.content()) or ""
+            except PlaywrightError:
+                await page.wait_for_timeout(500)
+                continue
+        return ""  # pyright: ignore
+
     @classmethod
     async def from_async_playwright_response(
         cls,
@@ -186,6 +219,7 @@ class ResponseFactory:
         first_response: AsyncResponse,
         final_response: Optional[AsyncResponse],
         parser_arguments: Dict,
+        automated_page: bool = False,
     ) -> Response:
         """
         Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -201,6 +235,7 @@ class ResponseFactory:
         :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
         :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
             the `Response` object.
+        :param automated_page: If True, it means the `page_action` argument was being used, so the response retrieving method changes to use Playwright's page instead of the final response.

         :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
         :rtype: Response
@@ -216,7 +251,7 @@ class ResponseFactory:

        history = await cls._async_process_response_history(first_response, parser_arguments)
        try:
-            page_content = await final_response.text()
+            page_content = await (final_response.text() if not automated_page else cls._get_async_page_content(page))
        except Exception as e:  # pragma: no cover
            log.error(f"Error getting page content in async: {e}")
            page_content = ""
```
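The two new `_get_page_content` helpers work around Playwright's "unable to retrieve content because the page is navigating" failure (microsoft/playwright#16108): `page.content()` raises if a navigation is in flight, so the factory now waits 500 ms and retries instead of surfacing the error. The same pattern using Playwright's public error alias (the diff imports the private `playwright._impl._errors` path; `playwright.sync_api.Error` is the supported spelling):

```python
from playwright.sync_api import Error as PlaywrightError, Page


def content_with_retry(page: Page) -> str:
    while True:
        try:
            return page.content() or ""
        except PlaywrightError:
            page.wait_for_timeout(500)  # navigation in flight; try again shortly
```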
scrapling/engines/toolbelt/custom.py

```diff
@@ -209,15 +209,3 @@ class StatusText:
     def get(cls, status_code: int) -> str:
         """Get the phrase for a given HTTP status code."""
         return cls._phrases.get(status_code, "Unknown Status Code")
-
-
-def get_variable_name(var: Any) -> Optional[str]:
-    """Get the name of a variable using global and local scopes.
-    :param var: The variable to find the name for
-    :return: The name of the variable if found, None otherwise
-    """
-    for scope in [globals(), locals()]:
-        for name, value in scope.items():
-            if value is var:
-                return name
-    return None
```
scrapling/engines/toolbelt/fingerprints.py

```diff
@@ -7,8 +7,9 @@ from platform import system as platform_system

 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator
+from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS

-from scrapling.core._types import Dict, Literal
+from scrapling.core._types import Dict, Literal, Tuple

 __OS_NAME__ = platform_system()
 OSName = Literal["linux", "macos", "windows"]
@@ -29,12 +30,12 @@ def generate_convincing_referer(url: str) -> str:


 @lru_cache(1, typed=True)
-def get_os_name() -> OSName | None:
+def get_os_name() -> OSName | Tuple:
     """Get the current OS name in the same format needed for browserforge, if the OS is Unknown, return None so browserforge uses all.

     :return: Current OS name or `None` otherwise
     """
-    match __OS_NAME__:
+    match __OS_NAME__:  # pragma: no cover
         case "Linux":
             return "linux"
         case "Darwin":
@@ -42,7 +43,7 @@ def get_os_name() -> OSName | None:
         case "Windows":
             return "windows"
         case _:
-            return
+            return SUPPORTED_OPERATING_SYSTEMS
@@ -63,10 +64,7 @@ def generate_headers(browser_mode: bool = False) -> Dict:
            Browser(name="edge", min_version=130),
        ]
    )
-
-        return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
-    else:
-        return HeaderGenerator(browser=browsers, device="desktop").generate()
+    return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()


 __default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")
```
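The `get_os_name` change is what lets the `if/else` in `generate_headers` collapse: instead of returning `None` on unrecognized platforms (the docstring still describes the old behavior), it now returns browserforge's `SUPPORTED_OPERATING_SYSTEMS` tuple, and `HeaderGenerator` accepts either a single OS name or a sequence of candidates. A sketch of the resulting call (values illustrative):

```python
from browserforge.headers import Browser, HeaderGenerator
from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS

os_name = "linux"  # or SUPPORTED_OPERATING_SYSTEMS on an unknown platform
headers = HeaderGenerator(
    browser=[Browser(name="chrome", min_version=130)],
    os=os_name,
    device="desktop",
).generate()
print(headers.get("User-Agent"))
```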