PyPI - scrapling - Versions diffs - 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl - Mend

scrapling 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

scrapling/__init__.py +1 -1
scrapling/core/_types.py +3 -0
scrapling/core/ai.py +2 -1
scrapling/core/custom_types.py +20 -27
scrapling/core/mixins.py +15 -9
scrapling/core/shell.py +4 -3
scrapling/core/storage.py +5 -5
scrapling/core/translator.py +13 -8
scrapling/engines/_browsers/_base.py +37 -14
scrapling/engines/_browsers/_camoufox.py +76 -35
scrapling/engines/_browsers/_config_tools.py +1 -1
scrapling/engines/_browsers/_controllers.py +32 -11
scrapling/engines/_browsers/_validators.py +31 -10
scrapling/engines/static.py +678 -668
scrapling/engines/toolbelt/convertor.py +13 -15
scrapling/engines/toolbelt/custom.py +6 -9
scrapling/engines/toolbelt/fingerprints.py +17 -10
scrapling/engines/toolbelt/navigation.py +11 -3
scrapling/fetchers/__init__.py +11 -1
scrapling/fetchers/chrome.py +9 -4
scrapling/fetchers/firefox.py +0 -4
scrapling/parser.py +105 -80
{scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/METADATA +3 -4
scrapling-0.3.7.dist-info/RECORD +47 -0
scrapling-0.3.6.dist-info/RECORD +0 -47
{scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
{scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
{scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
{scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0

scrapling/engines/_browsers/_camoufox.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from random import randint
 from re import compile as re_compile
 from playwright.sync_api import (
@@ -20,10 +21,12 @@ from ._validators import validate_fetch as _validate
 from ._base import SyncSession, AsyncSession, StealthySessionMixin
 from scrapling.core.utils import log
 from scrapling.core._types import (
+    Any,
     Dict,
     List,
     Optional,
     Callable,
+    TYPE_CHECKING,
     SelectorWaitStates,
 )
 from scrapling.engines.toolbelt.convertor import (
@@ -33,7 +36,7 @@ from scrapling.engines.toolbelt.convertor import (
 from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
 __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
-_UNSET = object()
+_UNSET: Any = object()
 class StealthySession(StealthySessionMixin, SyncSession):
@@ -101,6 +104,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
         os_randomize: bool = False,
         disable_ads: bool = False,
         geoip: bool = False,
+        user_data_dir: str = "",
         selector_config: Optional[Dict] = None,
         additional_args: Optional[Dict] = None,
     ):
@@ -133,6 +137,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
         """
@@ -156,6 +161,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
             block_images=block_images,
             block_webrtc=block_webrtc,
             os_randomize=os_randomize,
+            user_data_dir=user_data_dir,
             wait_selector=wait_selector,
             google_search=google_search,
             extra_headers=extra_headers,
@@ -170,9 +176,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
     def __create__(self):
         """Create a browser for this instance and context."""
         self.playwright = sync_playwright().start()
-        self.context = self.playwright.firefox.launch_persistent_context(  # pragma: no cover
-            **self.launch_options
-        )
+        self.context = self.playwright.firefox.launch_persistent_context(**self.launch_options)
         if self.init_script:  # pragma: no cover
             self.context.add_init_script(path=self.init_script)
@@ -203,9 +207,9 @@ class StealthySession(StealthySessionMixin, SyncSession):
         self._closed = True
     @staticmethod
-    def _get_page_content(page: Page) -> str | None:
+    def _get_page_content(page: Page) -> str:
         """
-        A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
         :param page: The page to extract content from.
         :return:
         """
@@ -215,6 +219,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
             except PlaywrightError:
                 page.wait_for_timeout(1000)
                 continue
+        return ""  # pyright: ignore
     def _solve_cloudflare(self, page: Page) -> None:  # pragma: no cover
         """Solve the cloudflare challenge displayed on the playwright page passed
@@ -222,6 +227,10 @@ class StealthySession(StealthySessionMixin, SyncSession):
         :param page: The targeted page
         :return:
         """
+        try:
+            page.wait_for_load_state("networkidle", timeout=5000)
+        except PlaywrightError:
+            pass
         challenge_type = self._detect_cloudflare(self._get_page_content(page))
         if not challenge_type:
             log.error("No Cloudflare challenge found.")
@@ -244,26 +253,35 @@ class StealthySession(StealthySessionMixin, SyncSession):
                         # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
                         page.wait_for_timeout(500)
+                outer_box = {}
                 iframe = page.frame(url=__CF_PATTERN__)
-                if iframe is None:
-                    log.error("Didn't find Cloudflare iframe!")
-                    return
+                if iframe is not None:
+                    iframe.wait_for_load_state(state="domcontentloaded")
+                    iframe.wait_for_load_state("networkidle")
-                if challenge_type != "embedded":
-                    while not iframe.frame_element().is_visible():
-                        # Double-checking that the iframe is loaded
-                        page.wait_for_timeout(500)
+                    if challenge_type != "embedded":
+                        while not iframe.frame_element().is_visible():
+                            # Double-checking that the iframe is loaded
+                            page.wait_for_timeout(500)
+                    outer_box: Any = iframe.frame_element().bounding_box()
+                if not iframe or not outer_box:
+                    outer_box: Any = page.locator(box_selector).last.bounding_box()
-                iframe.wait_for_load_state(state="domcontentloaded")
-                iframe.wait_for_load_state("networkidle")
                 # Calculate the Captcha coordinates for any viewport
-                outer_box = page.locator(box_selector).last.bounding_box()
-                captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
+                captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
                 # Move the mouse to the center of the window, then press and hold the left mouse button
                 page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
+                page.wait_for_load_state("networkidle")
+                if iframe is not None:
+                    # Wait for the frame to be removed from the page
+                    while iframe in page.frames:
+                        page.wait_for_timeout(100)
                 if challenge_type != "embedded":
+                    page.locator(box_selector).last.wait_for(state="detached")
                     page.locator(".zone-name-title").wait_for(state="hidden")
+                page.wait_for_load_state(state="load")
                 page.wait_for_load_state(state="domcontentloaded")
                 log.info("Cloudflare captcha is solved")
@@ -335,6 +353,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
             if (
                 finished_response.request.resource_type == "document"
                 and finished_response.request.is_navigation_request()
+                and finished_response.request.frame == page_info.page.main_frame
             ):
                 final_response = finished_response
@@ -387,7 +406,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
                 page_info.page, first_response, final_response, params.selector_config
             )
-            # Close the page, to free up resources
+            # Close the page to free up resources
             page_info.page.close()
             self.page_pool.pages.remove(page_info)
@@ -427,6 +446,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
         os_randomize: bool = False,
         disable_ads: bool = False,
         geoip: bool = False,
+        user_data_dir: str = "",
         selector_config: Optional[Dict] = None,
         additional_args: Optional[Dict] = None,
     ):
@@ -460,6 +480,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
+        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
         """
@@ -485,6 +506,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
             wait_selector=wait_selector,
             google_search=google_search,
             extra_headers=extra_headers,
+            user_data_dir=user_data_dir,
             additional_args=additional_args,
             selector_config=selector_config,
             solve_cloudflare=solve_cloudflare,
@@ -504,7 +526,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
             await self.context.add_init_script(path=self.init_script)
         if self.cookies:
-            await self.context.add_cookies(self.cookies)
+            await self.context.add_cookies(self.cookies)  # pyright: ignore [reportArgumentType]
     async def __aenter__(self):
         await self.__create__()
@@ -520,18 +542,18 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
         if self.context:
             await self.context.close()
-            self.context = None
+            self.context = None  # pyright: ignore
         if self.playwright:
             await self.playwright.stop()
-            self.playwright = None
+            self.playwright = None  # pyright: ignore
         self._closed = True
     @staticmethod
-    async def _get_page_content(page: async_Page) -> str | None:
+    async def _get_page_content(page: async_Page) -> str:
         """
-        A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
         :param page: The page to extract content from.
         :return:
         """
@@ -541,6 +563,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
             except PlaywrightError:
                 await page.wait_for_timeout(1000)
                 continue
+        return ""  # pyright: ignore
     async def _solve_cloudflare(self, page: async_Page):
         """Solve the cloudflare challenge displayed on the playwright page passed. The async version
@@ -548,6 +571,10 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
         :param page: The async targeted page
         :return:
         """
+        try:
+            await page.wait_for_load_state("networkidle", timeout=5000)
+        except PlaywrightError:
+            pass
         challenge_type = self._detect_cloudflare(await self._get_page_content(page))
         if not challenge_type:
             log.error("No Cloudflare challenge found.")
@@ -570,26 +597,35 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
                         # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
                         await page.wait_for_timeout(500)
+                outer_box = {}
                 iframe = page.frame(url=__CF_PATTERN__)
-                if iframe is None:
-                    log.error("Didn't find Cloudflare iframe!")
-                    return
+                if iframe is not None:
+                    await iframe.wait_for_load_state(state="domcontentloaded")
+                    await iframe.wait_for_load_state("networkidle")
-                if challenge_type != "embedded":
-                    while not await (await iframe.frame_element()).is_visible():
-                        # Double-checking that the iframe is loaded
-                        await page.wait_for_timeout(500)
+                    if challenge_type != "embedded":
+                        while not await (await iframe.frame_element()).is_visible():
+                            # Double-checking that the iframe is loaded
+                            await page.wait_for_timeout(500)
+                    outer_box: Any = await (await iframe.frame_element()).bounding_box()
+                if not iframe or not outer_box:
+                    outer_box: Any = await page.locator(box_selector).last.bounding_box()
-                await iframe.wait_for_load_state(state="domcontentloaded")
-                await iframe.wait_for_load_state("networkidle")
                 # Calculate the Captcha coordinates for any viewport
-                outer_box = await page.locator(box_selector).last.bounding_box()
-                captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
+                captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
                 # Move the mouse to the center of the window, then press and hold the left mouse button
                 await page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
+                await page.wait_for_load_state("networkidle")
+                if iframe is not None:
+                    # Wait for the frame to be removed from the page
+                    while iframe in page.frames:
+                        await page.wait_for_timeout(100)
                 if challenge_type != "embedded":
+                    await page.locator(box_selector).wait_for(state="detached")
                     await page.locator(".zone-name-title").wait_for(state="hidden")
+                await page.wait_for_load_state(state="load")
                 await page.wait_for_load_state(state="domcontentloaded")
                 log.info("Cloudflare captcha is solved")
@@ -661,12 +697,17 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
             if (
                 finished_response.request.resource_type == "document"
                 and finished_response.request.is_navigation_request()
+                and finished_response.request.frame == page_info.page.main_frame
             ):
                 final_response = finished_response
         page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)
+        if TYPE_CHECKING:
+            if not isinstance(page_info.page, async_Page):
+                raise TypeError
         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
@@ -715,7 +756,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
                 page_info.page, first_response, final_response, params.selector_config
             )
-            # Close the page, to free up resources
+            # Close the page to free up resources
             await page_info.page.close()
             self.page_pool.pages.remove(page_info)

scrapling/engines/_browsers/_config_tools.py CHANGED Viewed

@@ -62,7 +62,7 @@ def _set_flags(hide_canvas, disable_webgl):  # pragma: no cover
 @lru_cache(2, typed=True)
 def _launch_kwargs(
     headless,
-    proxy,
+    proxy: Tuple,
     locale,
     extra_headers,
     useragent,

scrapling/engines/_browsers/_controllers.py CHANGED Viewed

@@ -10,6 +10,7 @@ from playwright.async_api import (
     BrowserContext as AsyncBrowserContext,
     Playwright as AsyncPlaywright,
     Locator as AsyncLocator,
+    Page as async_Page,
 )
 from patchright.sync_api import sync_playwright as sync_patchright
 from patchright.async_api import async_playwright as async_patchright
@@ -18,10 +19,12 @@ from scrapling.core.utils import log
 from ._base import SyncSession, AsyncSession, DynamicSessionMixin
 from ._validators import validate_fetch as _validate
 from scrapling.core._types import (
+    Any,
     Dict,
     List,
     Optional,
     Callable,
+    TYPE_CHECKING,
     SelectorWaitStates,
 )
 from scrapling.engines.toolbelt.convertor import (
@@ -30,7 +33,7 @@ from scrapling.engines.toolbelt.convertor import (
 )
 from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
-_UNSET = object()
+_UNSET: Any = object()
 class DynamicSession(DynamicSessionMixin, SyncSession):
@@ -94,7 +97,9 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        user_data_dir: str = "",
         selector_config: Optional[Dict] = None,
+        additional_args: Optional[Dict] = None,
     ):
         """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
@@ -121,7 +126,9 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         """
         self.__validate__(
             wait=wait,
@@ -140,11 +147,13 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
             hide_canvas=hide_canvas,
             init_script=init_script,
             network_idle=network_idle,
+            user_data_dir=user_data_dir,
             google_search=google_search,
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
             selector_config=selector_config,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
         )
@@ -154,14 +163,14 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         """Create a browser for this instance and context."""
         sync_context = sync_patchright if self.stealth else sync_playwright
-        self.playwright: Playwright = sync_context().start()
+        self.playwright: Playwright = sync_context().start()  # pyright: ignore [reportAttributeAccessIssue]
         if self.cdp_url:  # pragma: no cover
             self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
                 **self.context_options
             )
         else:
-            self.context = self.playwright.chromium.launch_persistent_context(user_data_dir="", **self.launch_options)
+            self.context = self.playwright.chromium.launch_persistent_context(**self.launch_options)
         if self.init_script:  # pragma: no cover
             self.context.add_init_script(path=self.init_script)
@@ -187,7 +196,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         if self.playwright:
             self.playwright.stop()
-            self.playwright = None
+            self.playwright = None  # pyright: ignore
         self._closed = True
@@ -254,6 +263,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
             if (
                 finished_response.request.resource_type == "document"
                 and finished_response.request.is_navigation_request()
+                and finished_response.request.frame == page_info.page.main_frame
             ):
                 final_response = finished_response
@@ -299,7 +309,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
                 page_info.page, first_response, final_response, params.selector_config
             )
-            # Close the page, to free up resources
+            # Close the page to free up resources
             page_info.page.close()
             self.page_pool.pages.remove(page_info)
@@ -337,7 +347,9 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        user_data_dir: str = "",
         selector_config: Optional[Dict] = None,
+        additional_args: Optional[Dict] = None,
     ):
         """A Browser session manager with page pooling
@@ -365,7 +377,9 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
+        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         """
         self.__validate__(
@@ -385,11 +399,13 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
             hide_canvas=hide_canvas,
             init_script=init_script,
             network_idle=network_idle,
+            user_data_dir=user_data_dir,
             google_search=google_search,
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
             selector_config=selector_config,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
         )
@@ -399,21 +415,21 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         """Create a browser for this instance and context."""
         async_context = async_patchright if self.stealth else async_playwright
-        self.playwright: AsyncPlaywright = await async_context().start()
+        self.playwright: AsyncPlaywright = await async_context().start()  # pyright: ignore [reportAttributeAccessIssue]
         if self.cdp_url:
             browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
             self.context: AsyncBrowserContext = await browser.new_context(**self.context_options)
         else:
             self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
-                user_data_dir="", **self.launch_options
+                **self.launch_options
             )
         if self.init_script:  # pragma: no cover
             await self.context.add_init_script(path=self.init_script)
         if self.cookies:
-            await self.context.add_cookies(self.cookies)
+            await self.context.add_cookies(self.cookies)  # pyright: ignore
     async def __aenter__(self):
         await self.__create__()
@@ -429,11 +445,11 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         if self.context:
             await self.context.close()
-            self.context = None
+            self.context = None  # pyright: ignore
         if self.playwright:
             await self.playwright.stop()
-            self.playwright = None
+            self.playwright = None  # pyright: ignore
         self._closed = True
@@ -500,12 +516,17 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
             if (
                 finished_response.request.resource_type == "document"
                 and finished_response.request.is_navigation_request()
+                and finished_response.request.frame == page_info.page.main_frame
             ):
                 final_response = finished_response
         page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)
+        if TYPE_CHECKING:
+            if not isinstance(page_info.page, async_Page):
+                raise TypeError
         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
@@ -545,7 +566,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
                 page_info.page, first_response, final_response, params.selector_config
             )
-            # Close the page, to free up resources
+            # Close the page to free up resources
             await page_info.page.close()
             self.page_pool.pages.remove(page_info)
             return response

scrapling/engines/_browsers/_validators.py CHANGED Viewed

@@ -11,7 +11,9 @@ from scrapling.core._types import (
     Tuple,
     Optional,
     Callable,
+    Iterable,
     SelectorWaitStates,
+    overload,
 )
 from scrapling.engines.toolbelt.navigation import construct_proxy_dict
@@ -73,7 +75,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     stealth: bool = False
     wait: Seconds = 0
     page_action: Optional[Callable] = None
-    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
+    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
     locale: str = "en-US"
     extra_headers: Optional[Dict[str, str]] = None
     useragent: Optional[str] = None
@@ -81,11 +83,13 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     init_script: Optional[str] = None
     disable_resources: bool = False
     wait_selector: Optional[str] = None
-    cookies: Optional[List[Dict]] = None
+    cookies: Optional[Iterable[Dict]] = None
     network_idle: bool = False
     load_dom: bool = True
     wait_selector_state: SelectorWaitStates = "attached"
-    selector_config: Optional[Dict] = None
+    user_data_dir: str = ""
+    selector_config: Optional[Dict] = {}
+    additional_args: Optional[Dict] = {}
     def __post_init__(self):
         """Custom validation after msgspec validation"""
@@ -100,6 +104,8 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
             self.cookies = []
         if not self.selector_config:
             self.selector_config = {}
+        if not self.additional_args:
+            self.additional_args = {}
         if self.init_script is not None:
             _validate_file_path(self.init_script)
@@ -125,15 +131,16 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
     wait_selector: Optional[str] = None
     addons: Optional[List[str]] = None
     wait_selector_state: SelectorWaitStates = "attached"
-    cookies: Optional[List[Dict]] = None
+    cookies: Optional[Iterable[Dict]] = None
     google_search: bool = True
     extra_headers: Optional[Dict[str, str]] = None
-    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
+    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
     os_randomize: bool = False
     disable_ads: bool = False
     geoip: bool = False
-    selector_config: Optional[Dict] = None
-    additional_args: Optional[Dict] = None
+    user_data_dir: str = ""
+    selector_config: Optional[Dict] = {}
+    additional_args: Optional[Dict] = {}
     def __post_init__(self):
         """Custom validation after msgspec validation"""
@@ -177,7 +184,7 @@ class FetchConfig(Struct, kw_only=True):
     network_idle: bool = False
     load_dom: bool = True
     solve_cloudflare: bool = False
-    selector_config: Optional[Dict] = {}
+    selector_config: Dict = {}
     def to_dict(self):
         return {f: getattr(self, f) for f in self.__struct_fields__}
@@ -198,7 +205,7 @@ class _fetch_params:
     network_idle: bool
     load_dom: bool
     solve_cloudflare: bool
-    selector_config: Optional[Dict]
+    selector_config: Dict
 def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
@@ -222,7 +229,21 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
     return _fetch_params(**result)
-def validate(params: Dict, model) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
+@overload
+def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
+@overload
+def validate(params: Dict, model: type[CamoufoxConfig]) -> CamoufoxConfig: ...
+@overload
+def validate(params: Dict, model: type[FetchConfig]) -> FetchConfig: ...
+def validate(
+    params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig] | type[FetchConfig]
+) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
     try:
         return convert(params, model)
     except ValidationError as e:

scrapling 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

scrapling 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl