scrapling-0.3.1-py3-none-any.whl → scrapling-0.3.2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- scrapling/__init__.py +1 -1
- scrapling/cli.py +38 -51
- scrapling/core/_html_utils.py +3 -9
- scrapling/core/ai.py +5 -13
- scrapling/core/custom_types.py +19 -61
- scrapling/core/mixins.py +6 -28
- scrapling/core/shell.py +49 -127
- scrapling/core/storage.py +2 -8
- scrapling/core/translator.py +8 -20
- scrapling/core/utils/__init__.py +10 -0
- scrapling/core/utils/_shell.py +48 -0
- scrapling/core/{utils.py → utils/_utils.py} +5 -21
- scrapling/engines/__init__.py +0 -16
- scrapling/engines/_browsers/_base.py +297 -0
- scrapling/engines/_browsers/_camoufox.py +227 -296
- scrapling/engines/_browsers/_config_tools.py +2 -1
- scrapling/engines/_browsers/_controllers.py +209 -281
- scrapling/engines/_browsers/_page.py +37 -15
- scrapling/engines/_browsers/_validators.py +9 -15
- scrapling/engines/constants.py +3 -6
- scrapling/engines/static.py +25 -75
- scrapling/engines/toolbelt/__init__.py +1 -20
- scrapling/engines/toolbelt/convertor.py +95 -86
- scrapling/engines/toolbelt/custom.py +7 -99
- scrapling/engines/toolbelt/fingerprints.py +1 -3
- scrapling/engines/toolbelt/navigation.py +4 -58
- scrapling/fetchers.py +29 -24
- scrapling/parser.py +45 -122
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/METADATA +54 -46
- scrapling-0.3.2.dist-info/RECORD +44 -0
- scrapling-0.3.1.dist-info/RECORD +0 -41
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
scrapling/engines/_browsers/_camoufox.py

@@ -1,14 +1,8 @@
-from time import time, sleep
 from re import compile as re_compile
-from asyncio import sleep as asyncio_sleep, Lock
 
-from camoufox import DefaultAddons
-from camoufox.utils import launch_options as generate_launch_options
 from playwright.sync_api import (
     Response as SyncPlaywrightResponse,
     sync_playwright,
-    BrowserContext,
-    Playwright,
     Locator,
     Page,
 )
@@ -21,9 +15,9 @@ from playwright.async_api import (
     Page as async_Page,
 )
 
-from scrapling.core.utils import log
-from ._page import PageInfo, PagePool
 from ._validators import validate, CamoufoxConfig
+from ._base import SyncSession, AsyncSession, StealthySessionMixin
+from scrapling.core.utils import log
 from scrapling.core._types import (
     Dict,
     List,
@@ -31,19 +25,17 @@ from scrapling.core._types import (
     Callable,
     SelectorWaitStates,
 )
-from scrapling.engines.toolbelt import (
+from scrapling.engines.toolbelt.convertor import (
     Response,
     ResponseFactory,
-    async_intercept_route,
-    generate_convincing_referer,
-    get_os_name,
-    intercept_route,
 )
+from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
 
 __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
+_UNSET = object()
 
 
-class StealthySession:
+class StealthySession(StealthySessionMixin, SyncSession):
     """A Stealthy session manager with page pooling."""
 
     __slots__ = (
@@ -54,6 +46,7 @@ class StealthySession:
         "block_webrtc",
         "allow_webgl",
         "network_idle",
+        "load_dom",
         "humanize",
         "solve_cloudflare",
         "wait",
@@ -83,13 +76,14 @@ class StealthySession:
 
     def __init__(
         self,
-
+        __max_pages: int = 1,
         headless: bool = True,  # noqa: F821
         block_images: bool = False,
         disable_resources: bool = False,
         block_webrtc: bool = False,
         allow_webgl: bool = True,
         network_idle: bool = False,
+        load_dom: bool = True,
         humanize: bool | float = True,
         solve_cloudflare: bool = False,
         wait: int | float = 0,
@@ -124,11 +118,12 @@ class StealthySession:
         :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
         :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
         :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
-        :param page_action: Added for automation. A function that takes the `page` object
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
         :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
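The `page_action` parameter documented above receives the live Playwright `page` object and runs before the response is captured. A minimal sketch of such a callable, using the synchronous Playwright API; the selector below is a hypothetical placeholder, not something from this package:

    def scroll_and_click(page):
        # Example automation: scroll down, then click a hypothetical "load more" button
        page.mouse.wheel(0, 1000)
        page.click("#load-more", timeout=5_000)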
@@ -137,108 +132,51 @@ class StealthySession:
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
-        :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
         """
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        config = validate(params, CamoufoxConfig)
-
-        self.max_pages = config.max_pages
-        self.headless = config.headless
-        self.block_images = config.block_images
-        self.disable_resources = config.disable_resources
-        self.block_webrtc = config.block_webrtc
-        self.allow_webgl = config.allow_webgl
-        self.network_idle = config.network_idle
-        self.humanize = config.humanize
-        self.solve_cloudflare = config.solve_cloudflare
-        self.wait = config.wait
-        self.timeout = config.timeout
-        self.page_action = config.page_action
-        self.wait_selector = config.wait_selector
-        self.init_script = config.init_script
-        self.addons = config.addons
-        self.wait_selector_state = config.wait_selector_state
-        self.cookies = config.cookies
-        self.google_search = config.google_search
-        self.extra_headers = config.extra_headers
-        self.proxy = config.proxy
-        self.os_randomize = config.os_randomize
-        self.disable_ads = config.disable_ads
-        self.geoip = config.geoip
-        self.selector_config = config.selector_config
-        self.additional_args = config.additional_args
-
-        self.playwright: Optional[Playwright] = None
-        self.context: Optional[BrowserContext] = None
-        self.page_pool = PagePool(self.max_pages)
-        self._closed = False
-        self.selector_config = config.selector_config
-        self.page_action = config.page_action
-        self._headers_keys = (
-            set(map(str.lower, self.extra_headers.keys()))
-            if self.extra_headers
-            else set()
-        )
-        self.__initiate_browser_options__()
-
-    def __initiate_browser_options__(self):
-        """Initiate browser options."""
-        self.launch_options = generate_launch_options(
-            **{
-                "geoip": self.geoip,
-                "proxy": dict(self.proxy) if self.proxy else self.proxy,
-                "enable_cache": True,
-                "addons": self.addons,
-                "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
-                "headless": self.headless,
-                "humanize": True if self.solve_cloudflare else self.humanize,
-                "i_know_what_im_doing": True,  # To turn warnings off with the user configurations
-                "allow_webgl": self.allow_webgl,
-                "block_webrtc": self.block_webrtc,
-                "block_images": self.block_images,  # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
-                "os": None if self.os_randomize else get_os_name(),
-                "user_data_dir": "",
-                **self.additional_args,
-            }
+        self.__validate__(
+            wait=wait,
+            proxy=proxy,
+            geoip=geoip,
+            addons=addons,
+            timeout=timeout,
+            cookies=cookies,
+            headless=headless,
+            humanize=humanize,
+            load_dom=load_dom,
+            max_pages=__max_pages,
+            disable_ads=disable_ads,
+            allow_webgl=allow_webgl,
+            page_action=page_action,
+            init_script=init_script,
+            network_idle=network_idle,
+            block_images=block_images,
+            block_webrtc=block_webrtc,
+            os_randomize=os_randomize,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            additional_args=additional_args,
+            selector_config=selector_config,
+            solve_cloudflare=solve_cloudflare,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
         )
+        super().__init__(max_pages=self.max_pages)
 
     def __create__(self):
         """Create a browser for this instance and context."""
         self.playwright = sync_playwright().start()
-        self.context = (
-            self.
-            **self.launch_options
-        )
+        self.context = self.playwright.firefox.launch_persistent_context(  # pragma: no cover
+            **self.launch_options
         )
+
+        # Get the default page and close it
+        default_page = self.context.pages[0]
+        default_page.close()
 
         if self.init_script:  # pragma: no cover
             self.context.add_init_script(path=self.init_script)
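With this change the constructor no longer copies each argument by hand: everything goes through `__validate__` (provided by the new `_base.py`), `super().__init__` sets up the page pool, and `__create__` launches Camoufox's Firefox build via `launch_persistent_context`, closing the default page it opens. A hedged usage sketch, assuming `StealthySession` still behaves as a synchronous context manager; the import path is simply the module shown in this diff, not necessarily the intended public entry point:

    from scrapling.engines._browsers._camoufox import StealthySession

    with StealthySession(headless=True, solve_cloudflare=True, load_dom=True) as session:
        page = session.fetch("https://example.com")
        print(page.status)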
@@ -267,68 +205,6 @@ class StealthySession:
 
         self._closed = True
 
-    def _get_or_create_page(self) -> PageInfo:  # pragma: no cover
-        """Get an available page or create a new one"""
-        # Try to get a ready page first
-        page_info = self.page_pool.get_ready_page()
-        if page_info:
-            return page_info
-
-        # Create a new page if under limit
-        if self.page_pool.pages_count < self.max_pages:
-            page = self.context.new_page()
-            page.set_default_navigation_timeout(self.timeout)
-            page.set_default_timeout(self.timeout)
-            if self.extra_headers:
-                page.set_extra_http_headers(self.extra_headers)
-
-            if self.disable_resources:
-                page.route("**/*", intercept_route)
-
-            return self.page_pool.add_page(page)
-
-        # Wait for a page to become available
-        max_wait = 30
-        start_time = time()
-
-        while time() - start_time < max_wait:
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-            sleep(0.05)
-
-        raise TimeoutError("No pages available within timeout period")
-
-    @staticmethod
-    def _detect_cloudflare(page_content):
-        """
-        Detect the type of Cloudflare challenge present in the provided page content.
-
-        This function analyzes the given page content to identify whether a specific
-        type of Cloudflare challenge is present. It checks for three predefined
-        challenge types: non-interactive, managed, and interactive. If a challenge
-        type is detected, it returns the corresponding type as a string. If no
-        challenge type is detected, it returns None.
-
-        Args:
-            page_content (str): The content of the page to analyze for Cloudflare
-                challenge types.
-
-        Returns:
-            str: A string representing the detected Cloudflare challenge type, if
-                found. Returns None if no challenge matches.
-        """
-        challenge_types = (
-            "non-interactive",
-            "managed",
-            "interactive",
-        )
-        for ctype in challenge_types:
-            if f"cType: '{ctype}'" in page_content:
-                return ctype
-
-        return None
-
     def _solve_cloudflare(self, page: Page) -> None:  # pragma: no cover
         """Solve the cloudflare challenge displayed on the playwright page passed
 
@@ -375,20 +251,66 @@ class StealthySession:
         log.info("Cloudflare captcha is solved")
         return
 
-    def fetch(
+    def fetch(
+        self,
+        url: str,
+        google_search: bool = _UNSET,
+        timeout: int | float = _UNSET,
+        wait: int | float = _UNSET,
+        page_action: Optional[Callable] = _UNSET,
+        extra_headers: Optional[Dict[str, str]] = _UNSET,
+        disable_resources: bool = _UNSET,
+        wait_selector: Optional[str] = _UNSET,
+        wait_selector_state: SelectorWaitStates = _UNSET,
+        network_idle: bool = _UNSET,
+        load_dom: bool = _UNSET,
+        solve_cloudflare: bool = _UNSET,
+        selector_config: Optional[Dict] = _UNSET,
+    ) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
+        # Validate all resolved parameters
+        params = validate(
+            dict(
+                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
+                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
+                wait=self._get_with_precedence(wait, self.wait, _UNSET),
+                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
+                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
+                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
+                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
+                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
+                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
+                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
+                solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
+                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
+            ),
+            CamoufoxConfig,
+        )
+
         if self._closed:  # pragma: no cover
             raise RuntimeError("Context manager has been closed")
 
         final_response = None
         referer = (
-            generate_convincing_referer(url)
-            if (self.google_search and "referer" not in self._headers_keys)
-            else None
+            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )
 
         def handle_response(finished_response: SyncPlaywrightResponse):
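Every keyword of the new `fetch()` defaults to the module-level `_UNSET` sentinel, and `_get_with_precedence` (inherited from `_base.py`, whose body is not part of this diff) resolves each value before it is re-validated against `CamoufoxConfig`. A minimal sketch of the pattern; the helper body here is an assumption, only its call signature is visible above:

    _UNSET = object()  # sentinel: distinguishes "argument not passed" from falsy values such as None, 0, or False

    def _get_with_precedence(value, session_value, sentinel):
        # Assumed behavior: prefer the per-request argument when the caller supplied one
        return session_value if value is sentinel else value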
@@ -399,54 +321,57 @@ class StealthySession:
             ):
                 final_response = finished_response
 
-        page_info = self.
+        page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)
 
         try:  # pragma: no cover
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = page_info.page.goto(url, referer=referer)
-
+            if params.load_dom:
+                page_info.page.wait_for_load_state(state="domcontentloaded")
 
-            if
+            if params.network_idle:
                 page_info.page.wait_for_load_state("networkidle")
 
             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
 
-            if
+            if params.solve_cloudflare:
                 self._solve_cloudflare(page_info.page)
                 # Make sure the page is fully loaded after the captcha
                 page_info.page.wait_for_load_state(state="load")
-
-
+                if params.load_dom:
+                    page_info.page.wait_for_load_state(state="domcontentloaded")
+                if params.network_idle:
                     page_info.page.wait_for_load_state("networkidle")
 
-            if
+            if params.page_action:
                 try:
-
+                    _ = params.page_action(page_info.page)
                 except Exception as e:
                     log.error(f"Error executing page_action: {e}")
 
-            if
+            if params.wait_selector:
                 try:
-                    waiter: Locator = page_info.page.locator(
-                    waiter.first.wait_for(state=
+                    waiter: Locator = page_info.page.locator(params.wait_selector)
+                    waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
                     page_info.page.wait_for_load_state(state="load")
-
-
+                    if params.load_dom:
+                        page_info.page.wait_for_load_state(state="domcontentloaded")
+                    if params.network_idle:
                         page_info.page.wait_for_load_state("networkidle")
                 except Exception as e:
-                    log.error(f"Error waiting for selector {
+                    log.error(f"Error waiting for selector {params.wait_selector}: {e}")
 
-            page_info.page.wait_for_timeout(
+            page_info.page.wait_for_timeout(params.wait)
             response = ResponseFactory.from_playwright_response(
-                page_info.page, first_response, final_response,
+                page_info.page, first_response, final_response, params.selector_config
             )
 
-            # Mark the page as
-            page_info.
+            # Mark the page as finished for next use
+            page_info.mark_finished()
 
             return response
 
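Because each request-level keyword falls back to the session-level value, a single session can mix its defaults with per-call overrides. Continuing the earlier sketch (the URLs are placeholders):

    with StealthySession(timeout=30_000, network_idle=False) as session:
        home = session.fetch("https://example.com")  # session defaults apply
        app = session.fetch("https://example.com/app", network_idle=True, timeout=60_000)  # per-call overrides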
@@ -454,17 +379,8 @@ class StealthySession:
             page_info.mark_error()
             raise e
 
-    def get_pool_stats(self) -> Dict[str, int]:
-        """Get statistics about the current page pool"""
-        return {
-            "total_pages": self.page_pool.pages_count,
-            "ready_pages": self.page_pool.ready_count,
-            "busy_pages": self.page_pool.busy_count,
-            "max_pages": self.max_pages,
-        }
-
 
-class AsyncStealthySession(
+class AsyncStealthySession(StealthySessionMixin, AsyncSession):
     """A Stealthy session manager with page pooling."""
 
     def __init__(
@@ -476,6 +392,7 @@ class AsyncStealthySession(StealthySession):
         block_webrtc: bool = False,
         allow_webgl: bool = True,
         network_idle: bool = False,
+        load_dom: bool = True,
         humanize: bool | float = True,
         solve_cloudflare: bool = False,
         wait: int | float = 0,
@@ -510,11 +427,12 @@ class AsyncStealthySession(StealthySession):
         :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
         :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
         :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
-        :param page_action: Added for automation. A function that takes the `page` object
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
         :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
@@ -527,47 +445,47 @@ class AsyncStealthySession(StealthySession):
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            init_script,
-
-
-
-
-
-
-
-
-
-
-
+        self.__validate__(
+            wait=wait,
+            proxy=proxy,
+            geoip=geoip,
+            addons=addons,
+            timeout=timeout,
+            cookies=cookies,
+            headless=headless,
+            load_dom=load_dom,
+            humanize=humanize,
+            max_pages=max_pages,
+            disable_ads=disable_ads,
+            allow_webgl=allow_webgl,
+            page_action=page_action,
+            init_script=init_script,
+            network_idle=network_idle,
+            block_images=block_images,
+            block_webrtc=block_webrtc,
+            os_randomize=os_randomize,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            additional_args=additional_args,
+            selector_config=selector_config,
+            solve_cloudflare=solve_cloudflare,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
         )
-        self.
-        self.context: Optional[AsyncBrowserContext] = None
-        self._lock = Lock()
-        self.__enter__ = None
-        self.__exit__ = None
+        super().__init__(max_pages=self.max_pages)
 
     async def __create__(self):
         """Create a browser for this instance and context."""
         self.playwright: AsyncPlaywright = await async_playwright().start()
-        self.context: AsyncBrowserContext = (
-
-            **self.launch_options
-        )
+        self.context: AsyncBrowserContext = await self.playwright.firefox.launch_persistent_context(
+            **self.launch_options
         )
+
+        # Get the default page and close it
+        default_page = self.context.pages[0]
+        await default_page.close()
+
         if self.init_script:  # pragma: no cover
             await self.context.add_init_script(path=self.init_script)
 
@@ -596,39 +514,6 @@ class AsyncStealthySession(StealthySession):
 
         self._closed = True
 
-    async def _get_or_create_page(self) -> PageInfo:
-        """Get an available page or create a new one"""
-        async with self._lock:
-            # Try to get a ready page first
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-
-            # Create a new page if under limit
-            if self.page_pool.pages_count < self.max_pages:
-                page = await self.context.new_page()
-                page.set_default_navigation_timeout(self.timeout)
-                page.set_default_timeout(self.timeout)
-                if self.extra_headers:
-                    await page.set_extra_http_headers(self.extra_headers)
-
-                if self.disable_resources:
-                    await page.route("**/*", async_intercept_route)
-
-                return self.page_pool.add_page(page)
-
-            # Wait for a page to become available
-            max_wait = 30
-            start_time = time()
-
-            while time() - start_time < max_wait:  # pragma: no cover
-                page_info = self.page_pool.get_ready_page()
-                if page_info:
-                    return page_info
-                await asyncio_sleep(0.05)
-
-            raise TimeoutError("No pages available within timeout period")
-
     async def _solve_cloudflare(self, page: async_Page):
         """Solve the cloudflare challenge displayed on the playwright page passed. The async version
 
@@ -664,9 +549,7 @@ class AsyncStealthySession(StealthySession):
             await page.wait_for_timeout(500)
 
             # Calculate the Captcha coordinates for any viewport
-            outer_box = await page.locator(
-                ".main-content p+div>div>div"
-            ).bounding_box()
+            outer_box = await page.locator(".main-content p+div>div>div").bounding_box()
             captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
 
             # Move the mouse to the center of the window, then press and hold the left mouse button
@@ -677,20 +560,65 @@ class AsyncStealthySession(StealthySession):
         log.info("Cloudflare captcha is solved")
         return
 
-    async def fetch(
+    async def fetch(
+        self,
+        url: str,
+        google_search: bool = _UNSET,
+        timeout: int | float = _UNSET,
+        wait: int | float = _UNSET,
+        page_action: Optional[Callable] = _UNSET,
+        extra_headers: Optional[Dict[str, str]] = _UNSET,
+        disable_resources: bool = _UNSET,
+        wait_selector: Optional[str] = _UNSET,
+        wait_selector_state: SelectorWaitStates = _UNSET,
+        network_idle: bool = _UNSET,
+        load_dom: bool = _UNSET,
+        solve_cloudflare: bool = _UNSET,
+        selector_config: Optional[Dict] = _UNSET,
+    ) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
+        params = validate(
+            dict(
+                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
+                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
+                wait=self._get_with_precedence(wait, self.wait, _UNSET),
+                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
+                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
+                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
+                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
+                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
+                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
+                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
+                solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
+                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
+            ),
+            CamoufoxConfig,
+        )
+
         if self._closed:  # pragma: no cover
             raise RuntimeError("Context manager has been closed")
 
         final_response = None
         referer = (
-            generate_convincing_referer(url)
-            if (self.google_search and "referer" not in self._headers_keys)
-            else None
+            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )
 
         async def handle_response(finished_response: AsyncPlaywrightResponse):
@@ -701,56 +629,59 @@ class AsyncStealthySession(StealthySession):
             ):
                 final_response = finished_response
 
-        page_info = await self.
+        page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)
 
         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = await page_info.page.goto(url, referer=referer)
-
+            if params.load_dom:
+                await page_info.page.wait_for_load_state(state="domcontentloaded")
 
-            if
+            if params.network_idle:
                 await page_info.page.wait_for_load_state("networkidle")
 
             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
 
-            if
+            if params.solve_cloudflare:
                 await self._solve_cloudflare(page_info.page)
                 # Make sure the page is fully loaded after the captcha
                 await page_info.page.wait_for_load_state(state="load")
-
-
+                if params.load_dom:
+                    await page_info.page.wait_for_load_state(state="domcontentloaded")
+                if params.network_idle:
                     await page_info.page.wait_for_load_state("networkidle")
 
-            if
+            if params.page_action:
                 try:
-
+                    _ = await params.page_action(page_info.page)
                 except Exception as e:
                     log.error(f"Error executing page_action: {e}")
 
-            if
+            if params.wait_selector:
                 try:
-                    waiter: AsyncLocator = page_info.page.locator(
-                    await waiter.first.wait_for(state=
+                    waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
+                    await waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
                     await page_info.page.wait_for_load_state(state="load")
-
-
+                    if params.load_dom:
+                        await page_info.page.wait_for_load_state(state="domcontentloaded")
+                    if params.network_idle:
                         await page_info.page.wait_for_load_state("networkidle")
                 except Exception as e:
-                    log.error(f"Error waiting for selector {
+                    log.error(f"Error waiting for selector {params.wait_selector}: {e}")
 
-            await page_info.page.wait_for_timeout(
+            await page_info.page.wait_for_timeout(params.wait)
 
             # Create response object
             response = await ResponseFactory.from_async_playwright_response(
-                page_info.page, first_response, final_response,
+                page_info.page, first_response, final_response, params.selector_config
             )
 
-            # Mark the page as
-            page_info.
+            # Mark the page as finished for next use
+            page_info.mark_finished()
 
             return response
 
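The async class mirrors the same flow on top of `AsyncSession`, awaiting every Playwright call. A hedged sketch of driving it, assuming it is used as an async context manager like its sync counterpart; note that, unlike the sync class, its constructor exposes `max_pages` directly:

    import asyncio
    from scrapling.engines._browsers._camoufox import AsyncStealthySession

    async def main():
        async with AsyncStealthySession(max_pages=2, load_dom=True) as session:
            page = await session.fetch("https://example.com", solve_cloudflare=False)
            print(page.status)

    asyncio.run(main())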