PyPI - scrapling - Versions diffs - 0.3__py3-none-any.whl → 0.3.2__py3-none-any.whl - Mend

scrapling-0.3-py3-none-any.whl → scrapling-0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

scrapling/__init__.py +1 -1
scrapling/cli.py +38 -51
scrapling/core/_html_utils.py +3 -9
scrapling/core/ai.py +5 -13
scrapling/core/custom_types.py +19 -61
scrapling/core/mixins.py +6 -28
scrapling/core/shell.py +51 -129
scrapling/core/storage.py +2 -8
scrapling/core/translator.py +8 -20
scrapling/core/utils/__init__.py +10 -0
scrapling/core/utils/_shell.py +48 -0
scrapling/core/{utils.py → utils/_utils.py} +5 -21
scrapling/engines/__init__.py +0 -16
scrapling/engines/_browsers/_base.py +297 -0
scrapling/engines/_browsers/_camoufox.py +238 -293
scrapling/engines/_browsers/_config_tools.py +2 -1
scrapling/engines/_browsers/_controllers.py +220 -278
scrapling/engines/_browsers/_page.py +37 -15
scrapling/engines/_browsers/_validators.py +29 -15
scrapling/engines/constants.py +3 -6
scrapling/engines/static.py +25 -75
scrapling/engines/toolbelt/__init__.py +1 -20
scrapling/engines/toolbelt/convertor.py +95 -86
scrapling/engines/toolbelt/custom.py +7 -99
scrapling/engines/toolbelt/fingerprints.py +1 -3
scrapling/engines/toolbelt/navigation.py +4 -58
scrapling/fetchers.py +41 -24
scrapling/parser.py +45 -122
{scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/METADATA +57 -47
scrapling-0.3.2.dist-info/RECORD +44 -0
scrapling-0.3.dist-info/RECORD +0 -41
{scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
{scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
{scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
{scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0

scrapling/engines/_browsers/_controllers.py CHANGED

@@ -1,10 +1,6 @@
-from time import time, sleep
-from asyncio import sleep as asyncio_sleep, Lock
 from playwright.sync_api import (
     Response as SyncPlaywrightResponse,
     sync_playwright,
-    BrowserContext,
     Playwright,
     Locator,
 )
@@ -21,9 +17,8 @@ from rebrowser_playwright.async_api import (
 )
 from scrapling.core.utils import log
-from ._page import PageInfo, PagePool
+from ._base import SyncSession, AsyncSession, DynamicSessionMixin
 from ._validators import validate, PlaywrightConfig
-from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
 from scrapling.core._types import (
     Dict,
     List,
@@ -31,16 +26,16 @@ from scrapling.core._types import (
     Callable,
     SelectorWaitStates,
 )
-from scrapling.engines.toolbelt import (
+from scrapling.engines.toolbelt.convertor import (
     Response,
     ResponseFactory,
-    generate_convincing_referer,
-    intercept_route,
-    async_intercept_route,
 )
+from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
+_UNSET = object()
-class DynamicSession:
+class DynamicSession(DynamicSessionMixin, SyncSession):
     """A Browser session manager with page pooling."""
     __slots__ = (
@@ -59,7 +54,9 @@ class DynamicSession:
         "cookies",
         "disable_resources",
         "network_idle",
+        "load_dom",
         "wait_selector",
+        "init_script",
         "wait_selector_state",
         "wait",
         "playwright",
@@ -94,8 +91,10 @@ class DynamicSession:
         timeout: int | float = 30000,
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
+        init_script: Optional[str] = None,
         cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
+        load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
         selector_config: Optional[Dict] = None,
     ):
@@ -110,120 +109,48 @@ class DynamicSession:
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
         :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
-        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
         :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
         :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         """
-        params = {
-            "max_pages": __max_pages,
-            "headless": headless,
-            "google_search": google_search,
-            "hide_canvas": hide_canvas,
-            "disable_webgl": disable_webgl,
-            "real_chrome": real_chrome,
-            "stealth": stealth,
-            "wait": wait,
-            "page_action": page_action,
-            "proxy": proxy,
-            "locale": locale,
-            "extra_headers": extra_headers,
-            "useragent": useragent,
-            "timeout": timeout,
-            "selector_config": selector_config,
-            "disable_resources": disable_resources,
-            "wait_selector": wait_selector,
-            "cookies": cookies,
-            "network_idle": network_idle,
-            "wait_selector_state": wait_selector_state,
-            "cdp_url": cdp_url,
-        }
-        config = validate(params, PlaywrightConfig)
-        self.max_pages = config.max_pages
-        self.headless = config.headless
-        self.hide_canvas = config.hide_canvas
-        self.disable_webgl = config.disable_webgl
-        self.real_chrome = config.real_chrome
-        self.stealth = config.stealth
-        self.google_search = config.google_search
-        self.wait = config.wait
-        self.proxy = config.proxy
-        self.locale = config.locale
-        self.extra_headers = config.extra_headers
-        self.useragent = config.useragent
-        self.timeout = config.timeout
-        self.cookies = config.cookies
-        self.disable_resources = config.disable_resources
-        self.cdp_url = config.cdp_url
-        self.network_idle = config.network_idle
-        self.wait_selector = config.wait_selector
-        self.wait_selector_state = config.wait_selector_state
-        self.playwright: Optional[Playwright] = None
-        self.context: Optional[BrowserContext] = None
-        self.page_pool = PagePool(self.max_pages)
-        self._closed = False
-        self.selector_config = config.selector_config
-        self.page_action = config.page_action
-        self._headers_keys = (
-            set(map(str.lower, self.extra_headers.keys()))
-            if self.extra_headers
-            else set()
+        self.__validate__(
+            wait=wait,
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            cookies=cookies,
+            load_dom=load_dom,
+            headless=headless,
+            useragent=useragent,
+            max_pages=__max_pages,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            init_script=init_script,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            selector_config=selector_config,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
         )
-        self.__initiate_browser_options__()
-    def __initiate_browser_options__(self):
-        if not self.cdp_url:
-            # `launch_options` is used with persistent context
-            self.launch_options = dict(
-                _launch_kwargs(
-                    self.headless,
-                    self.proxy,
-                    self.locale,
-                    tuple(self.extra_headers.items())
-                    if self.extra_headers
-                    else tuple(),
-                    self.useragent,
-                    self.real_chrome,
-                    self.stealth,
-                    self.hide_canvas,
-                    self.disable_webgl,
-                )
-            )
-            self.launch_options["extra_http_headers"] = dict(
-                self.launch_options["extra_http_headers"]
-            )
-            self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
-            self.context_options = dict()
-        else:
-            # while `context_options` is left to be used when cdp mode is enabled
-            self.launch_options = dict()
-            self.context_options = dict(
-                _context_kwargs(
-                    self.proxy,
-                    self.locale,
-                    tuple(self.extra_headers.items())
-                    if self.extra_headers
-                    else tuple(),
-                    self.useragent,
-                    self.stealth,
-                )
-            )
-            self.context_options["extra_http_headers"] = dict(
-                self.context_options["extra_http_headers"]
-            )
-            self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
+        super().__init__(max_pages=self.max_pages)
     def __create__(self):
         """Create a browser for this instance and context."""
@@ -232,16 +159,21 @@ class DynamicSession:
             # Because rebrowser_playwright doesn't play well with real browsers
             sync_context = sync_playwright
-        self.playwright = sync_context().start()
+        self.playwright: Playwright = sync_context().start()
         if self.cdp_url:  # pragma: no cover
-            self.context = self.playwright.chromium.connect_over_cdp(
-                endpoint_url=self.cdp_url
-            ).new_context(**self.context_options)
-        else:
-            self.context = self.playwright.chromium.launch_persistent_context(
-                user_data_dir="", **self.launch_options
+            self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
+                **self.context_options
             )
+        else:
+            self.context = self.playwright.chromium.launch_persistent_context(user_data_dir="", **self.launch_options)
+        # Get the default page and close it
+        default_page = self.context.pages[0]
+        default_page.close()
+        if self.init_script:  # pragma: no cover
+            self.context.add_init_script(path=self.init_script)
         if self.cookies:  # pragma: no cover
             self.context.add_cookies(self.cookies)
@@ -268,56 +200,63 @@ class DynamicSession:
         self._closed = True
-    def _get_or_create_page(self) -> PageInfo:  # pragma: no cover
-        """Get an available page or create a new one"""
-        # Try to get a ready page first
-        page_info = self.page_pool.get_ready_page()
-        if page_info:
-            return page_info
-        # Create a new page if under limit
-        if self.page_pool.pages_count < self.max_pages:
-            page = self.context.new_page()
-            page.set_default_navigation_timeout(self.timeout)
-            page.set_default_timeout(self.timeout)
-            if self.extra_headers:
-                page.set_extra_http_headers(self.extra_headers)
-            if self.disable_resources:
-                page.route("**/*", intercept_route)
-            if self.stealth:
-                for script in _compiled_stealth_scripts():
-                    page.add_init_script(script=script)
-            return self.page_pool.add_page(page)
-        # Wait for a page to become available
-        max_wait = 30
-        start_time = time()
-        while time() - start_time < max_wait:
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-            sleep(0.05)
-        raise TimeoutError("No pages available within timeout period")
-    def fetch(self, url: str) -> Response:
+    def fetch(
+        self,
+        url: str,
+        google_search: bool = _UNSET,
+        timeout: int | float = _UNSET,
+        wait: int | float = _UNSET,
+        page_action: Optional[Callable] = _UNSET,
+        extra_headers: Optional[Dict[str, str]] = _UNSET,
+        disable_resources: bool = _UNSET,
+        wait_selector: Optional[str] = _UNSET,
+        wait_selector_state: SelectorWaitStates = _UNSET,
+        network_idle: bool = _UNSET,
+        load_dom: bool = _UNSET,
+        selector_config: Optional[Dict] = _UNSET,
+    ) -> Response:
         """Opens up the browser and do your request based on your chosen options.
         :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
+        # Validate all resolved parameters
+        params = validate(
+            dict(
+                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
+                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
+                wait=self._get_with_precedence(wait, self.wait, _UNSET),
+                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
+                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
+                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
+                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
+                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
+                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
+                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
+                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
+            ),
+            PlaywrightConfig,
+        )
         if self._closed:  # pragma: no cover
             raise RuntimeError("Context manager has been closed")
         final_response = None
         referer = (
-            generate_convincing_referer(url)
-            if (self.google_search and "referer" not in self._headers_keys)
-            else None
+            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )
         def handle_response(finished_response: SyncPlaywrightResponse):
@@ -328,48 +267,50 @@ class DynamicSession:
             ):
                 final_response = finished_response
-        page_info = self._get_or_create_page()
+        page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)
         try:  # pragma: no cover
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = page_info.page.goto(url, referer=referer)
-            page_info.page.wait_for_load_state(state="domcontentloaded")
+            if params.load_dom:
+                page_info.page.wait_for_load_state(state="domcontentloaded")
-            if self.network_idle:
+            if params.network_idle:
                 page_info.page.wait_for_load_state("networkidle")
             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
-            if self.page_action is not None:
+            if params.page_action:
                 try:
-                    page_info.page = self.page_action(page_info.page)
+                    _ = params.page_action(page_info.page)
                 except Exception as e:  # pragma: no cover
                     log.error(f"Error executing page_action: {e}")
-            if self.wait_selector:
+            if params.wait_selector:
                 try:
-                    waiter: Locator = page_info.page.locator(self.wait_selector)
-                    waiter.first.wait_for(state=self.wait_selector_state)
+                    waiter: Locator = page_info.page.locator(params.wait_selector)
+                    waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
                     page_info.page.wait_for_load_state(state="load")
-                    page_info.page.wait_for_load_state(state="domcontentloaded")
-                    if self.network_idle:
+                    if params.load_dom:
+                        page_info.page.wait_for_load_state(state="domcontentloaded")
+                    if params.network_idle:
                         page_info.page.wait_for_load_state("networkidle")
                 except Exception as e:  # pragma: no cover
-                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")
+                    log.error(f"Error waiting for selector {params.wait_selector}: {e}")
-            page_info.page.wait_for_timeout(self.wait)
+            page_info.page.wait_for_timeout(params.wait)
             # Create response object
             response = ResponseFactory.from_playwright_response(
-                page_info.page, first_response, final_response, self.selector_config
+                page_info.page, first_response, final_response, params.selector_config
             )
-            # Mark the page as ready for next use
-            page_info.mark_ready()
+            # Mark the page as finished for next use
+            page_info.mark_finished()
             return response
@@ -377,17 +318,8 @@ class DynamicSession:
             page_info.mark_error()
             raise e
-    def get_pool_stats(self) -> Dict[str, int]:
-        """Get statistics about the current page pool"""
-        return {
-            "total_pages": self.page_pool.pages_count,
-            "ready_pages": self.page_pool.ready_count,
-            "busy_pages": self.page_pool.busy_count,
-            "max_pages": self.max_pages,
-        }
-class AsyncDynamicSession(DynamicSession):
+class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
     """An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""
     def __init__(
@@ -409,8 +341,10 @@ class AsyncDynamicSession(DynamicSession):
         timeout: int | float = 30000,
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
+        init_script: Optional[str] = None,
         cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
+        load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
         selector_config: Optional[Dict] = None,
     ):
@@ -423,10 +357,12 @@ class AsyncDynamicSession(DynamicSession):
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
         :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
-        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
         :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
@@ -441,35 +377,32 @@ class AsyncDynamicSession(DynamicSession):
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         """
-        super().__init__(
-            max_pages,
-            headless,
-            google_search,
-            hide_canvas,
-            disable_webgl,
-            real_chrome,
-            stealth,
-            wait,
-            page_action,
-            proxy,
-            locale,
-            extra_headers,
-            useragent,
-            cdp_url,
-            timeout,
-            disable_resources,
-            wait_selector,
-            cookies,
-            network_idle,
-            wait_selector_state,
-            selector_config,
+        self.__validate__(
+            wait=wait,
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            cookies=cookies,
+            load_dom=load_dom,
+            headless=headless,
+            useragent=useragent,
+            max_pages=max_pages,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            init_script=init_script,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            selector_config=selector_config,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
         )
-        self.playwright: Optional[AsyncPlaywright] = None
-        self.context: Optional[AsyncBrowserContext] = None
-        self._lock = Lock()
-        self.__enter__ = None
-        self.__exit__ = None
+        super().__init__(max_pages=self.max_pages)
     async def __create__(self):
         """Create a browser for this instance and context."""
@@ -481,19 +414,20 @@ class AsyncDynamicSession(DynamicSession):
         self.playwright: AsyncPlaywright = await async_context().start()
         if self.cdp_url:
-            browser = await self.playwright.chromium.connect_over_cdp(
-                endpoint_url=self.cdp_url
-            )
-            self.context: AsyncBrowserContext = await browser.new_context(
-                **self.context_options
-            )
+            browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
+            self.context: AsyncBrowserContext = await browser.new_context(**self.context_options)
         else:
-            self.context: AsyncBrowserContext = (
-                await self.playwright.chromium.launch_persistent_context(
-                    user_data_dir="", **self.launch_options
-                )
+            self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
+                user_data_dir="", **self.launch_options
             )
+        # Get the default page and close it
+        default_page = self.context.pages[0]
+        await default_page.close()
+        if self.init_script:  # pragma: no cover
+            await self.context.add_init_script(path=self.init_script)
         if self.cookies:
             await self.context.add_cookies(self.cookies)
@@ -519,57 +453,63 @@ class AsyncDynamicSession(DynamicSession):
         self._closed = True
-    async def _get_or_create_page(self) -> PageInfo:
-        """Get an available page or create a new one"""
-        async with self._lock:
-            # Try to get a ready page first
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-            # Create a new page if under limit
-            if self.page_pool.pages_count < self.max_pages:
-                page = await self.context.new_page()
-                page.set_default_navigation_timeout(self.timeout)
-                page.set_default_timeout(self.timeout)
-                if self.extra_headers:
-                    await page.set_extra_http_headers(self.extra_headers)
-                if self.disable_resources:
-                    await page.route("**/*", async_intercept_route)
-                if self.stealth:
-                    for script in _compiled_stealth_scripts():
-                        await page.add_init_script(script=script)
-                return self.page_pool.add_page(page)
-        # Wait for a page to become available
-        max_wait = 30  # seconds
-        start_time = time()
-        while time() - start_time < max_wait:  # pragma: no cover
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-            await asyncio_sleep(0.05)
-        raise TimeoutError("No pages available within timeout period")
-    async def fetch(self, url: str) -> Response:
+    async def fetch(
+        self,
+        url: str,
+        google_search: bool = _UNSET,
+        timeout: int | float = _UNSET,
+        wait: int | float = _UNSET,
+        page_action: Optional[Callable] = _UNSET,
+        extra_headers: Optional[Dict[str, str]] = _UNSET,
+        disable_resources: bool = _UNSET,
+        wait_selector: Optional[str] = _UNSET,
+        wait_selector_state: SelectorWaitStates = _UNSET,
+        network_idle: bool = _UNSET,
+        load_dom: bool = _UNSET,
+        selector_config: Optional[Dict] = _UNSET,
+    ) -> Response:
         """Opens up the browser and do your request based on your chosen options.
         :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
+        # Validate all resolved parameters
+        params = validate(
+            dict(
+                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
+                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
+                wait=self._get_with_precedence(wait, self.wait, _UNSET),
+                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
+                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
+                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
+                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
+                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
+                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
+                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
+                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
+            ),
+            PlaywrightConfig,
+        )
         if self._closed:  # pragma: no cover
             raise RuntimeError("Context manager has been closed")
         final_response = None
         referer = (
-            generate_convincing_referer(url)
-            if (self.google_search and "referer" not in self._headers_keys)
-            else None
+            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )
         async def handle_response(finished_response: AsyncPlaywrightResponse):
@@ -580,48 +520,50 @@ class AsyncDynamicSession(DynamicSession):
             ):
                 final_response = finished_response
-        page_info = await self._get_or_create_page()
+        page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)
         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = await page_info.page.goto(url, referer=referer)
-            await page_info.page.wait_for_load_state(state="domcontentloaded")
+            if self.load_dom:
+                await page_info.page.wait_for_load_state(state="domcontentloaded")
-            if self.network_idle:
+            if params.network_idle:
                 await page_info.page.wait_for_load_state("networkidle")
             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
-            if self.page_action is not None:
+            if params.page_action:
                 try:
-                    page_info.page = await self.page_action(page_info.page)
+                    _ = await params.page_action(page_info.page)
                 except Exception as e:
                     log.error(f"Error executing page_action: {e}")
-            if self.wait_selector:
+            if params.wait_selector:
                 try:
-                    waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
-                    await waiter.first.wait_for(state=self.wait_selector_state)
+                    waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
+                    await waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
                     await page_info.page.wait_for_load_state(state="load")
-                    await page_info.page.wait_for_load_state(state="domcontentloaded")
-                    if self.network_idle:
+                    if self.load_dom:
+                        await page_info.page.wait_for_load_state(state="domcontentloaded")
+                    if params.network_idle:
                         await page_info.page.wait_for_load_state("networkidle")
                 except Exception as e:
-                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")
+                    log.error(f"Error waiting for selector {params.wait_selector}: {e}")
-            await page_info.page.wait_for_timeout(self.wait)
+            await page_info.page.wait_for_timeout(params.wait)
             # Create response object
             response = await ResponseFactory.from_async_playwright_response(
-                page_info.page, first_response, final_response, self.selector_config
+                page_info.page, first_response, final_response, params.selector_config
             )
-            # Mark the page as ready for next use
-            page_info.mark_ready()
+            # Mark the page as finished for next use
+            page_info.mark_finished()
             return response

scrapling 0.3__py3-none-any.whl → 0.3.2__py3-none-any.whl

scrapling 0.3 py3-none-any.whl → 0.3.2 py3-none-any.whl