scrapling 0.2.96__py3-none-any.whl → 0.2.97__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/custom_types.py +1 -3
- scrapling/core/storage_adaptors.py +3 -3
- scrapling/core/translator.py +1 -1
- scrapling/core/utils.py +1 -1
- scrapling/engines/camo.py +123 -104
- scrapling/engines/pw.py +100 -75
- scrapling/engines/static.py +22 -42
- scrapling/engines/toolbelt/custom.py +2 -2
- scrapling/engines/toolbelt/fingerprints.py +2 -2
- scrapling/engines/toolbelt/navigation.py +1 -1
- scrapling/fetchers.py +24 -24
- scrapling/parser.py +1 -1
- {scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/METADATA +17 -16
- {scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/RECORD +19 -19
- {scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/WHEEL +1 -1
- {scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/LICENSE +0 -0
- {scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
from scrapling.parser import Adaptor, Adaptors

__author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.96"
+__version__ = "0.2.97"
__copyright__ = "Copyright (c) 2024 Karim Shoair"

scrapling/core/custom_types.py
CHANGED
@@ -19,9 +19,7 @@ class TextHandler(str):
    __slots__ = ()

    def __new__(cls, string):
-
-        return super().__new__(cls, string)
-        return super().__new__(cls, '')
+        return super().__new__(cls, str(string))

    def __getitem__(self, key: Union[SupportsIndex, slice]) -> "TextHandler":
        lst = super().__getitem__(key)
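Note: the `__new__` change above makes `TextHandler` coerce whatever it is given through `str()` instead of taking the removed fallback path that returned an empty string. A minimal, self-contained sketch of the new behavior (an illustrative stand-in class, not the library's full `TextHandler`):

```python
class CoercingStr(str):
    """Illustrative stand-in: mirrors the 0.2.97 TextHandler.__new__ coercion."""
    __slots__ = ()

    def __new__(cls, string):
        # Any input is funneled through str(), so non-string values no longer
        # need a separate empty-string fallback branch.
        return super().__new__(cls, str(string))


print(CoercingStr("hello"))  # hello
print(CoercingStr(123))      # 123  (coerced instead of falling back to '')
```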
scrapling/core/storage_adaptors.py
CHANGED
@@ -19,7 +19,7 @@ class StorageSystemMixin(ABC):
        """
        self.url = url

-    @lru_cache(
+    @lru_cache(126, typed=True)
    def _get_base_url(self, default_value: str = 'default') -> str:
        if not self.url or type(self.url) is not str:
            return default_value
@@ -51,7 +51,7 @@ class StorageSystemMixin(ABC):
        raise NotImplementedError('Storage system must implement `save` method')

    @staticmethod
-    @lru_cache(
+    @lru_cache(256, typed=True)
    def _get_hash(identifier: str) -> str:
        """If you want to hash identifier in your storage system, use this safer"""
        identifier = identifier.lower().strip()
@@ -63,7 +63,7 @@ class StorageSystemMixin(ABC):
        return f"{hash_value}_{len(identifier)}"  # Length to reduce collision chance


-@lru_cache(
+@lru_cache(10, typed=True)
class SQLiteStorageSystem(StorageSystemMixin):
    """The recommended system to use, it's race condition safe and thread safe.
    Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
scrapling/core/translator.py
CHANGED
@@ -139,6 +139,6 @@ class TranslatorMixin:


class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-    @lru_cache(maxsize=
+    @lru_cache(maxsize=256)
    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
        return super().css_to_xpath(css, prefix)
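Note: a change repeated throughout this release is giving each `lru_cache` decorator an explicit size (10, 126, 128, or 256, sometimes with `typed=True`) instead of the previous call form, which bounds how many cached entries can accumulate. A short sketch of what a bounded, typed cache does, using a hypothetical function rather than scrapling's own:

```python
from functools import lru_cache


@lru_cache(maxsize=256, typed=True)
def translate(selector: str) -> str:
    # Stand-in for an expensive, repeatable computation such as CSS-to-XPath translation
    return f"descendant-or-self::{selector}"


translate("div.article")
translate("div.article")       # second call is served from the cache
print(translate.cache_info())  # e.g. CacheInfo(hits=1, misses=1, maxsize=256, currsize=1)
```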
scrapling/core/utils.py
CHANGED
scrapling/engines/camo.py
CHANGED
@@ -15,12 +15,12 @@ from scrapling.engines.toolbelt import (Response, StatusText,

class CamoufoxEngine:
    def __init__(
-            self, headless:
-            block_webrtc:
+            self, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, humanize: Union[bool, float] = True,
            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
-            wait_selector_state:
-            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize:
-            geoip:
+            wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False,
+            geoip: bool = False,
            adaptor_arguments: Dict = None,
    ):
        """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
@@ -64,107 +64,140 @@ class CamoufoxEngine:
        self.addons = addons or []
        self.humanize = humanize
        self.timeout = check_type_validity(timeout, [int, float], 30000)
+
+        # Page action callable validation
+        self.page_action = None
        if page_action is not None:
            if callable(page_action):
                self.page_action = page_action
            else:
-                self.page_action = None
                log.error('[Ignored] Argument "page_action" must be callable')
-        else:
-            self.page_action = None

        self.wait_selector = wait_selector
        self.wait_selector_state = wait_selector_state
        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}

+    def _get_camoufox_options(self):
+        """Return consistent browser options dictionary for both sync and async methods"""
+        return {
+            "geoip": self.geoip,
+            "proxy": self.proxy,
+            "enable_cache": True,
+            "addons": self.addons,
+            "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
+            "headless": self.headless,
+            "humanize": self.humanize,
+            "i_know_what_im_doing": True,  # To turn warnings off with the user configurations
+            "allow_webgl": self.allow_webgl,
+            "block_webrtc": self.block_webrtc,
+            "block_images": self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
+            "os": None if self.os_randomize else get_os_name(),
+        }
+
+    def _process_response_history(self, first_response):
+        """Process response history to build a list of Response objects"""
+        history = []
+        current_request = first_response.request.redirected_from
+
+        try:
+            while current_request:
+                try:
+                    current_response = current_request.response()
+                    history.insert(0, Response(
+                        url=current_request.url,
+                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                        text='',
+                        body=b'',
+                        status=current_response.status if current_response else 301,
+                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+                        cookies={},
+                        headers=current_response.all_headers() if current_response else {},
+                        request_headers=current_request.all_headers(),
+                        **self.adaptor_arguments
+                    ))
+                except Exception as e:
+                    log.error(f"Error processing redirect: {e}")
+                    break
+
+                current_request = current_request.redirected_from
+        except Exception as e:
+            log.error(f"Error processing response history: {e}")
+
+        return history
+
    def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: Target url.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-        addons = [] if self.disable_ads else [DefaultAddons.UBO]
-        # Store the final response
        final_response = None
+        referer = generate_convincing_referer(url) if self.google_search else None

        def handle_response(finished_response):
            nonlocal final_response
            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                final_response = finished_response

-        with Camoufox(
-
-
-            enable_cache=True,
-            addons=self.addons,
-            exclude_addons=addons,
-            headless=self.headless,
-            humanize=self.humanize,
-            i_know_what_im_doing=True,  # To turn warnings off with the user configurations
-            allow_webgl=self.allow_webgl,
-            block_webrtc=self.block_webrtc,
-            block_images=self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
-            os=None if self.os_randomize else get_os_name(),
-        ) as browser:
-            page = browser.new_page()
+        with Camoufox(**self._get_camoufox_options()) as browser:
+            context = browser.new_context()
+            page = context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
-            # Listen for all responses
            page.on("response", handle_response)
+
            if self.disable_resources:
                page.route("**/*", intercept_route)

            if self.extra_headers:
                page.set_extra_http_headers(self.extra_headers)

-            first_response = page.goto(url, referer=
+            first_response = page.goto(url, referer=referer)
            page.wait_for_load_state(state="domcontentloaded")
+
            if self.network_idle:
                page.wait_for_load_state('networkidle')

            if self.page_action is not None:
-
+                try:
+                    page = self.page_action(page)
+                except Exception as e:
+                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector and type(self.wait_selector) is str:
-
-
-
-
-
-
-
+                try:
+                    waiter = page.locator(self.wait_selector)
+                    waiter.first.wait_for(state=self.wait_selector_state)
+                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                    page.wait_for_load_state(state="load")
+                    page.wait_for_load_state(state="domcontentloaded")
+                    if self.network_idle:
+                        page.wait_for_load_state('networkidle')
+                except Exception as e:
+                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            # In case we didn't catch a document type somehow
            final_response = final_response if final_response else first_response
+            if not final_response:
+                raise ValueError("Failed to get a response from the page")
+
            # This will be parsed inside `Response`
            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
            # PlayWright API sometimes give empty status text for some reason!
            status_text = final_response.status_text or StatusText.get(final_response.status)

-            history =
-
-
-
-
-
-                        url=current_request.url,
-                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
-                        text='',
-                        body=b'',
-                        status=current_response.status if current_response else 301,
-                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
-                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
-                        cookies={},
-                        headers=current_response.all_headers() if current_response else {},
-                        request_headers=current_request.all_headers(),
-                        **self.adaptor_arguments
-                    ))
-                current_request = current_request.redirected_from
+            history = self._process_response_history(first_response)
+            try:
+                page_content = page.content()
+            except Exception as e:
+                log.error(f"Error getting page content: {e}")
+                page_content = ""

            response = Response(
                url=page.url,
-                text=
-                body=
+                text=page_content,
+                body=page_content.encode('utf-8'),
                status=final_response.status,
                reason=status_text,
                encoding=encoding,
@@ -175,6 +208,7 @@ class CamoufoxEngine:
                **self.adaptor_arguments
            )
            page.close()
+            context.close()

            return response

@@ -184,88 +218,72 @@ class CamoufoxEngine:
        :param url: Target url.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-        addons = [] if self.disable_ads else [DefaultAddons.UBO]
-        # Store the final response
        final_response = None
+        referer = generate_convincing_referer(url) if self.google_search else None

        async def handle_response(finished_response):
            nonlocal final_response
            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                final_response = finished_response

-        async with AsyncCamoufox(
-
-
-            enable_cache=True,
-            addons=self.addons,
-            exclude_addons=addons,
-            headless=self.headless,
-            humanize=self.humanize,
-            i_know_what_im_doing=True,  # To turn warnings off with the user configurations
-            allow_webgl=self.allow_webgl,
-            block_webrtc=self.block_webrtc,
-            block_images=self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
-            os=None if self.os_randomize else get_os_name(),
-        ) as browser:
-            page = await browser.new_page()
+        async with AsyncCamoufox(**self._get_camoufox_options()) as browser:
+            context = await browser.new_context()
+            page = await context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
-            # Listen for all responses
            page.on("response", handle_response)
+
            if self.disable_resources:
                await page.route("**/*", async_intercept_route)

            if self.extra_headers:
                await page.set_extra_http_headers(self.extra_headers)

-            first_response = await page.goto(url, referer=
+            first_response = await page.goto(url, referer=referer)
            await page.wait_for_load_state(state="domcontentloaded")
+
            if self.network_idle:
                await page.wait_for_load_state('networkidle')

            if self.page_action is not None:
-
+                try:
+                    page = await self.page_action(page)
+                except Exception as e:
+                    log.error(f"Error executing async page_action: {e}")

            if self.wait_selector and type(self.wait_selector) is str:
-
-
-
-
-
-
-
+                try:
+                    waiter = page.locator(self.wait_selector)
+                    await waiter.first.wait_for(state=self.wait_selector_state)
+                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                    await page.wait_for_load_state(state="load")
+                    await page.wait_for_load_state(state="domcontentloaded")
+                    if self.network_idle:
+                        await page.wait_for_load_state('networkidle')
+                except Exception as e:
+                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            # In case we didn't catch a document type somehow
            final_response = final_response if final_response else first_response
+            if not final_response:
+                raise ValueError("Failed to get a response from the page")
+
            # This will be parsed inside `Response`
            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
            # PlayWright API sometimes give empty status text for some reason!
            status_text = final_response.status_text or StatusText.get(final_response.status)

-            history =
-
-
-
-
-
-                        url=current_request.url,
-                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
-                        text='',
-                        body=b'',
-                        status=current_response.status if current_response else 301,
-                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
-                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
-                        cookies={},
-                        headers=await current_response.all_headers() if current_response else {},
-                        request_headers=await current_request.all_headers(),
-                        **self.adaptor_arguments
-                    ))
-                current_request = current_request.redirected_from
+            history = self._process_response_history(first_response)
+            try:
+                page_content = await page.content()
+            except Exception as e:
+                log.error(f"Error getting page content in async: {e}")
+                page_content = ""

            response = Response(
                url=page.url,
-                text=
-                body=
+                text=page_content,
+                body=page_content.encode('utf-8'),
                status=final_response.status,
                reason=status_text,
                encoding=encoding,
@@ -276,5 +294,6 @@ class CamoufoxEngine:
                **self.adaptor_arguments
            )
            await page.close()
+            await context.close()

            return response
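Note: the new `_process_response_history` helper above (mirrored in `pw.py` below) walks Playwright's `request.redirected_from` chain backwards and builds the redirect history oldest-first. A self-contained sketch of that traversal using stand-in objects instead of the real Playwright request type:

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class FakeRequest:
    """Hypothetical stand-in for a Playwright request that remembers its redirect source."""
    url: str
    redirected_from: Optional["FakeRequest"] = None


def redirect_history(final_request: FakeRequest) -> List[str]:
    history: List[str] = []
    current = final_request.redirected_from
    while current:
        history.insert(0, current.url)  # prepend so the oldest hop ends up first
        current = current.redirected_from
    return history


final = FakeRequest("https://example.com/c",
                    FakeRequest("https://example.com/b",
                                FakeRequest("https://example.com/a")))
print(redirect_history(final))  # ['https://example.com/a', 'https://example.com/b']
```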
scrapling/engines/pw.py
CHANGED
@@ -19,20 +19,20 @@ class PlaywrightEngine:
            self, headless: Union[bool, str] = True,
            disable_resources: bool = False,
            useragent: Optional[str] = None,
-            network_idle:
+            network_idle: bool = False,
            timeout: Optional[float] = 30000,
            page_action: Callable = None,
            wait_selector: Optional[str] = None,
            locale: Optional[str] = 'en-US',
            wait_selector_state: SelectorWaitStates = 'attached',
-            stealth:
-            real_chrome:
-            hide_canvas:
-            disable_webgl:
+            stealth: bool = False,
+            real_chrome: bool = False,
+            hide_canvas: bool = False,
+            disable_webgl: bool = False,
            cdp_url: Optional[str] = None,
-            nstbrowser_mode:
+            nstbrowser_mode: bool = False,
            nstbrowser_config: Optional[Dict] = None,
-            google_search:
+            google_search: bool = True,
            extra_headers: Optional[Dict[str, str]] = None,
            proxy: Optional[Union[str, Dict[str, str]]] = None,
            adaptor_arguments: Dict = None
@@ -126,7 +126,7 @@ class PlaywrightEngine:

        return cdp_url

-    @lru_cache(typed=True)
+    @lru_cache(126, typed=True)
    def __set_flags(self):
        """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
        flags = DEFAULT_STEALTH_FLAGS
@@ -169,7 +169,7 @@ class PlaywrightEngine:

        return context_kwargs

-    @lru_cache()
+    @lru_cache(10)
    def __stealth_scripts(self):
        # Basic bypasses nothing fancy as I'm still working on it
        # But with adding these bypasses to the above config, it bypasses many online tests like
@@ -188,6 +188,38 @@ class PlaywrightEngine:
            )
        )

+    def _process_response_history(self, first_response):
+        """Process response history to build a list of Response objects"""
+        history = []
+        current_request = first_response.request.redirected_from
+
+        try:
+            while current_request:
+                try:
+                    current_response = current_request.response()
+                    history.insert(0, Response(
+                        url=current_request.url,
+                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                        text='',
+                        body=b'',
+                        status=current_response.status if current_response else 301,
+                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+                        cookies={},
+                        headers=current_response.all_headers() if current_response else {},
+                        request_headers=current_request.all_headers(),
+                        **self.adaptor_arguments
+                    ))
+                except Exception as e:
+                    log.error(f"Error processing redirect: {e}")
+                    break
+
+                current_request = current_request.redirected_from
+        except Exception as e:
+            log.error(f"Error processing response history: {e}")
+
+        return history
+
    def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

@@ -201,8 +233,8 @@ class PlaywrightEngine:
        else:
            from rebrowser_playwright.sync_api import sync_playwright

-        # Store the final response
        final_response = None
+        referer = generate_convincing_referer(url) if self.google_search else None

        def handle_response(finished_response: PlaywrightResponse):
            nonlocal final_response
@@ -218,11 +250,9 @@ class PlaywrightEngine:
            browser = p.chromium.launch(**self.__launch_kwargs())

            context = browser.new_context(**self.__context_kwargs())
-            # Finally we are in business
            page = context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
-            # Listen for all responses
            page.on("response", handle_response)

            if self.extra_headers:
@@ -235,54 +265,51 @@ class PlaywrightEngine:
                for script in self.__stealth_scripts():
                    page.add_init_script(path=script)

-            first_response = page.goto(url, referer=
+            first_response = page.goto(url, referer=referer)
            page.wait_for_load_state(state="domcontentloaded")
+
            if self.network_idle:
                page.wait_for_load_state('networkidle')

            if self.page_action is not None:
-
+                try:
+                    page = self.page_action(page)
+                except Exception as e:
+                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector and type(self.wait_selector) is str:
-
-
-
-
-
-
-
+                try:
+                    waiter = page.locator(self.wait_selector)
+                    waiter.first.wait_for(state=self.wait_selector_state)
+                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                    page.wait_for_load_state(state="load")
+                    page.wait_for_load_state(state="domcontentloaded")
+                    if self.network_idle:
+                        page.wait_for_load_state('networkidle')
+                except Exception as e:
+                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            # In case we didn't catch a document type somehow
            final_response = final_response if final_response else first_response
+            if not final_response:
+                raise ValueError("Failed to get a response from the page")
+
            # This will be parsed inside `Response`
            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
            # PlayWright API sometimes give empty status text for some reason!
            status_text = final_response.status_text or StatusText.get(final_response.status)

-            history =
-
-
-
-
-
-                        url=current_request.url,
-                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
-                        text='',
-                        body=b'',
-                        status=current_response.status if current_response else 301,
-                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
-                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
-                        cookies={},
-                        headers=current_response.all_headers() if current_response else {},
-                        request_headers=current_request.all_headers(),
-                        **self.adaptor_arguments
-                    ))
-                current_request = current_request.redirected_from
+            history = self._process_response_history(first_response)
+            try:
+                page_content = page.content()
+            except Exception as e:
+                log.error(f"Error getting page content: {e}")
+                page_content = ""

            response = Response(
                url=page.url,
-                text=
-                body=
+                text=page_content,
+                body=page_content.encode('utf-8'),
                status=final_response.status,
                reason=status_text,
                encoding=encoding,
@@ -293,6 +320,7 @@ class PlaywrightEngine:
                **self.adaptor_arguments
            )
            page.close()
+            context.close()
            return response

    async def async_fetch(self, url: str) -> Response:
@@ -308,8 +336,8 @@ class PlaywrightEngine:
        else:
            from rebrowser_playwright.async_api import async_playwright

-        # Store the final response
        final_response = None
+        referer = generate_convincing_referer(url) if self.google_search else None

        async def handle_response(finished_response: PlaywrightResponse):
            nonlocal final_response
@@ -325,11 +353,9 @@ class PlaywrightEngine:
            browser = await p.chromium.launch(**self.__launch_kwargs())

            context = await browser.new_context(**self.__context_kwargs())
-            # Finally we are in business
            page = await context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
-            # Listen for all responses
            page.on("response", handle_response)

            if self.extra_headers:
@@ -342,54 +368,51 @@ class PlaywrightEngine:
                for script in self.__stealth_scripts():
                    await page.add_init_script(path=script)

-            first_response = await page.goto(url, referer=
+            first_response = await page.goto(url, referer=referer)
            await page.wait_for_load_state(state="domcontentloaded")
+
            if self.network_idle:
                await page.wait_for_load_state('networkidle')

            if self.page_action is not None:
-
+                try:
+                    page = await self.page_action(page)
+                except Exception as e:
+                    log.error(f"Error executing async page_action: {e}")

            if self.wait_selector and type(self.wait_selector) is str:
-
-
-
-
-
-
-
+                try:
+                    waiter = page.locator(self.wait_selector)
+                    await waiter.first.wait_for(state=self.wait_selector_state)
+                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                    await page.wait_for_load_state(state="load")
+                    await page.wait_for_load_state(state="domcontentloaded")
+                    if self.network_idle:
+                        await page.wait_for_load_state('networkidle')
+                except Exception as e:
+                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            # In case we didn't catch a document type somehow
            final_response = final_response if final_response else first_response
+            if not final_response:
+                raise ValueError("Failed to get a response from the page")
+
            # This will be parsed inside `Response`
            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
            # PlayWright API sometimes give empty status text for some reason!
            status_text = final_response.status_text or StatusText.get(final_response.status)

-            history =
-
-
-
-
-
-                        url=current_request.url,
-                        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
-                        text='',
-                        body=b'',
-                        status=current_response.status if current_response else 301,
-                        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
-                        encoding=current_response.headers.get('content-type', '') or 'utf-8',
-                        cookies={},
-                        headers=await current_response.all_headers() if current_response else {},
-                        request_headers=await current_request.all_headers(),
-                        **self.adaptor_arguments
-                    ))
-                current_request = current_request.redirected_from
+            history = self._process_response_history(first_response)
+            try:
+                page_content = await page.content()
+            except Exception as e:
+                log.error(f"Error getting page content in async: {e}")
+                page_content = ""

            response = Response(
                url=page.url,
-                text=
-                body=
+                text=page_content,
+                body=page_content.encode('utf-8'),
                status=final_response.status,
                reason=status_text,
                encoding=encoding,
@@ -400,4 +423,6 @@ class PlaywrightEngine:
                **self.adaptor_arguments
            )
            await page.close()
+            await context.close()
+
            return response
scrapling/engines/static.py
CHANGED
@@ -7,10 +7,10 @@ from scrapling.core.utils import log, lru_cache
from .toolbelt import Response, generate_convincing_referer, generate_headers


-@lru_cache(typed=True)
+@lru_cache(5, typed=True)  # Singleton easily
class StaticEngine:
    def __init__(
-            self, url: str, proxy: Optional[str] = None, stealthy_headers:
+            self, url: str, proxy: Optional[str] = None, stealthy_headers: bool = True, follow_redirects: bool = True,
            timeout: Optional[Union[int, float]] = None, retries: Optional[int] = 3, adaptor_arguments: Tuple = None
    ):
        """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
@@ -79,17 +79,25 @@ class StaticEngine:
            **self.adaptor_arguments
        )

+    def _make_request(self, method: str, **kwargs) -> Response:
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
+            request = getattr(client, method)(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        return self._prepare_response(request)
+
+    async def _async_make_request(self, method: str, **kwargs) -> Response:
+        headers = self._headers_job(kwargs.pop('headers', {}))
+        async with httpx.AsyncClient(proxy=self.proxy, transport=httpx.AsyncHTTPTransport(retries=self.retries)) as client:
+            request = await getattr(client, method)(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+        return self._prepare_response(request)
+
    def get(self, **kwargs: Dict) -> Response:
        """Make basic HTTP GET request for you but with some added flavors.

        :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
-            request = client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return self._make_request('get', **kwargs)

    async def async_get(self, **kwargs: Dict) -> Response:
        """Make basic async HTTP GET request for you but with some added flavors.
@@ -97,11 +105,7 @@ class StaticEngine:
        :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            request = await client.get(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return await self._async_make_request('get', **kwargs)

    def post(self, **kwargs: Dict) -> Response:
        """Make basic HTTP POST request for you but with some added flavors.
@@ -109,11 +113,7 @@ class StaticEngine:
        :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
-            request = client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return self._make_request('post', **kwargs)

    async def async_post(self, **kwargs: Dict) -> Response:
        """Make basic async HTTP POST request for you but with some added flavors.
@@ -121,11 +121,7 @@ class StaticEngine:
        :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            request = await client.post(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return await self._async_make_request('post', **kwargs)

    def delete(self, **kwargs: Dict) -> Response:
        """Make basic HTTP DELETE request for you but with some added flavors.
@@ -133,11 +129,7 @@ class StaticEngine:
        :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
-            request = client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return self._make_request('delete', **kwargs)

    async def async_delete(self, **kwargs: Dict) -> Response:
        """Make basic async HTTP DELETE request for you but with some added flavors.
@@ -145,11 +137,7 @@ class StaticEngine:
        :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            request = await client.delete(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return await self._async_make_request('delete', **kwargs)

    def put(self, **kwargs: Dict) -> Response:
        """Make basic HTTP PUT request for you but with some added flavors.
@@ -157,11 +145,7 @@ class StaticEngine:
        :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
-            request = client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return self._make_request('put', **kwargs)

    async def async_put(self, **kwargs: Dict) -> Response:
        """Make basic async HTTP PUT request for you but with some added flavors.
@@ -169,8 +153,4 @@ class StaticEngine:
        :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            request = await client.put(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
-
-        return self._prepare_response(request)
+        return await self._async_make_request('put', **kwargs)
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -16,7 +16,7 @@ class ResponseEncoding:
    __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}

    @classmethod
-    @lru_cache(maxsize=
+    @lru_cache(maxsize=256)
    def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
        """Parse content type and parameters from a content-type header value.

@@ -38,7 +38,7 @@ class ResponseEncoding:
        return content_type, params

    @classmethod
-    @lru_cache(maxsize=
+    @lru_cache(maxsize=256)
    def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
        """Determine the appropriate character encoding from a content-type header.

scrapling/engines/toolbelt/fingerprints.py
CHANGED
@@ -12,7 +12,7 @@ from scrapling.core._types import Dict, Union
from scrapling.core.utils import lru_cache


-@lru_cache(
+@lru_cache(128, typed=True)
def generate_convincing_referer(url: str) -> str:
    """Takes the domain from the URL without the subdomain/suffix and make it look like you were searching google for this website

@@ -26,7 +26,7 @@ def generate_convincing_referer(url: str) -> str:
    return f'https://www.google.com/search?q={website_name}'


-@lru_cache(
+@lru_cache(128, typed=True)
def get_os_name() -> Union[str, None]:
    """Get the current OS name in the same format needed for browserforge

scrapling/engines/toolbelt/navigation.py
CHANGED
@@ -110,7 +110,7 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
        raise ValueError(f"Invalid CDP URL: {str(e)}")


-@lru_cache(
+@lru_cache(126, typed=True)
def js_bypass_path(filename: str) -> str:
    """Takes the base filename of JS file inside the `bypasses` folder then return the full path of it

scrapling/fetchers.py
CHANGED
@@ -11,7 +11,7 @@ class Fetcher(BaseFetcher):
    Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
    """
    def get(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP GET request for you but with some added flavors.

@@ -30,7 +30,7 @@ class Fetcher(BaseFetcher):
        return response_object

    def post(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP POST request for you but with some added flavors.

@@ -49,7 +49,7 @@ class Fetcher(BaseFetcher):
        return response_object

    def put(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP PUT request for you but with some added flavors.

@@ -69,7 +69,7 @@ class Fetcher(BaseFetcher):
        return response_object

    def delete(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP DELETE request for you but with some added flavors.

@@ -90,7 +90,7 @@ class Fetcher(BaseFetcher):

class AsyncFetcher(Fetcher):
    async def get(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP GET request for you but with some added flavors.

@@ -109,7 +109,7 @@ class AsyncFetcher(Fetcher):
        return response_object

    async def post(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP POST request for you but with some added flavors.

@@ -128,7 +128,7 @@ class AsyncFetcher(Fetcher):
        return response_object

    async def put(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP PUT request for you but with some added flavors.

@@ -147,7 +147,7 @@ class AsyncFetcher(Fetcher):
        return response_object

    async def delete(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers:
+            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
        """Make basic HTTP DELETE request for you but with some added flavors.

@@ -173,11 +173,11 @@ class StealthyFetcher(BaseFetcher):
    Other added flavors include setting the faked OS fingerprints to match the user's OS and the referer of every request is set as if this request came from Google's search of this URL's domain.
    """
    def fetch(
-            self, url: str, headless:
-            block_webrtc:
+            self, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None,
            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-            wait_selector_state: SelectorWaitStates = 'attached', google_search:
-            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize:
+            wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
    ) -> Response:
        """
        Opens up a browser and do your request based on your chosen options below.
@@ -231,11 +231,11 @@ class StealthyFetcher(BaseFetcher):
        return engine.fetch(url)

    async def async_fetch(
-            self, url: str, headless:
-            block_webrtc:
+            self, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None,
            timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-            wait_selector_state: SelectorWaitStates = 'attached', google_search:
-            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize:
+            wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
    ) -> Response:
        """
        Opens up a browser and do your request based on your chosen options below.
@@ -307,13 +307,13 @@ class PlayWrightFetcher(BaseFetcher):
    """
    def fetch(
            self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
-            useragent: Optional[str] = None, network_idle:
+            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000,
            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
-            hide_canvas:
+            hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
            proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
-            stealth:
+            stealth: bool = False, real_chrome: bool = False,
            cdp_url: Optional[str] = None,
-            nstbrowser_mode:
+            nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
    ) -> Response:
        """Opens up a browser and do your request based on your chosen options below.

@@ -367,13 +367,13 @@ class PlayWrightFetcher(BaseFetcher):

    async def async_fetch(
            self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
-            useragent: Optional[str] = None, network_idle:
+            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000,
            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
-            hide_canvas:
+            hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
            proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
-            stealth:
+            stealth: bool = False, real_chrome: bool = False,
            cdp_url: Optional[str] = None,
-            nstbrowser_mode:
+            nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
    ) -> Response:
        """Opens up a browser and do your request based on your chosen options below.

scrapling/parser.py
CHANGED
@@ -71,7 +71,7 @@ class Adaptor(SelectorsGeneration):
        if root is None and not body and text is None:
            raise ValueError("Adaptor class needs text, body, or root arguments to work")

-        self.__text =
+        self.__text = ''
        self.__raw_body = ''
        if root is None:
            if text is None:
{scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
Metadata-Version: 2.2
Name: scrapling
-Version: 0.2.96
+Version: 0.2.97
Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
Home-page: https://github.com/D4Vinci/Scrapling
Author: Karim Shoair
@@ -73,6 +73,22 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

# Sponsors

+[Scrapeless Deep SerpApi](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci) From $0.10 per 1,000 queries with a 1-2 second response time!
+
+Deep SerpApi is a dedicated search engine designed for large language models (LLMs) and AI agents, aiming to provide real-time, accurate and unbiased information to help AI applications retrieve and process data efficiently.
+- covering 20+ Google SERP scenarios and mainstream search engines.
+- support real-time data updates to ensure real-time and accurate information.
+- It can integrate information from all available online channels and search engines.
+- Deep SerpApi will simplify the process of integrating dynamic web information into AI solutions, and ultimately achieve an ALL-in-One API for one-click search and extraction of web data.
+- **Developer Support Program**: Integrate Scrapeless Deep SerpApi into your AI tools, applications or projects. [We already support Dify, and will soon support frameworks such as Langchain, Langflow, FlowiseAI]. Then share your results on GitHub or social media, and you will get a 1-12 month free developer support opportunity, up to 500 free usage per month.
+- 🚀 **Scraping API**: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
+- ⚡ **Scraping Browser**: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
+- 🌐 **Proxies**: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
+
+
+[](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci)
+---
+
[Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**

- 👩‍💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
@@ -88,21 +104,6 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
[](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
---

-[Scrapeless](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci) is your all-in-one web scraping toolkit, starting at just $0.60 per 1k URLs!
-
-- 🚀 Scraping API: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
-- ⚡ Scraping Browser: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
-- 🔒 Web Unlocker: Bypass CAPTCHAs, IP blocks, and dynamic content in real time, ensuring uninterrupted access.
-- 🌐 Proxies: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
-- 💼 Enterprise-Grade: Custom solutions for large-scale and complex data needs.
-- 🎁 Free Trial: Try before you buy—experience our service firsthand.
-- 💬 Pay-Per-Use: Flexible, cost-effective pricing with no long-term commitments.
-- 🔧 Easy Integration: Seamlessly integrate with your existing tools and workflows for hassle-free automation.
-
-
-[](https://www.scrapeless.com/?utm_source=github&utm_medium=ads&utm_campaign=scraping&utm_term=D4Vinci)
----
-
## Table of content
* [Key Features](#key-features)
* [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
{scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/RECORD
CHANGED
@@ -1,25 +1,25 @@
-scrapling/__init__.py,sha256=
+scrapling/__init__.py,sha256=5yeUml2K0xHe2NAALM2x2hGSl_ORcEttIZL17b1cWtg,500
scrapling/cli.py,sha256=7yTsMhVAqqS8Z27T5dFKrR9_X8vuFjBlwYgAF22W7T8,1292
scrapling/defaults.py,sha256=sdXeZjXEX7PmCtaa0weK0nRrAUzqZukNNqipZ_sltYE,469
-scrapling/fetchers.py,sha256=
-scrapling/parser.py,sha256=
+scrapling/fetchers.py,sha256=xwVCjAg0VCXwhB2igSLQvb0D0bOPGfg5WNtxgE7m-W0,34987
+scrapling/parser.py,sha256=U6qFV23qeeX1pYl6mw0TZEL4FlaQw6puaoDTldUpi-M,54328
scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
scrapling/core/_types.py,sha256=dKVi_dUxdxNtTr7sj7ySkHXDfrsmjFTfpCQeO5tGuBY,670
-scrapling/core/custom_types.py,sha256=
+scrapling/core/custom_types.py,sha256=EWGx5t5scHEB1SMsitzc8duskq-5f-Qaj40IWkNTRzM,12947
scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
-scrapling/core/storage_adaptors.py,sha256=
-scrapling/core/translator.py,sha256=
-scrapling/core/utils.py,sha256=
+scrapling/core/storage_adaptors.py,sha256=EkSE8LlOS9SggFblBNzgyEp0fLxl8dqYU3-MAuXUitY,6216
+scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
+scrapling/core/utils.py,sha256=0e3jD029CXj3gfA_MIKcBC0Mai9fXW2scIuoKtHy1e8,3704
scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
-scrapling/engines/camo.py,sha256=
+scrapling/engines/camo.py,sha256=oYKA0l3EpOcQW2APRj5FEmslqtp9A8i_ZljqlKvIDeI,16129
scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
-scrapling/engines/pw.py,sha256=
-scrapling/engines/static.py,sha256=
+scrapling/engines/pw.py,sha256=_fy8mhkVrOnb_Qho8zKCjFyd1Y_kr2mkdo0PHrBks4M,21371
+scrapling/engines/static.py,sha256=okrEIFfYaxqVuIXPanxQDxQpN8i88AgWODo7Dnex2EI,9306
scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
-scrapling/engines/toolbelt/custom.py,sha256=
-scrapling/engines/toolbelt/fingerprints.py,sha256=
-scrapling/engines/toolbelt/navigation.py,sha256=
+scrapling/engines/toolbelt/custom.py,sha256=dwpuEHNOd9uJbMf7sx8sXsYZhozSXStrwqfpooce1Wk,12811
+scrapling/engines/toolbelt/fingerprints.py,sha256=spJMij0qBGvbSlVjv9xJWCF8KFDf6xnNz5fWtXWhrzY,2927
+scrapling/engines/toolbelt/navigation.py,sha256=KyFQ4vHS4jR7z378VRGtUeXQHWr5NMy5nNp2-c_Evk8,4566
scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
@@ -41,9 +41,9 @@ tests/fetchers/sync/test_playwright.py,sha256=MEyDRaMyxDIWupG7f_xz0f0jd9Cpbd5rXC
tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
+scrapling-0.2.97.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.97.dist-info/METADATA,sha256=VnP3UEy6RcQytld-8ZYSF0Cpdd4fb-tKoX01jajFneo,69666
+scrapling-0.2.97.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+scrapling-0.2.97.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+scrapling-0.2.97.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.97.dist-info/RECORD,,
{scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/LICENSE: File without changes
{scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/entry_points.txt: File without changes
{scrapling-0.2.96.dist-info → scrapling-0.2.97.dist-info}/top_level.txt: File without changes