scrapling 0.3-py3-none-any.whl → 0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +51 -129
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +238 -293
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +220 -278
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +29 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +41 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/METADATA +57 -47
  30. scrapling-0.3.2.dist-info/RECORD +44 -0
  31. scrapling-0.3.dist-info/RECORD +0 -41
  32. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
@@ -6,7 +6,7 @@ from playwright.async_api import Page as AsyncPage
 
 from scrapling.core._types import Optional, List, Literal
 
-PageState = Literal["ready", "busy", "error"]  # States that a page can be in
+PageState = Literal["finished", "ready", "busy", "error"]  # States that a page can be in
 
 
 @dataclass
@@ -23,9 +23,9 @@ class PageInfo:
         self.state = "busy"
         self.url = url
 
-    def mark_ready(self):
-        """Mark the page as ready for new requests"""
-        self.state = "ready"
+    def mark_finished(self):
+        """Mark the page as finished for new requests"""
+        self.state = "finished"
         self.url = ""
 
     def mark_error(self):
@@ -62,24 +62,16 @@ class PagePool:
         self.pages.append(page_info)
         return page_info
 
-    def get_ready_page(self) -> Optional[PageInfo]:
-        """Get a page that's ready for use"""
-        with self._lock:
-            for page_info in self.pages:
-                if page_info.state == "ready":
-                    return page_info
-            return None
-
     @property
     def pages_count(self) -> int:
         """Get the total number of pages"""
         return len(self.pages)
 
     @property
-    def ready_count(self) -> int:
-        """Get the number of ready pages"""
+    def finished_count(self) -> int:
+        """Get the number of finished pages"""
         with self._lock:
-            return sum(1 for p in self.pages if p.state == "ready")
+            return sum(1 for p in self.pages if p.state == "finished")
 
     @property
     def busy_count(self) -> int:
@@ -91,3 +83,33 @@ class PagePool:
         """Remove pages in error state"""
         with self._lock:
             self.pages = [p for p in self.pages if p.state != "error"]
+
+    def close_all_finished_pages(self):
+        """Close all pages in finished state and remove them from the pool"""
+        with self._lock:
+            pages_to_remove = []
+            for page_info in self.pages:
+                if page_info.state == "finished":
+                    try:
+                        page_info.page.close()
+                    except Exception:
+                        pass
+                    pages_to_remove.append(page_info)
+
+            for page_info in pages_to_remove:
+                self.pages.remove(page_info)
+
+    async def aclose_all_finished_pages(self):
+        """Async version: Close all pages in finished state and remove them from the pool"""
+        with self._lock:
+            pages_to_remove = []
+            for page_info in self.pages:
+                if page_info.state == "finished":
+                    try:
+                        await page_info.page.close()
+                    except Exception:
+                        pass
+                    pages_to_remove.append(page_info)
+
+            for page_info in pages_to_remove:
+                self.pages.remove(page_info)
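Note: the hunks above replace the old reusable "ready" page flow with a "finished" lifecycle: a page is marked busy while it serves a request, marked finished when the request completes, and finished pages are later counted and swept in bulk. Below is a minimal standalone sketch of that state machine; DemoPage and DemoPool are hypothetical stand-ins for the library's PageInfo and PagePool, not the actual classes.

    # Hypothetical stand-ins illustrating the finished-page lifecycle added in 0.3.2.
    # The real classes are PageInfo/PagePool in scrapling/engines/_browsers/_page.py.
    from dataclasses import dataclass
    from threading import RLock
    from typing import List, Literal

    PageState = Literal["finished", "ready", "busy", "error"]


    @dataclass
    class DemoPage:
        state: PageState = "ready"
        url: str = ""

        def mark_busy(self, url: str) -> None:
            self.state, self.url = "busy", url

        def mark_finished(self) -> None:
            # Mirrors PageInfo.mark_finished(): the request is done, the page awaits cleanup.
            self.state, self.url = "finished", ""


    class DemoPool:
        def __init__(self) -> None:
            self.pages: List[DemoPage] = []
            self._lock = RLock()

        @property
        def finished_count(self) -> int:
            with self._lock:
                return sum(1 for p in self.pages if p.state == "finished")

        def close_all_finished_pages(self) -> None:
            # Mirrors the new sweep: drop finished pages from the pool
            # (the real method also calls page.close() on each one first).
            with self._lock:
                self.pages = [p for p in self.pages if p.state != "finished"]


    pool = DemoPool()
    page = DemoPage()
    pool.pages.append(page)
    page.mark_busy("https://example.com")
    page.mark_finished()
    assert pool.finished_count == 1
    pool.close_all_finished_pages()
    assert pool.finished_count == 0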
@@ -9,7 +9,7 @@ from scrapling.core._types import (
     List,
     SelectorWaitStates,
 )
-from scrapling.engines.toolbelt import construct_proxy_dict
+from scrapling.engines.toolbelt.navigation import construct_proxy_dict
 
 
 class PlaywrightConfig(Struct, kw_only=True, frozen=False):
@@ -25,17 +25,17 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     stealth: bool = False
     wait: int | float = 0
     page_action: Optional[Callable] = None
-    proxy: Optional[str | Dict[str, str]] = (
-        None  # The default value for proxy in Playwright's source is `None`
-    )
+    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
     locale: str = "en-US"
     extra_headers: Optional[Dict[str, str]] = None
     useragent: Optional[str] = None
     timeout: int | float = 30000
+    init_script: Optional[str] = None
     disable_resources: bool = False
     wait_selector: Optional[str] = None
     cookies: Optional[List[Dict]] = None
     network_idle: bool = False
+    load_dom: bool = True
     wait_selector_state: SelectorWaitStates = "attached"
     selector_config: Optional[Dict] = None
 
@@ -45,10 +45,8 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
             raise ValueError("max_pages must be between 1 and 50")
         if self.timeout < 0:
             raise ValueError("timeout must be >= 0")
-        if self.page_action is not None and not callable(self.page_action):
-            raise TypeError(
-                f"page_action must be callable, got {type(self.page_action).__name__}"
-            )
+        if self.page_action and not callable(self.page_action):
+            raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
         if self.proxy:
             self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
         if self.cdp_url:
@@ -58,6 +56,15 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
         if not self.selector_config:
             self.selector_config = {}
 
+        if self.init_script is not None:
+            script_path = Path(self.init_script)
+            if not script_path.exists():
+                raise ValueError("Init script path not found")
+            elif not script_path.is_file():
+                raise ValueError("Init script is not a file")
+            elif not script_path.is_absolute():
+                raise ValueError("Init script is not a absolute path")
+
     @staticmethod
     def __validate_cdp(cdp_url):
         try:
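Note: PlaywrightConfig (and CamoufoxConfig further down) gains an init_script option whose __post_init__ validation requires an absolute path to an existing file. Below is a standalone sketch of the rule using the same error messages as the diff; check_init_script is an illustrative helper, not part of scrapling's API.

    # Illustrative re-statement of the init_script validation added in 0.3.2.
    from pathlib import Path


    def check_init_script(init_script: str) -> str:
        script_path = Path(init_script)
        if not script_path.exists():
            raise ValueError("Init script path not found")
        elif not script_path.is_file():
            raise ValueError("Init script is not a file")
        elif not script_path.is_absolute():
            raise ValueError("Init script is not a absolute path")
        return init_script


    # check_init_script("/home/user/scripts/init.js")  # accepted if that file exists
    # check_init_script("init.js")                     # rejected: not an absolute path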
@@ -86,10 +93,12 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
     block_webrtc: bool = False
     allow_webgl: bool = True
     network_idle: bool = False
+    load_dom: bool = True
     humanize: bool | float = True
     solve_cloudflare: bool = False
     wait: int | float = 0
     timeout: int | float = 30000
+    init_script: Optional[str] = None
     page_action: Optional[Callable] = None
     wait_selector: Optional[str] = None
     addons: Optional[List[str]] = None
@@ -97,9 +106,7 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
     cookies: Optional[List[Dict]] = None
     google_search: bool = True
     extra_headers: Optional[Dict[str, str]] = None
-    proxy: Optional[str | Dict[str, str]] = (
-        None  # The default value for proxy in Playwright's source is `None`
-    )
+    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
     os_randomize: bool = False
     disable_ads: bool = False
     geoip: bool = False
@@ -112,10 +119,8 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
             raise ValueError("max_pages must be between 1 and 50")
         if self.timeout < 0:
             raise ValueError("timeout must be >= 0")
-        if self.page_action is not None and not callable(self.page_action):
-            raise TypeError(
-                f"page_action must be callable, got {type(self.page_action).__name__}"
-            )
+        if self.page_action and not callable(self.page_action):
+            raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
         if self.proxy:
             self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
 
@@ -131,6 +136,15 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
                     f"Addon's path is not a folder, you need to pass a folder of the extracted addon: {addon}"
                 )
 
+        if self.init_script is not None:
+            script_path = Path(self.init_script)
+            if not script_path.exists():
+                raise ValueError("Init script path not found")
+            elif not script_path.is_file():
+                raise ValueError("Init script is not a file")
+            elif not script_path.is_absolute():
+                raise ValueError("Init script is not a absolute path")
+
         if not self.cookies:
             self.cookies = []
         if self.solve_cloudflare and self.timeout < 60_000:
@@ -16,9 +16,9 @@ HARMFUL_DEFAULT_ARGS = (
     # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
     "--enable-automation",
     "--disable-popup-blocking",
-    # '--disable-component-update',
-    # '--disable-default-apps',
-    # '--disable-extensions',
+    "--disable-component-update",
+    "--disable-default-apps",
+    "--disable-extensions",
 )
 
 DEFAULT_FLAGS = (
@@ -50,7 +50,6 @@ DEFAULT_STEALTH_FLAGS = (
     "--accept-lang=en-US",
     "--use-mock-keychain",
     "--disable-translate",
-    "--disable-extensions",
     "--disable-voice-input",
     "--window-position=0,0",
     "--disable-wake-on-wifi",
@@ -59,7 +58,6 @@ DEFAULT_STEALTH_FLAGS = (
     "--enable-web-bluetooth",
     "--disable-hang-monitor",
     "--disable-cloud-import",
-    "--disable-default-apps",
     "--disable-print-preview",
     "--disable-dev-shm-usage",
     # '--disable-popup-blocking',
@@ -72,7 +70,6 @@ DEFAULT_STEALTH_FLAGS = (
     "--force-color-profile=srgb",
     "--font-render-hinting=none",
     "--aggressive-cache-discard",
-    "--disable-component-update",
     "--disable-cookie-encryption",
     "--disable-domain-reliability",
     "--disable-threaded-animation",
@@ -24,13 +24,9 @@ from scrapling.core._types import (
     Any,
 )
 
-from .toolbelt import (
-    Response,
-    generate_convincing_referer,
-    generate_headers,
-    ResponseFactory,
-    __default_useragent__,
-)
+from .toolbelt.custom import Response
+from .toolbelt.convertor import ResponseFactory
+from .toolbelt.fingerprints import generate_convincing_referer, generate_headers, __default_useragent__
 
 _UNSET = object()
 
@@ -108,13 +104,9 @@ class FetcherSession:
 
         headers = self.get_with_precedence(kwargs, "headers", self.default_headers)
         stealth = self.get_with_precedence(kwargs, "stealth", self.stealth)
-        impersonate = self.get_with_precedence(
-            kwargs, "impersonate", self.default_impersonate
-        )
+        impersonate = self.get_with_precedence(kwargs, "impersonate", self.default_impersonate)
 
-        if self.get_with_precedence(
-            kwargs, "http3", self.default_http3
-        ):  # pragma: no cover
+        if self.get_with_precedence(kwargs, "http3", self.default_http3):  # pragma: no cover
             request_args["http_version"] = CurlHttpVersion.V3ONLY
         if impersonate:
             log.warning(
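Note: the reflowed lines above all funnel through FetcherSession.get_with_precedence(kwargs, name, session_default), which lets a per-request keyword override the session-wide default. Its body is not shown in this diff; a typical sentinel-based implementation consistent with the _UNSET = object() marker above might look like the following (illustrative only).

    # Assumed shape of the precedence helper; _UNSET marks "caller did not pass this".
    _UNSET = object()


    def get_with_precedence(kwargs: dict, key: str, session_default):
        """Return the per-request value when supplied, otherwise the session default."""
        value = kwargs.pop(key, _UNSET)
        return session_default if value is _UNSET else value


    print(get_with_precedence({"timeout": 5000}, "timeout", 30000))  # -> 5000
    print(get_with_precedence({}, "timeout", 30000))                 # -> 30000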
@@ -126,25 +118,13 @@ class FetcherSession:
             "url": url,
             # Curl automatically generates the suitable browser headers when you use `impersonate`
             "headers": self._headers_job(url, headers, stealth, bool(impersonate)),
-            "proxies": self.get_with_precedence(
-                kwargs, "proxies", self.default_proxies
-            ),
+            "proxies": self.get_with_precedence(kwargs, "proxies", self.default_proxies),
             "proxy": self.get_with_precedence(kwargs, "proxy", self.default_proxy),
-            "proxy_auth": self.get_with_precedence(
-                kwargs, "proxy_auth", self.default_proxy_auth
-            ),
-            "timeout": self.get_with_precedence(
-                kwargs, "timeout", self.default_timeout
-            ),
-            "allow_redirects": self.get_with_precedence(
-                kwargs, "allow_redirects", self.default_follow_redirects
-            ),
-            "max_redirects": self.get_with_precedence(
-                kwargs, "max_redirects", self.default_max_redirects
-            ),
-            "verify": self.get_with_precedence(
-                kwargs, "verify", self.default_verify
-            ),
+            "proxy_auth": self.get_with_precedence(kwargs, "proxy_auth", self.default_proxy_auth),
+            "timeout": self.get_with_precedence(kwargs, "timeout", self.default_timeout),
+            "allow_redirects": self.get_with_precedence(kwargs, "allow_redirects", self.default_follow_redirects),
+            "max_redirects": self.get_with_precedence(kwargs, "max_redirects", self.default_max_redirects),
+            "verify": self.get_with_precedence(kwargs, "verify", self.default_verify),
             "cert": self.get_with_precedence(kwargs, "cert", self.default_cert),
             "impersonate": impersonate,
             **{
@@ -192,18 +172,12 @@ class FetcherSession:
 
             extra_headers = generate_headers(browser_mode=False)
             # Don't overwrite user-supplied headers
-            extra_headers = {
-                key: value
-                for key, value in extra_headers.items()
-                if key.lower() not in headers_keys
-            }
+            extra_headers = {key: value for key, value in extra_headers.items() if key.lower() not in headers_keys}
             headers.update(extra_headers)
 
         elif "user-agent" not in headers_keys and not impersonate_enabled:
             headers["User-Agent"] = __default_useragent__
-            log.debug(
-                f"Can't find useragent in headers so '{headers['User-Agent']}' was used."
-            )
+            log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
 
         return headers
 
@@ -215,9 +189,7 @@ class FetcherSession:
                 "Create a new FetcherSession instance for a new independent session, "
                 "or use the current instance sequentially after the previous context has exited."
             )
-        if (
-            self._async_curl_session
-        ):  # Prevent mixing if async is active from this instance
+        if self._async_curl_session:  # Prevent mixing if async is active from this instance
             raise RuntimeError(
                 "This FetcherSession instance has an active asynchronous session. "
                 "Cannot enter a synchronous context simultaneously with the same manager instance."
@@ -275,9 +247,7 @@ class FetcherSession:
         :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
         """
         session = self._curl_session
-        if session is True and not any(
-            (self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)
-        ):
+        if session is True and not any((self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)):
             # For usage inside FetcherClient
             # It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
             session = CurlSession()
@@ -290,9 +260,7 @@ class FetcherSession:
                     return ResponseFactory.from_http_request(response, selector_config)
             except CurlError as e:  # pragma: no cover
                 if attempt < max_retries - 1:
-                    log.error(
-                        f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds..."
-                    )
+                    log.error(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
                     time_sleep(retry_delay)
                 else:
                     log.error(f"Failed after {max_retries} attempts: {e}")
@@ -320,9 +288,7 @@ class FetcherSession:
         :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
         """
         session = self._async_curl_session
-        if session is True and not any(
-            (self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)
-        ):
+        if session is True and not any((self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)):
             # For usage inside the ` AsyncFetcherClient ` class, and that's for several reasons
             # 1. It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
             # 2. `curl_cffi` doesn't support making async requests without sessions
@@ -337,9 +303,7 @@ class FetcherSession:
                     return ResponseFactory.from_http_request(response, selector_config)
             except CurlError as e:  # pragma: no cover
                 if attempt < max_retries - 1:
-                    log.error(
-                        f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds..."
-                    )
+                    log.error(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
                     await asyncio_sleep(retry_delay)
                 else:
                     log.error(f"Failed after {max_retries} attempts: {e}")
@@ -372,19 +336,13 @@ class FetcherSession:
 
         selector_config = kwargs.pop("selector_config", {}) or self.selector_config
         max_retries = self.get_with_precedence(kwargs, "retries", self.default_retries)
-        retry_delay = self.get_with_precedence(
-            kwargs, "retry_delay", self.default_retry_delay
-        )
+        retry_delay = self.get_with_precedence(kwargs, "retry_delay", self.default_retry_delay)
         request_args = self._merge_request_args(stealth=stealth, **kwargs)
         if self._curl_session:
-            return self.__make_request(
-                method, request_args, max_retries, retry_delay, selector_config
-            )
+            return self.__make_request(method, request_args, max_retries, retry_delay, selector_config)
         elif self._async_curl_session:
             # The returned value is a Coroutine
-            return self.__make_async_request(
-                method, request_args, max_retries, retry_delay, selector_config
-            )
+            return self.__make_async_request(method, request_args, max_retries, retry_delay, selector_config)
 
         raise RuntimeError("No active session available.")
 
@@ -455,9 +413,7 @@ class FetcherSession:
             "http3": http3,
             **kwargs,
         }
-        return self.__prepare_and_dispatch(
-            "GET", stealth=stealthy_headers, **request_args
-        )
+        return self.__prepare_and_dispatch("GET", stealth=stealthy_headers, **request_args)
 
     def post(
         self,
@@ -532,9 +488,7 @@ class FetcherSession:
             "http3": http3,
             **kwargs,
         }
-        return self.__prepare_and_dispatch(
-            "POST", stealth=stealthy_headers, **request_args
-        )
+        return self.__prepare_and_dispatch("POST", stealth=stealthy_headers, **request_args)
 
     def put(
         self,
@@ -609,9 +563,7 @@ class FetcherSession:
             "http3": http3,
             **kwargs,
         }
-        return self.__prepare_and_dispatch(
-            "PUT", stealth=stealthy_headers, **request_args
-        )
+        return self.__prepare_and_dispatch("PUT", stealth=stealthy_headers, **request_args)
 
     def delete(
         self,
@@ -688,9 +640,7 @@ class FetcherSession:
             "http3": http3,
             **kwargs,
         }
-        return self.__prepare_and_dispatch(
-            "DELETE", stealth=stealthy_headers, **request_args
-        )
+        return self.__prepare_and_dispatch("DELETE", stealth=stealthy_headers, **request_args)
 
 
 class FetcherClient(FetcherSession):
@@ -1,20 +1 @@
-from .custom import (
-    BaseFetcher,
-    Response,
-    StatusText,
-    get_variable_name,
-)
-from .fingerprints import (
-    generate_convincing_referer,
-    generate_headers,
-    get_os_name,
-    __default_useragent__,
-)
-from .navigation import (
-    async_intercept_route,
-    construct_cdp_url,
-    construct_proxy_dict,
-    intercept_route,
-    js_bypass_path,
-)
-from .convertor import ResponseFactory
+
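Note: this last hunk empties scrapling/engines/toolbelt/__init__.py, so the 0.3 convenience re-exports are gone in 0.3.2. Code that imported those helpers from the toolbelt package root now has to target the submodules directly, exactly as the _validators.py and static.py hunks above already do:

    # scrapling 0.3
    from scrapling.engines.toolbelt import construct_proxy_dict, ResponseFactory

    # scrapling 0.3.2
    from scrapling.engines.toolbelt.navigation import construct_proxy_dict
    from scrapling.engines.toolbelt.convertor import ResponseFactory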