scrapling 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +3 -0
- scrapling/core/ai.py +2 -1
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +4 -3
- scrapling/core/storage.py +5 -5
- scrapling/core/translator.py +13 -8
- scrapling/engines/_browsers/_base.py +37 -14
- scrapling/engines/_browsers/_camoufox.py +76 -35
- scrapling/engines/_browsers/_config_tools.py +1 -1
- scrapling/engines/_browsers/_controllers.py +32 -11
- scrapling/engines/_browsers/_validators.py +31 -10
- scrapling/engines/static.py +678 -668
- scrapling/engines/toolbelt/convertor.py +13 -15
- scrapling/engines/toolbelt/custom.py +6 -9
- scrapling/engines/toolbelt/fingerprints.py +17 -10
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +11 -1
- scrapling/fetchers/chrome.py +9 -4
- scrapling/fetchers/firefox.py +0 -4
- scrapling/parser.py +105 -80
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/METADATA +3 -4
- scrapling-0.3.7.dist-info/RECORD +47 -0
- scrapling-0.3.6.dist-info/RECORD +0 -47
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
scrapling/engines/toolbelt/convertor.py
CHANGED
@@ -24,15 +24,15 @@ class ResponseFactory:
 
     @classmethod
     @lru_cache(maxsize=16)
-    def __extract_browser_encoding(cls, content_type: str | None) -> str | None:
+    def __extract_browser_encoding(cls, content_type: str | None, default: str = "utf-8") -> str:
         """Extract browser encoding from headers.
         Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8"
         """
         if content_type:
             # Because Playwright can't do that by themselves like all libraries for some reason :3
             match = __CHARSET_RE__.search(content_type)
-            return match.group(1) if match else None
-        return None
+            return match.group(1) if match else default
+        return default
 
     @classmethod
     def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
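The new `default` parameter removes the `str | None` return type, so call sites no longer need an `or "utf-8"` fallback. A minimal sketch of the behavior, assuming a charset regex equivalent to the module's `__CHARSET_RE__` (which this diff does not show):

```python
import re
from functools import lru_cache

# Stand-in for scrapling's __CHARSET_RE__ (an assumption; the real pattern isn't in this diff)
CHARSET_RE = re.compile(r"charset=([\w-]+)", re.I)


@lru_cache(maxsize=16)
def extract_browser_encoding(content_type: str | None, default: str = "utf-8") -> str:
    # Always returns a string now; `default` replaces the old None returns
    if content_type:
        match = CHARSET_RE.search(content_type)
        return match.group(1) if match else default
    return default


assert extract_browser_encoding("text/html; charset=ISO-8859-1") == "ISO-8859-1"
assert extract_browser_encoding(None) == "utf-8"
```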
@@ -58,7 +58,8 @@ class ResponseFactory:
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                or "utf-8",
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": current_response.all_headers() if current_response else {},
                 "request_headers": current_request.all_headers(),
@@ -107,15 +108,13 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")
 
-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        )  # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
 
         history = cls._process_response_history(first_response, parser_arguments)
         try:
-            page_content = page.content()
+            page_content = final_response.text()
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content: {e}")
             page_content = ""
@@ -161,7 +160,8 @@ class ResponseFactory:
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                or "utf-8",
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": await current_response.all_headers() if current_response else {},
                 "request_headers": await current_request.all_headers(),
@@ -210,15 +210,13 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")
 
-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        )  # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
 
         history = await cls._async_process_response_history(first_response, parser_arguments)
         try:
-            page_content = await page.content()
+            page_content = await final_response.text()
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content in async: {e}")
             page_content = ""
@@ -255,8 +253,8 @@ class ResponseFactory:
             "encoding": response.encoding or "utf-8",
             "cookies": dict(response.cookies),
             "headers": dict(response.headers),
-            "request_headers": dict(response.request.headers),
-            "method": response.request.method,
+            "request_headers": dict(response.request.headers) if response.request else {},
+            "method": response.request.method if response.request else "GET",
             "history": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
             **parser_arguments,
         }
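The convertor.py hunks above all apply the same guard: attributes of a possibly-missing object are only read behind a conditional expression, with a safe default otherwise. A hedged sketch of the pattern (class names are illustrative, not scrapling's):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeRequest:
    headers: dict
    method: str


@dataclass
class FakeResponse:
    # Illustrative: some transports can hand back a response with no request attached
    request: Optional[FakeRequest] = None


def summarize(response: FakeResponse) -> dict:
    # The condition is evaluated before the attribute access, so None never raises
    return {
        "request_headers": dict(response.request.headers) if response.request else {},
        "method": response.request.method if response.request else "GET",
    }


assert summarize(FakeResponse()) == {"request_headers": {}, "method": "GET"}
```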
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -8,6 +8,7 @@ from scrapling.core.utils import log
 from scrapling.core._types import (
     Any,
     Dict,
+    cast,
     List,
     Optional,
     Tuple,
@@ -30,10 +31,10 @@ class Response(Selector):
         request_headers: Dict,
         encoding: str = "utf-8",
         method: str = "GET",
-        history: List = None,
-        **selector_config: Dict,
+        history: List | None = None,
+        **selector_config: Any,
     ):
-        adaptive_domain = selector_config.pop("adaptive_domain", None)
+        adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
         self.status = status
         self.reason = reason
         self.cookies = cookies
@@ -58,7 +59,7 @@ class BaseFetcher:
     keep_cdata: Optional[bool] = False
     storage_args: Optional[Dict] = None
     keep_comments: Optional[bool] = False
-    adaptive_domain: Optional[str] = None
+    adaptive_domain: str = ""
     parser_keywords: Tuple = (
         "huge_tree",
         "adaptive",
@@ -124,12 +125,8 @@ class BaseFetcher:
             adaptive=cls.adaptive,
             storage=cls.storage,
             storage_args=cls.storage_args,
+            adaptive_domain=cls.adaptive_domain,
         )
-        if cls.adaptive_domain:
-            if not isinstance(cls.adaptive_domain, str):
-                log.warning('[Ignored] The argument "adaptive_domain" must be of string type')
-            else:
-                parser_arguments.update({"adaptive_domain": cls.adaptive_domain})
 
         return parser_arguments
 
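Note that `cast` only narrows the type for checkers; it does not convert or validate anything at runtime, which is why the attribute default also became a real `str` (`""`). A quick illustration of that contract:

```python
from typing import Any, cast


def pop_domain(**selector_config: Any) -> str:
    # cast() is a no-op at runtime; it just tells the type checker to treat
    # the popped value as str. The "" default keeps that promise honest.
    return cast(str, selector_config.pop("adaptive_domain", ""))


assert pop_domain(adaptive_domain="example.com") == "example.com"
assert pop_domain() == ""
```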
scrapling/engines/toolbelt/fingerprints.py
CHANGED
@@ -8,9 +8,10 @@ from platform import system as platform_system
 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator
 
-from scrapling.core._types import Dict, Optional
+from scrapling.core._types import Dict, Literal
 
 __OS_NAME__ = platform_system()
+OSName = Literal["linux", "macos", "windows"]
 
 
 @lru_cache(10, typed=True)
@@ -28,16 +29,20 @@ def generate_convincing_referer(url: str) -> str:
 
 
 @lru_cache(1, typed=True)
-def get_os_name() -> Optional[str]:
-    """Get the current OS name in the same format needed for browserforge
+def get_os_name() -> OSName | None:
+    """Get the current OS name in the same format needed for browserforge, if the OS is Unknown, return None so browserforge uses all.
 
     :return: Current OS name or `None` otherwise
     """
-    return {
-        "Linux": "linux",
-        "Darwin": "macos",
-        "Windows": "windows",
-    }.get(__OS_NAME__)
+    match __OS_NAME__:
+        case "Linux":
+            return "linux"
+        case "Darwin":
+            return "macos"
+        case "Windows":
+            return "windows"
+        case _:
+            return None
 
 
 def generate_headers(browser_mode: bool = False) -> Dict:
@@ -58,8 +63,10 @@ def generate_headers(browser_mode: bool = False) -> Dict:
             Browser(name="edge", min_version=130),
         ]
     )
-
-    return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
+    if os_name:
+        return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
+    else:
+        return HeaderGenerator(browser=browsers, device="desktop").generate()
 
 
 __default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")
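The `Literal` alias plus the `match` statement (Python 3.10+) gives type checkers a closed set of return values, and `generate_headers` now omits the `os` argument entirely when the platform is unknown. A sketch of how a caller can rely on that, assuming the same `OSName` alias:

```python
from platform import system as platform_system
from typing import Literal

OSName = Literal["linux", "macos", "windows"]


def get_os_name() -> OSName | None:
    # Unknown platforms fall through to None instead of an arbitrary string
    match platform_system():
        case "Linux":
            return "linux"
        case "Darwin":
            return "macos"
        case "Windows":
            return "windows"
        case _:
            return None


# Callers branch on None exactly like the new generate_headers() does
os_name = get_os_name()
kwargs = {"os": os_name} if os_name else {}
```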
scrapling/engines/toolbelt/navigation.py
CHANGED
@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
 from playwright.sync_api import Route
 
 from scrapling.core.utils import log
-from scrapling.core._types import Dict, Tuple
+from scrapling.core._types import Dict, Tuple, overload, Literal
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
 
 __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
@@ -49,7 +49,15 @@ async def async_intercept_route(route: async_Route):
         await route.continue_()
 
 
-def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) -> Dict | Tuple:
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[True]) -> Tuple: ...
+
+
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[False] = False) -> Dict: ...
+
+
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: bool = False) -> Dict | Tuple:
     """Validate a proxy and return it in the acceptable format for Playwright
     Reference: https://playwright.dev/python/docs/network#http-proxy
 
@@ -83,7 +91,7 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) -> Dict | Tuple:
         except ValidationError as e:
             raise TypeError(f"Invalid proxy dictionary: {e}")
 
-    return {}
+    raise TypeError(f"Invalid proxy string: {proxy_string}")
 
 
 @lru_cache(10, typed=True)
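The two `@overload` stubs let type checkers narrow the return type from the literal value of `as_tuple`, while a single runtime implementation does the work; invalid proxy strings now also fail loudly instead of falling through silently. A self-contained sketch of the same pattern (the function below is illustrative, not scrapling's validator):

```python
from typing import Dict, Literal, Tuple, overload


@overload
def normalize_proxy(proxy: str, as_tuple: Literal[True]) -> Tuple: ...
@overload
def normalize_proxy(proxy: str, as_tuple: Literal[False] = False) -> Dict: ...


def normalize_proxy(proxy: str, as_tuple: bool = False) -> Dict | Tuple:
    # Only this body exists at runtime; the stubs above are type-checker-only
    if "://" not in proxy:
        raise TypeError(f"Invalid proxy string: {proxy}")
    result = {"server": proxy}
    return tuple(result.items()) if as_tuple else result


d = normalize_proxy("http://127.0.0.1:8080")        # checkers infer Dict
t = normalize_proxy("http://127.0.0.1:8080", True)  # checkers infer Tuple
```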
scrapling/fetchers/__init__.py
CHANGED
@@ -19,7 +19,17 @@ _LAZY_IMPORTS = {
     "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
 }
 
-__all__ = [
+__all__ = [
+    "Fetcher",
+    "AsyncFetcher",
+    "FetcherSession",
+    "DynamicFetcher",
+    "DynamicSession",
+    "AsyncDynamicSession",
+    "StealthyFetcher",
+    "StealthySession",
+    "AsyncStealthySession",
+]
 
 
 def __getattr__(name: str) -> Any:
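Because `__all__` entries are resolved through `_LAZY_IMPORTS` and the module-level `__getattr__` (PEP 562), listing the session classes costs nothing until they are first accessed. A minimal sketch of the mechanism; the mapping below shows one entry from the diff, not scrapling's full table:

```python
from importlib import import_module
from typing import Any

_LAZY_IMPORTS = {
    "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
}


def __getattr__(name: str) -> Any:
    # Called only when `name` is not already in the module namespace (PEP 562)
    if name in _LAZY_IMPORTS:
        module_path, attr_name = _LAZY_IMPORTS[name]
        return getattr(import_module(module_path), attr_name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```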
scrapling/fetchers/chrome.py
CHANGED
@@ -1,10 +1,9 @@
 from scrapling.core._types import (
     Callable,
-    Dict,
     List,
+    Dict,
     Optional,
     SelectorWaitStates,
-    Iterable,
 )
 from scrapling.engines.toolbelt.custom import BaseFetcher, Response
 from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
@@ -47,10 +46,11 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[Iterable[Dict]] = None,
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        additional_args: Optional[Dict] = None,
         custom_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
@@ -80,6 +80,7 @@ class DynamicFetcher(BaseFetcher):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         :return: A `Response` object.
         """
         if not custom_config:
@@ -107,6 +108,7 @@ class DynamicFetcher(BaseFetcher):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
             selector_config={**cls._generate_parser_arguments(), **custom_config},
@@ -134,10 +136,11 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[Iterable[Dict]] = None,
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        additional_args: Optional[Dict] = None,
         custom_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
@@ -167,6 +170,7 @@ class DynamicFetcher(BaseFetcher):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         :return: A `Response` object.
         """
         if not custom_config:
@@ -195,6 +199,7 @@ class DynamicFetcher(BaseFetcher):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
             selector_config={**cls._generate_parser_arguments(), **custom_config},
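Taken together, the chrome.py changes tighten `cookies` to a concrete `List[Dict]` and let callers forward arbitrary Playwright context options through `additional_args`. A hedged usage sketch; the cookie shape follows Playwright's `add_cookies()`, and `ignore_https_errors` is just one example context option, not something this diff prescribes:

```python
from scrapling.fetchers import DynamicFetcher

page = DynamicFetcher.fetch(
    "https://example.com",
    cookies=[{"name": "session", "value": "abc123", "url": "https://example.com"}],
    # Forwarded to Playwright's context and, per the new docstring, takes
    # priority over Scrapling's own context settings on conflict
    additional_args={"ignore_https_errors": True},
)
print(page.status)
```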
scrapling/fetchers/firefox.py
CHANGED
@@ -83,8 +83,6 @@ class StealthyFetcher(BaseFetcher):
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
         with StealthySession(
             wait=wait,
@@ -182,8 +180,6 @@ class StealthyFetcher(BaseFetcher):
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
         async with AsyncStealthySession(
             wait=wait,