scrapling 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +29 -19
- scrapling/cli.py +21 -4
- scrapling/core/_types.py +3 -2
- scrapling/core/ai.py +24 -15
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +6 -4
- scrapling/core/storage.py +7 -6
- scrapling/core/translator.py +13 -8
- scrapling/core/utils/__init__.py +0 -1
- scrapling/engines/_browsers/__init__.py +0 -2
- scrapling/engines/_browsers/_base.py +45 -21
- scrapling/engines/_browsers/_camoufox.py +98 -43
- scrapling/engines/_browsers/_config_tools.py +1 -1
- scrapling/engines/_browsers/_controllers.py +34 -13
- scrapling/engines/_browsers/_validators.py +31 -10
- scrapling/engines/constants.py +0 -15
- scrapling/engines/static.py +749 -336
- scrapling/engines/toolbelt/convertor.py +13 -15
- scrapling/engines/toolbelt/custom.py +6 -9
- scrapling/engines/toolbelt/fingerprints.py +17 -10
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +46 -0
- scrapling/fetchers/chrome.py +210 -0
- scrapling/fetchers/firefox.py +212 -0
- scrapling/fetchers/requests.py +28 -0
- scrapling/parser.py +109 -84
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/METADATA +17 -16
- scrapling-0.3.7.dist-info/RECORD +47 -0
- scrapling/fetchers.py +0 -444
- scrapling-0.3.5.dist-info/RECORD +0 -44
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
```diff
--- a/scrapling/engines/toolbelt/convertor.py
+++ b/scrapling/engines/toolbelt/convertor.py
@@ -24,15 +24,15 @@ class ResponseFactory:
 
     @classmethod
     @lru_cache(maxsize=16)
-    def __extract_browser_encoding(cls, content_type: str | None) -> str | None:
+    def __extract_browser_encoding(cls, content_type: str | None, default: str = "utf-8") -> str:
        """Extract browser encoding from headers.
         Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8"
         """
         if content_type:
             # Because Playwright can't do that by themselves like all libraries for some reason :3
             match = __CHARSET_RE__.search(content_type)
-            return match.group(1) if match else None
-        return None
+            return match.group(1) if match else default
+        return default
 
     @classmethod
     def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
@@ -58,7 +58,8 @@
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                or "utf-8",
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": current_response.all_headers() if current_response else {},
                 "request_headers": current_request.all_headers(),
@@ -107,15 +108,13 @@
         if not final_response:
             raise ValueError("Failed to get a response from the page")
 
-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        )  # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
 
         history = cls._process_response_history(first_response, parser_arguments)
         try:
-            page_content =
+            page_content = final_response.text()
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content: {e}")
             page_content = ""
@@ -161,7 +160,8 @@
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                or "utf-8",
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": await current_response.all_headers() if current_response else {},
                 "request_headers": await current_request.all_headers(),
@@ -210,15 +210,13 @@
         if not final_response:
             raise ValueError("Failed to get a response from the page")
 
-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        )  # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
 
         history = await cls._async_process_response_history(first_response, parser_arguments)
         try:
-            page_content = await
+            page_content = await final_response.text()
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content in async: {e}")
             page_content = ""
@@ -255,8 +253,8 @@
             "encoding": response.encoding or "utf-8",
             "cookies": dict(response.cookies),
             "headers": dict(response.headers),
-            "request_headers": dict(response.request.headers),
-            "method": response.request.method,
+            "request_headers": dict(response.request.headers) if response.request else {},
+            "method": response.request.method if response.request else "GET",
             "history": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
             **parser_arguments,
         }
```
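The core change in `convertor.py` is that `__extract_browser_encoding` now takes a `default` argument and always returns a string, so callers no longer need an `or "utf-8"` fallback; the remaining `if current_response` guards cover a missing response rather than a missing charset. A minimal sketch of the new behavior, assuming a charset regex along the lines of the one below (the real `__CHARSET_RE__` is defined elsewhere in the module and is not part of this diff):

```python
import re

# Assumed pattern; the actual __CHARSET_RE__ lives elsewhere in convertor.py
__CHARSET_RE__ = re.compile(r"charset=([\w-]+)")


def extract_browser_encoding(content_type: str | None, default: str = "utf-8") -> str:
    # Mirrors the new logic: always returns a string, never None
    if content_type:
        match = __CHARSET_RE__.search(content_type)
        return match.group(1) if match else default
    return default


assert extract_browser_encoding("text/html; charset=ISO-8859-1") == "ISO-8859-1"
assert extract_browser_encoding("text/html") == "utf-8"  # no charset, use default
assert extract_browser_encoding(None) == "utf-8"         # missing header, use default
```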
```diff
--- a/scrapling/engines/toolbelt/custom.py
+++ b/scrapling/engines/toolbelt/custom.py
@@ -8,6 +8,7 @@ from scrapling.core.utils import log
 from scrapling.core._types import (
     Any,
     Dict,
+    cast,
     List,
     Optional,
     Tuple,
@@ -30,10 +31,10 @@ class Response(Selector):
         request_headers: Dict,
         encoding: str = "utf-8",
         method: str = "GET",
-        history: List = None,
-        **selector_config:
+        history: List | None = None,
+        **selector_config: Any,
     ):
-        adaptive_domain = selector_config.pop("adaptive_domain", None)
+        adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
         self.status = status
         self.reason = reason
         self.cookies = cookies
@@ -58,7 +59,7 @@ class BaseFetcher:
     keep_cdata: Optional[bool] = False
     storage_args: Optional[Dict] = None
     keep_comments: Optional[bool] = False
-    adaptive_domain: Optional[str] = None
+    adaptive_domain: str = ""
     parser_keywords: Tuple = (
         "huge_tree",
         "adaptive",
@@ -124,12 +125,8 @@ class BaseFetcher:
             adaptive=cls.adaptive,
             storage=cls.storage,
             storage_args=cls.storage_args,
+            adaptive_domain=cls.adaptive_domain,
         )
-        if cls.adaptive_domain:
-            if not isinstance(cls.adaptive_domain, str):
-                log.warning('[Ignored] The argument "adaptive_domain" must be of string type')
-            else:
-                parser_arguments.update({"adaptive_domain": cls.adaptive_domain})
 
         return parser_arguments
```
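In `custom.py` the typing of `Response.__init__` is tightened (`history: List | None`, `**selector_config: Any`), and the runtime string check for `adaptive_domain` disappears: `BaseFetcher.adaptive_domain` is now always a `str` (default `""`), so the popped value can simply be `cast`. A small sketch of that pattern; `init_example` below is a hypothetical stand-in for the constructor, not scrapling's API:

```python
from typing import Any, cast


def init_example(**selector_config: Any) -> None:
    # pop() returns Any; cast() narrows the type for checkers with no runtime
    # validation, which is safe here because the producer guarantees a str default ("")
    adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
    print(repr(adaptive_domain), selector_config)


init_example(adaptive_domain="example.com", huge_tree=True)  # 'example.com' {'huge_tree': True}
```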
```diff
--- a/scrapling/engines/toolbelt/fingerprints.py
+++ b/scrapling/engines/toolbelt/fingerprints.py
@@ -8,9 +8,10 @@ from platform import system as platform_system
 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator
 
-from scrapling.core._types import Dict,
+from scrapling.core._types import Dict, Literal
 
 __OS_NAME__ = platform_system()
+OSName = Literal["linux", "macos", "windows"]
 
 
 @lru_cache(10, typed=True)
@@ -28,16 +29,20 @@ def generate_convincing_referer(url: str) -> str:
 
 
 @lru_cache(1, typed=True)
-def get_os_name() ->
-    """Get the current OS name in the same format needed for browserforge
+def get_os_name() -> OSName | None:
+    """Get the current OS name in the same format needed for browserforge, if the OS is Unknown, return None so browserforge uses all.
 
     :return: Current OS name or `None` otherwise
     """
-    return {
-        "Linux": "linux",
-        "Darwin": "macos",
-        "Windows": "windows",
-    }.get(__OS_NAME__)
+    match __OS_NAME__:
+        case "Linux":
+            return "linux"
+        case "Darwin":
+            return "macos"
+        case "Windows":
+            return "windows"
+        case _:
+            return None
 
 
 def generate_headers(browser_mode: bool = False) -> Dict:
@@ -58,8 +63,10 @@ def generate_headers(browser_mode: bool = False) -> Dict:
             Browser(name="edge", min_version=130),
         ]
     )
-
-
+    if os_name:
+        return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
+    else:
+        return HeaderGenerator(browser=browsers, device="desktop").generate()
 
 
 __default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")
```
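`get_os_name` is rewritten as a `match` statement with a `Literal` return type, and `generate_headers` now omits the `os` argument entirely when the platform is unrecognized, instead of passing a falsy value through. Roughly how the call site behaves (the `browsers` list is abbreviated; browserforge's `HeaderGenerator` is called exactly as in the diff):

```python
from browserforge.headers import Browser, HeaderGenerator

browsers = [Browser(name="edge", min_version=130)]  # abbreviated list
os_name = None  # what get_os_name() returns on an unrecognized platform

if os_name:
    # Constrain generated headers to the detected OS
    headers = HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
else:
    # No os= argument: browserforge picks from all supported operating systems
    headers = HeaderGenerator(browser=browsers, device="desktop").generate()

print(headers.get("User-Agent"))
```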
```diff
--- a/scrapling/engines/toolbelt/navigation.py
+++ b/scrapling/engines/toolbelt/navigation.py
@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
 from playwright.sync_api import Route
 
 from scrapling.core.utils import log
-from scrapling.core._types import Dict,
+from scrapling.core._types import Dict, Tuple, overload, Literal
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
 
 __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
@@ -49,7 +49,15 @@ async def async_intercept_route(route: async_Route):
         await route.continue_()
 
 
-def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) ->
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[True]) -> Tuple: ...
+
+
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[False] = False) -> Dict: ...
+
+
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: bool = False) -> Dict | Tuple:
     """Validate a proxy and return it in the acceptable format for Playwright
     Reference: https://playwright.dev/python/docs/network#http-proxy
 
@@ -83,7 +91,7 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) ->
         except ValidationError as e:
             raise TypeError(f"Invalid proxy dictionary: {e}")
 
-
+    raise TypeError(f"Invalid proxy string: {proxy_string}")
 
 
 @lru_cache(10, typed=True)
```
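`construct_proxy_dict` gains `@overload` stubs so type checkers can tie the return type to the `as_tuple` flag, and the silent fall-through at the end of the function becomes an explicit `TypeError`. The typing pattern in isolation; `proxy_of` is a hypothetical stand-in, not scrapling's function:

```python
from typing import Dict, Literal, Tuple, overload


@overload
def proxy_of(server: str, as_tuple: Literal[True]) -> Tuple: ...
@overload
def proxy_of(server: str, as_tuple: Literal[False] = False) -> Dict: ...
def proxy_of(server: str, as_tuple: bool = False) -> Dict | Tuple:
    # The implementation handles both shapes; the overloads narrow them for callers
    result = {"server": server}
    return tuple(result.items()) if as_tuple else result


d = proxy_of("http://127.0.0.1:8080")                 # checkers infer Dict
t = proxy_of("http://127.0.0.1:8080", as_tuple=True)  # checkers infer Tuple
```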
```diff
--- /dev/null
+++ b/scrapling/fetchers/__init__.py
@@ -0,0 +1,46 @@
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from scrapling.fetchers.requests import Fetcher, AsyncFetcher, FetcherSession
+    from scrapling.fetchers.chrome import DynamicFetcher, DynamicSession, AsyncDynamicSession
+    from scrapling.fetchers.firefox import StealthyFetcher, StealthySession, AsyncStealthySession
+
+
+# Lazy import mapping
+_LAZY_IMPORTS = {
+    "Fetcher": ("scrapling.fetchers.requests", "Fetcher"),
+    "AsyncFetcher": ("scrapling.fetchers.requests", "AsyncFetcher"),
+    "FetcherSession": ("scrapling.fetchers.requests", "FetcherSession"),
+    "DynamicFetcher": ("scrapling.fetchers.chrome", "DynamicFetcher"),
+    "DynamicSession": ("scrapling.fetchers.chrome", "DynamicSession"),
+    "AsyncDynamicSession": ("scrapling.fetchers.chrome", "AsyncDynamicSession"),
+    "StealthyFetcher": ("scrapling.fetchers.firefox", "StealthyFetcher"),
+    "StealthySession": ("scrapling.fetchers.firefox", "StealthySession"),
+    "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
+}
+
+__all__ = [
+    "Fetcher",
+    "AsyncFetcher",
+    "FetcherSession",
+    "DynamicFetcher",
+    "DynamicSession",
+    "AsyncDynamicSession",
+    "StealthyFetcher",
+    "StealthySession",
+    "AsyncStealthySession",
+]
+
+
+def __getattr__(name: str) -> Any:
+    if name in _LAZY_IMPORTS:
+        module_path, class_name = _LAZY_IMPORTS[name]
+        module = __import__(module_path, fromlist=[class_name])
+        return getattr(module, class_name)
+    else:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def __dir__() -> list[str]:
+    """Support for dir() and autocomplete."""
+    return sorted(list(_LAZY_IMPORTS.keys()))
```
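The new `scrapling/fetchers/__init__.py` replaces the removed monolithic `fetchers.py` with a package that lazy-loads through module-level `__getattr__`/`__dir__` (PEP 562): importing the package stays cheap, and each fetcher's dependencies are only imported when the class is first touched, while the `TYPE_CHECKING` block keeps IDEs and type checkers aware of the real symbols. Expected behavior:

```python
import scrapling.fetchers

print(dir(scrapling.fetchers))         # the nine lazy names, served by __dir__
Fetcher = scrapling.fetchers.Fetcher   # first access triggers __getattr__, which imports the real module
try:
    scrapling.fetchers.NoSuchThing
except AttributeError as exc:
    print(exc)  # module 'scrapling.fetchers' has no attribute 'NoSuchThing'
```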
```diff
--- /dev/null
+++ b/scrapling/fetchers/chrome.py
@@ -0,0 +1,210 @@
+from scrapling.core._types import (
+    Callable,
+    List,
+    Dict,
+    Optional,
+    SelectorWaitStates,
+)
+from scrapling.engines.toolbelt.custom import BaseFetcher, Response
+from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
+
+
+class DynamicFetcher(BaseFetcher):
+    """A `Fetcher` class type that provide many options, all of them are based on PlayWright.
+
+    Using this Fetcher class, you can do requests with:
+        - Vanilla Playwright without any modifications other than the ones you chose.
+        - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress, but it bypasses many online tests like bot.sannysoft.com
+            Some of the things stealth mode does include:
+                1) Patches the CDP runtime fingerprint.
+                2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
+                3) Using custom flags on launch to hide Playwright even more and make it faster.
+                4) Generates real browser's headers of the same type and same user OS, then append it to the request.
+        - Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher, and most of the options can be enabled on it.
+
+    > Note that these are the main options with PlayWright, but it can be mixed.
+    """
+
+    @classmethod
+    def fetch(
+        cls,
+        url: str,
+        headless: bool = True,
+        google_search: bool = True,
+        hide_canvas: bool = False,
+        disable_webgl: bool = False,
+        real_chrome: bool = False,
+        stealth: bool = False,
+        wait: int | float = 0,
+        page_action: Optional[Callable] = None,
+        proxy: Optional[str | Dict[str, str]] = None,
+        locale: str = "en-US",
+        extra_headers: Optional[Dict[str, str]] = None,
+        useragent: Optional[str] = None,
+        cdp_url: Optional[str] = None,
+        timeout: int | float = 30000,
+        disable_resources: bool = False,
+        wait_selector: Optional[str] = None,
+        init_script: Optional[str] = None,
+        cookies: Optional[List[Dict]] = None,
+        network_idle: bool = False,
+        load_dom: bool = True,
+        wait_selector_state: SelectorWaitStates = "attached",
+        additional_args: Optional[Dict] = None,
+        custom_config: Optional[Dict] = None,
+    ) -> Response:
+        """Opens up a browser and do your request based on your chosen options below.
+
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
+        :return: A `Response` object.
+        """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        with DynamicSession(
+            wait=wait,
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            cookies=cookies,
+            headless=headless,
+            load_dom=load_dom,
+            useragent=useragent,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            init_script=init_script,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            additional_args=additional_args,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+            selector_config={**cls._generate_parser_arguments(), **custom_config},
+        ) as session:
+            return session.fetch(url)
+
+    @classmethod
+    async def async_fetch(
+        cls,
+        url: str,
+        headless: bool = True,
+        google_search: bool = True,
+        hide_canvas: bool = False,
+        disable_webgl: bool = False,
+        real_chrome: bool = False,
+        stealth: bool = False,
+        wait: int | float = 0,
+        page_action: Optional[Callable] = None,
+        proxy: Optional[str | Dict[str, str]] = None,
+        locale: str = "en-US",
+        extra_headers: Optional[Dict[str, str]] = None,
+        useragent: Optional[str] = None,
+        cdp_url: Optional[str] = None,
+        timeout: int | float = 30000,
+        disable_resources: bool = False,
+        wait_selector: Optional[str] = None,
+        init_script: Optional[str] = None,
+        cookies: Optional[List[Dict]] = None,
+        network_idle: bool = False,
+        load_dom: bool = True,
+        wait_selector_state: SelectorWaitStates = "attached",
+        additional_args: Optional[Dict] = None,
+        custom_config: Optional[Dict] = None,
+    ) -> Response:
+        """Opens up a browser and do your request based on your chosen options below.
+
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
+        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
+        :return: A `Response` object.
+        """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
+
+        async with AsyncDynamicSession(
+            wait=wait,
+            max_pages=1,
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            cookies=cookies,
+            headless=headless,
+            load_dom=load_dom,
+            useragent=useragent,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            init_script=init_script,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            additional_args=additional_args,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+            selector_config={**cls._generate_parser_arguments(), **custom_config},
+        ) as session:
+            return await session.fetch(url)
+
+
+PlayWrightFetcher = DynamicFetcher  # For backward-compatibility
```