scrapling 0.2.98__py3-none-any.whl → 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +745 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +630 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +150 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +205 -186
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +255 -260
- scrapling/parser.py +781 -449
- scrapling-0.3.dist-info/METADATA +409 -0
- scrapling-0.3.dist-info/RECORD +41 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -19
- scrapling/engines/camo.py +0 -299
- scrapling/engines/pw.py +0 -428
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.98.dist-info/METADATA +0 -867
- scrapling-0.2.98.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -95
- tests/fetchers/async/test_httpx.py +0 -83
- tests/fetchers/async/test_playwright.py +0 -99
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -68
- tests/fetchers/sync/test_httpx.py +0 -82
- tests/fetchers/sync/test_playwright.py +0 -87
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info/licenses}/LICENSE +0 -0
scrapling/engines/toolbelt/navigation.py
@@ -1,74 +1,97 @@
 """
 Functions related to files and URLs
 """
-
+
+from pathlib import Path
+from functools import lru_cache
 from urllib.parse import urlencode, urlparse
 
 from playwright.async_api import Route as async_Route
+from msgspec import Struct, structs, convert, ValidationError
 from playwright.sync_api import Route
 
-from scrapling.core.
-from scrapling.core.
+from scrapling.core.utils import log
+from scrapling.core._types import Dict, Optional, Tuple
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
 
+__BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
+
+
+class ProxyDict(Struct):
+    server: str
+    username: str = ""
+    password: str = ""
+
 
 def intercept_route(route: Route):
-    """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
+    """This is just a route handler, but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
 
     :param route: PlayWright `Route` object of the current page
     :return: PlayWright `Route` object
     """
     if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
-        log.debug(
+        log.debug(
+            f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
+        )
         route.abort()
     else:
         route.continue_()
 
 
 async def async_intercept_route(route: async_Route):
-    """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
+    """This is just a route handler, but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
 
     :param route: PlayWright `Route` object of the current page
     :return: PlayWright `Route` object
     """
     if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
-        log.debug(
+        log.debug(
+            f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
+        )
         await route.abort()
     else:
         await route.continue_()
 
 
-def construct_proxy_dict(
+def construct_proxy_dict(
+    proxy_string: str | Dict[str, str], as_tuple=False
+) -> Optional[Dict | Tuple]:
     """Validate a proxy and return it in the acceptable format for Playwright
     Reference: https://playwright.dev/python/docs/network#http-proxy
 
     :param proxy_string: A string or a dictionary representation of the proxy.
+    :param as_tuple: Return the proxy dictionary as a tuple to be cachable
     :return:
     """
-    if proxy_string:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if isinstance(proxy_string, str):
+        proxy = urlparse(proxy_string)
+        if (
+            proxy.scheme not in ("http", "https", "socks4", "socks5")
+            or not proxy.hostname
+        ):
+            raise ValueError("Invalid proxy string!")
+
+        try:
+            result = {
+                "server": f"{proxy.scheme}://{proxy.hostname}",
+                "username": proxy.username or "",
+                "password": proxy.password or "",
+            }
+            if proxy.port:
+                result["server"] += f":{proxy.port}"
+            return tuple(result.items()) if as_tuple else result
+        except ValueError:
+            # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
+            raise ValueError("The proxy argument's string is in invalid format!")
+
+    elif isinstance(proxy_string, dict):
+        try:
+            validated = convert(proxy_string, ProxyDict)
+            result_dict = structs.asdict(validated)
+            return tuple(result_dict.items()) if as_tuple else result_dict
+        except ValidationError as e:
+            raise TypeError(f"Invalid proxy dictionary: {e}")
+
     return None
 
 
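The hunk above replaces the old `construct_proxy_dict` body with explicit string/dict handling backed by the new msgspec `ProxyDict` struct. A minimal usage sketch, assuming the helper stays importable from `scrapling.engines.toolbelt.navigation` and behaves as the added lines suggest (the expected outputs are inferred from the diff, not verified):

# Sketch only - the import path and expected results are assumptions based on the diff above.
from scrapling.engines.toolbelt.navigation import construct_proxy_dict

# A proxy URL string is parsed and normalized into Playwright's proxy mapping
proxy = construct_proxy_dict("socks5://user:pass@127.0.0.1:9050")
# expected: {"server": "socks5://127.0.0.1:9050", "username": "user", "password": "pass"}

# A dict input is validated against the msgspec ProxyDict struct, with empty-string defaults
proxy = construct_proxy_dict({"server": "http://proxy.local:8080"})
# expected: {"server": "http://proxy.local:8080", "username": "", "password": ""}

# as_tuple=True returns the same mapping as a tuple of items, so the result can serve as a cache key
cache_key = construct_proxy_dict("http://proxy.local:8080", as_tuple=True)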
@@ -84,17 +107,24 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
     parsed = urlparse(cdp_url)
 
     # Check scheme
-    if parsed.scheme not in (
+    if parsed.scheme not in ("ws", "wss"):
         raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
 
     # Validate hostname and port
     if not parsed.netloc:
         raise ValueError("Invalid hostname for the CDP URL")
 
-
+    try:
+        # Checking if the port is valid (if available)
+        _ = parsed.port
+    except ValueError:
+        # urlparse will raise `ValueError` if the port can't be casted to integer
+        raise ValueError("Invalid port for the CDP URL")
+
+    # Ensure the path starts with /
     path = parsed.path
-    if not path.startswith(
-        path =
+    if not path.startswith("/"):
+        path = "/" + path
 
     # Reconstruct the base URL with validated parts
     validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"
@@ -112,10 +142,9 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
 
 @lru_cache(10, typed=True)
 def js_bypass_path(filename: str) -> str:
-    """Takes the base filename of JS file inside the `bypasses` folder then return the full path of it
+    """Takes the base filename of a JS file inside the `bypasses` folder, then return the full path of it
 
     :param filename: The base filename of the JS file.
     :return: The full path of the JS file.
     """
-
-    return os.path.join(current_directory, 'bypasses', filename)
+    return str(__BYPASSES_DIR__ / filename)