scrapling-0.3.4-py3-none-any.whl → scrapling-0.3.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scrapling/__init__.py CHANGED
@@ -1,28 +1,38 @@
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.3.4"
+ __version__ = "0.3.6"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"

+ from typing import Any, TYPE_CHECKING

- # A lightweight approach to create a lazy loader for each import for backward compatibility
- # This will reduces initial memory footprint significantly (only loads what's used)
- def __getattr__(name):
-     lazy_imports = {
-         "Fetcher": ("scrapling.fetchers", "Fetcher"),
-         "Selector": ("scrapling.parser", "Selector"),
-         "Selectors": ("scrapling.parser", "Selectors"),
-         "AttributesHandler": ("scrapling.core.custom_types", "AttributesHandler"),
-         "TextHandler": ("scrapling.core.custom_types", "TextHandler"),
-         "AsyncFetcher": ("scrapling.fetchers", "AsyncFetcher"),
-         "StealthyFetcher": ("scrapling.fetchers", "StealthyFetcher"),
-         "DynamicFetcher": ("scrapling.fetchers", "DynamicFetcher"),
-     }
-
-     if name in lazy_imports:
-         module_path, class_name = lazy_imports[name]
+ if TYPE_CHECKING:
+     from scrapling.parser import Selector, Selectors
+     from scrapling.core.custom_types import AttributesHandler, TextHandler
+     from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
+
+
+ # Lazy import mapping
+ _LAZY_IMPORTS = {
+     "Fetcher": ("scrapling.fetchers", "Fetcher"),
+     "Selector": ("scrapling.parser", "Selector"),
+     "Selectors": ("scrapling.parser", "Selectors"),
+     "AttributesHandler": ("scrapling.core.custom_types", "AttributesHandler"),
+     "TextHandler": ("scrapling.core.custom_types", "TextHandler"),
+     "AsyncFetcher": ("scrapling.fetchers", "AsyncFetcher"),
+     "StealthyFetcher": ("scrapling.fetchers", "StealthyFetcher"),
+     "DynamicFetcher": ("scrapling.fetchers", "DynamicFetcher"),
+ }
+ __all__ = ["Selector", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
+
+
+ def __getattr__(name: str) -> Any:
+     if name in _LAZY_IMPORTS:
+         module_path, class_name = _LAZY_IMPORTS[name]
          module = __import__(module_path, fromlist=[class_name])
          return getattr(module, class_name)
      else:
-         raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
+         raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


- __all__ = ["Selector", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
+ def __dir__() -> list[str]:
+     """Support for dir() and autocomplete."""
+     return sorted(__all__ + ["fetchers", "parser", "cli", "core", "__author__", "__version__", "__copyright__"])
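For readers unfamiliar with the pattern: a module-level __getattr__ (PEP 562) defers the real imports until first attribute access, while the TYPE_CHECKING block keeps IDEs and type checkers aware of the names. A minimal sketch of the observable behavior, assuming a standard install:

    import sys
    import scrapling

    # Importing the package does not load the heavy submodules yet.
    print("scrapling.fetchers" in sys.modules)  # False

    # First access routes through scrapling.__getattr__, which imports
    # scrapling.fetchers via _LAZY_IMPORTS and returns the class.
    Fetcher = scrapling.Fetcher
    print("scrapling.fetchers" in sys.modules)  # True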
scrapling/cli.py CHANGED
@@ -2,8 +2,9 @@ from pathlib import Path
  from subprocess import check_output
  from sys import executable as python_executable

+ from scrapling.core.utils import log
  from scrapling.engines.toolbelt.custom import Response
- from scrapling.core.utils import log, _CookieParser, _ParseHeaders
+ from scrapling.core.utils._shell import _CookieParser, _ParseHeaders
  from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable

  from orjson import loads as json_loads, JSONDecodeError
@@ -32,8 +33,8 @@ def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any

      try:
          return json_loads(json_string)
-     except JSONDecodeError as e:  # pragma: no cover
-         raise ValueError(f"Invalid JSON data '{json_string}': {e}")
+     except JSONDecodeError as err:  # pragma: no cover
+         raise ValueError(f"Invalid JSON data '{json_string}': {err}")


  def __Request_and_Save(
@@ -65,8 +66,8 @@ def __ParseExtractArguments(
      for key, value in _CookieParser(cookies):
          try:
              parsed_cookies[key] = value
-         except Exception as e:
-             raise ValueError(f"Could not parse cookies '{cookies}': {e}")
+         except Exception as err:
+             raise ValueError(f"Could not parse cookies '{cookies}': {err}")

      parsed_json = __ParseJSONData(json)
      parsed_params = {}
@@ -135,10 +136,26 @@ def install(force):  # pragma: no cover


  @command(help="Run Scrapling's MCP server (Check the docs for more info).")
- def mcp():
+ @option(
+     "--http",
+     is_flag=True,
+     default=False,
+     help="Whether to run the MCP server in streamable-http transport or leave it as stdio (Default: False)",
+ )
+ @option(
+     "--host",
+     type=str,
+     default="0.0.0.0",
+     help="The host to use if streamable-http transport is enabled (Default: '0.0.0.0')",
+ )
+ @option(
+     "--port", type=int, default=8000, help="The port to use if streamable-http transport is enabled (Default: 8000)"
+ )
+ def mcp(http, host, port):
      from scrapling.core.ai import ScraplingMCPServer

-     ScraplingMCPServer().serve()
+     server = ScraplingMCPServer()
+     server.serve(http, host, port)


  @command(help="Interactive scraping console")
@@ -766,7 +783,7 @@ def stealthy_fetch(
      :param disable_resources: Drop requests of unnecessary resources for a speed boost.
      :param block_webrtc: Blocks WebRTC entirely.
      :param humanize: Humanize the cursor movement.
-     :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page.
+     :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
      :param allow_webgl: Allow WebGL (recommended to keep enabled).
      :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
      :param disable_ads: Install the uBlock Origin addon on the browser.
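Taken together, the three new options let the mcp command pick its transport at launch time, with stdio remaining the default. A hypothetical invocation of the streamable-http mode, using only the flags defined above:

    scrapling mcp --http --host 127.0.0.1 --port 8000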
scrapling/core/_types.py CHANGED
@@ -39,6 +39,4 @@ except ImportError:  # pragma: no cover
      try:
          from typing_extensions import Self  # Backport
      except ImportError:
-         from typing import TypeVar
-
          Self = object
scrapling/core/ai.py CHANGED
@@ -42,10 +42,7 @@ def _ContentTranslator(content: Generator[str, None, None], page: _ScraplingResp


  class ScraplingMCPServer:
-     _server = FastMCP(name="Scrapling")
-
      @staticmethod
-     @_server.tool()
      def get(
          url: str,
          impersonate: Optional[BrowserTypeLiteral] = "chrome",
@@ -124,7 +121,6 @@ class ScraplingMCPServer:
      )

      @staticmethod
-     @_server.tool()
      async def bulk_get(
          urls: Tuple[str, ...],
          impersonate: Optional[BrowserTypeLiteral] = "chrome",
@@ -211,7 +207,6 @@ class ScraplingMCPServer:
      ]

      @staticmethod
-     @_server.tool()
      async def fetch(
          url: str,
          extraction_type: extraction_types = "markdown",
@@ -263,7 +258,7 @@ class ScraplingMCPServer:
      :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
      :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
      :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
-     :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
+     :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
      :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
      :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
      :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
@@ -300,7 +295,6 @@ class ScraplingMCPServer:
      )

      @staticmethod
-     @_server.tool()
      async def bulk_fetch(
          urls: Tuple[str, ...],
          extraction_type: extraction_types = "markdown",
@@ -352,7 +346,7 @@ class ScraplingMCPServer:
      :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
      :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
      :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
-     :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
+     :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
      :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
      :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
      :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
@@ -394,7 +388,6 @@ class ScraplingMCPServer:
      ]

      @staticmethod
-     @_server.tool()
      async def stealthy_fetch(
          url: str,
          extraction_type: extraction_types = "markdown",
@@ -443,7 +436,7 @@ class ScraplingMCPServer:
      :param cookies: Set cookies for the next request.
      :param addons: List of Firefox addons to use. Must be paths to extracted addons.
      :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
-     :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
+     :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
      :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
      :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
      :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
@@ -494,7 +487,6 @@ class ScraplingMCPServer:
      )

      @staticmethod
-     @_server.tool()
      async def bulk_stealthy_fetch(
          urls: Tuple[str, ...],
          extraction_type: extraction_types = "markdown",
@@ -543,7 +535,7 @@ class ScraplingMCPServer:
      :param cookies: Set cookies for the next request.
      :param addons: List of Firefox addons to use. Must be paths to extracted addons.
      :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
-     :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
+     :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
      :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
      :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
      :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
@@ -598,6 +590,22 @@ class ScraplingMCPServer:
          for page in responses
      ]

-     def serve(self):
+     def serve(self, http: bool, host: str, port: int):
          """Serve the MCP server."""
-         self._server.run(transport="stdio")
+         server = FastMCP(name="Scrapling", host=host, port=port)
+         server.add_tool(self.get, title="get", description=self.get.__doc__, structured_output=True)
+         server.add_tool(self.bulk_get, title="bulk_get", description=self.bulk_get.__doc__, structured_output=True)
+         server.add_tool(self.fetch, title="fetch", description=self.fetch.__doc__, structured_output=True)
+         server.add_tool(
+             self.bulk_fetch, title="bulk_fetch", description=self.bulk_fetch.__doc__, structured_output=True
+         )
+         server.add_tool(
+             self.stealthy_fetch, title="stealthy_fetch", description=self.stealthy_fetch.__doc__, structured_output=True
+         )
+         server.add_tool(
+             self.bulk_stealthy_fetch,
+             title="bulk_stealthy_fetch",
+             description=self.bulk_stealthy_fetch.__doc__,
+             structured_output=True,
+         )
+         server.run(transport="stdio" if not http else "streamable-http")
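The structural change: previously a FastMCP instance was created at class-definition time (so merely importing scrapling.core.ai built a server) and tools were registered through @_server.tool() decorators; now the server is constructed and the tools registered only when serve() runs. A condensed sketch of the pattern, reusing the same FastMCP calls the diff shows (the import path is an assumption based on the official mcp SDK):

    from mcp.server.fastmcp import FastMCP  # assumed import path

    class Server:
        @staticmethod
        def echo(text: str) -> str:
            """Return the given text unchanged."""
            return text

        def serve(self, http: bool, host: str, port: int) -> None:
            # Nothing server-related exists until serve() is called.
            server = FastMCP(name="Example", host=host, port=port)
            server.add_tool(self.echo, title="echo", description=self.echo.__doc__, structured_output=True)
            server.run(transport="streamable-http" if http else "stdio")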
scrapling/core/custom_types.py CHANGED
@@ -145,7 +145,7 @@ class TextHandler(str):
      clean_match: bool = False,
      case_sensitive: bool = True,
      check_match: Literal[False] = False,
- ) -> "TextHandlers[TextHandler]": ...
+ ) -> "TextHandlers": ...

  def re(
      self,
@@ -241,7 +241,7 @@ class TextHandlers(List[TextHandler]):
      replace_entities: bool = True,
      clean_match: bool = False,
      case_sensitive: bool = True,
- ) -> "TextHandlers[TextHandler]":
+ ) -> "TextHandlers":
      """Call the ``.re()`` method for each element in this list and return
      their results flattened as TextHandlers.
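The annotation change is more than cosmetic: TextHandlers subclasses List[TextHandler] with a fixed element type, so it is not itself generic, and the old string annotation fails whenever something actually evaluates it, for example typing.get_type_hints(), which introspection-driven tooling commonly calls. A minimal reproduction under the class shapes shown above:

    from typing import List, get_type_hints

    class TextHandler(str): ...
    class TextHandlers(List[TextHandler]): ...

    def re_first(pattern: str) -> "TextHandlers[TextHandler]": ...

    try:
        get_type_hints(re_first)  # evaluates the string annotation
    except TypeError as err:
        print(err)  # TextHandlers is not a generic class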
scrapling/core/shell.py CHANGED
@@ -22,10 +22,11 @@ from logging import (
  from orjson import loads as json_loads, JSONDecodeError

  from scrapling import __version__
+ from scrapling.core.utils import log
  from scrapling.parser import Selector, Selectors
  from scrapling.core.custom_types import TextHandler
  from scrapling.engines.toolbelt.custom import Response
- from scrapling.core.utils import log, _ParseHeaders, _CookieParser
+ from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
  from scrapling.core._types import (
      Optional,
      Dict,
@@ -201,7 +202,7 @@ class CurlParser:
          data_payload = parsed_args.data_binary  # Fallback to string

      elif parsed_args.data_raw is not None:
-         data_payload = parsed_args.data_raw
+         data_payload = parsed_args.data_raw.lstrip("$")

      elif parsed_args.data is not None:
          data_payload = parsed_args.data
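The new lstrip("$") addresses a copy-paste artifact: browsers' "Copy as cURL" often emits the body as --data-raw $'...' (shell ANSI-C quoting), and a plain tokenizer keeps the leading $ while stripping the quotes. A small illustration of the failure mode (tokenizing with shlex is an assumption about how CurlParser splits the command):

    from shlex import split as shlex_split

    # "Copy as cURL" output frequently quotes the body as $'...'
    tokens = shlex_split("""curl https://example.com --data-raw $'{"q": "test"}'""")
    data_raw = tokens[-1]        # ${"q": "test"}  <- the $ survives tokenization
    print(data_raw.lstrip("$"))  # {"q": "test"}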
@@ -317,8 +318,8 @@ def show_page_in_browser(page: Selector):  # pragma: no cover

      try:
          fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
-         with open(fd, "wb") as f:
-             f.write(page.body)
+         with open(fd, "w", encoding=page.encoding) as f:
+             f.write(page.html_content)

          open_in_browser(f"file://{fname}")
      except IOError as e:
@@ -545,7 +546,7 @@ class Convertor:
      for page in pages:
          match extraction_type:
              case "markdown":
-                 yield cls._convert_to_markdown(page.body)
+                 yield cls._convert_to_markdown(page.html_content)
              case "html":
                  yield page.body
              case "text":
scrapling/core/storage.py CHANGED
@@ -6,7 +6,6 @@ from sqlite3 import connect as db_connect

  from orjson import dumps, loads
  from lxml.html import HtmlElement
- from tldextract import extract as tld

  from scrapling.core.utils import _StorageTools, log
  from scrapling.core._types import Dict, Optional, Any
@@ -26,6 +25,8 @@ class StorageSystemMixin(ABC):  # pragma: no cover
          return default_value

      try:
+         from tldextract import extract as tld
+
          extracted = tld(self.url)
          return extracted.top_domain_under_public_suffix or extracted.domain or default_value
      except AttributeError:
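Deferring the tldextract import into the method applies the same policy as the new package __init__: importing scrapling no longer pays for tldextract (which initializes a public-suffix extractor) unless a storage backend actually resolves a base domain. A sketch of the shape, with a hypothetical function name and the same attribute access the diff shows:

    def _base_domain(url: str, default_value: str = "default") -> str:
        try:
            # Imported on first call, so module import stays cheap.
            from tldextract import extract as tld

            extracted = tld(url)
            return extracted.top_domain_under_public_suffix or extracted.domain or default_value
        except AttributeError:
            return default_value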
scrapling/core/utils/__init__.py CHANGED
@@ -7,4 +7,3 @@ from ._utils import (
      clean_spaces,
      html_forbidden,
  )
- from ._shell import _CookieParser, _ParseHeaders
@@ -1,2 +0,0 @@
- from ._controllers import DynamicSession, AsyncDynamicSession
- from ._camoufox import StealthySession, AsyncStealthySession
@@ -1,4 +1,4 @@
- from time import time, sleep
+ from time import time
  from asyncio import sleep as asyncio_sleep, Lock

  from camoufox import DefaultAddons
@@ -12,17 +12,13 @@ from camoufox.utils import (
      installed_verstr as camoufox_version,
  )

- from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
- from scrapling.core._types import (
-     Any,
-     Dict,
-     Optional,
- )
  from ._page import PageInfo, PagePool
- from ._config_tools import _compiled_stealth_scripts
- from ._config_tools import _launch_kwargs, _context_kwargs
+ from scrapling.parser import Selector
+ from scrapling.core._types import Dict, Optional
  from scrapling.engines.toolbelt.fingerprints import get_os_name
  from ._validators import validate, PlaywrightConfig, CamoufoxConfig
+ from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
+ from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route

  __ff_version_str__ = camoufox_version().split(".", 1)[0]
@@ -44,23 +40,7 @@ class SyncSession:
      ) -> PageInfo:  # pragma: no cover
          """Get a new page to use"""

-         # Close all finished pages to ensure clean state
-         self.page_pool.close_all_finished_pages()
-
-         # If we're at max capacity after cleanup, wait for busy pages to finish
-         if self.page_pool.pages_count >= self.max_pages:
-             start_time = time()
-             while time() - start_time < self._max_wait_for_page:
-                 # Wait for any pages to finish, then clean them up
-                 sleep(0.05)
-                 self.page_pool.close_all_finished_pages()
-                 if self.page_pool.pages_count < self.max_pages:
-                     break
-             else:
-                 raise TimeoutError(
-                     f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
-                 )
-
+         # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
          page = self.context.new_page()
          page.set_default_navigation_timeout(timeout)
          page.set_default_timeout(timeout)
@@ -76,11 +56,6 @@

          return self.page_pool.add_page(page)

-     @staticmethod
-     def _get_with_precedence(request_value: Any, session_value: Any, sentinel_value: object) -> Any:
-         """Get value with request-level priority over session-level"""
-         return request_value if request_value is not sentinel_value else session_value
-
      def get_pool_stats(self) -> Dict[str, int]:
          """Get statistics about the current page pool"""
          return {
@@ -105,16 +80,11 @@ class AsyncSession(SyncSession):
      ) -> PageInfo:  # pragma: no cover
          """Get a new page to use"""
          async with self._lock:
-             # Close all finished pages to ensure clean state
-             await self.page_pool.aclose_all_finished_pages()
-
              # If we're at max capacity after cleanup, wait for busy pages to finish
              if self.page_pool.pages_count >= self.max_pages:
                  start_time = time()
                  while time() - start_time < self._max_wait_for_page:
-                     # Wait for any pages to finish, then clean them up
                      await asyncio_sleep(0.05)
-                     await self.page_pool.aclose_all_finished_pages()
                      if self.page_pool.pages_count < self.max_pages:
                          break
                  else:
@@ -294,4 +264,9 @@ class StealthySessionMixin:
              if f"cType: '{ctype}'" in page_content:
                  return ctype

+         # Check if turnstile captcha is embedded inside the page (Usually inside a closed Shadow iframe)
+         selector = Selector(content=page_content)
+         if selector.css('script[src*="challenges.cloudflare.com/turnstile/v"]'):
+             return "embedded"
+
          return None
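The new branch covers pages that embed the Turnstile widget directly (often inside a closed shadow root) instead of serving a full interstitial page, by looking for the challenge script tag. A standalone sketch of the same check, using the Selector API exactly as the diff does:

    from scrapling.parser import Selector

    page_content = (
        '<html><body>'
        '<script src="https://challenges.cloudflare.com/turnstile/v0/api.js" async defer></script>'
        '</body></html>'
    )
    selector = Selector(content=page_content)
    if selector.css('script[src*="challenges.cloudflare.com/turnstile/v"]'):
        print("embedded")  # the new challenge type returned by the session mixin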