PyPI - scrapling - Versions diffs - 0.3.3__tar.gz → 0.3.4__tar.gz - Mend

scrapling 0.3.3.tar.gz → 0.3.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

{scrapling-0.3.3/scrapling.egg-info → scrapling-0.3.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapling
-Version: 0.3.3
+Version: 0.3.4
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -114,14 +114,6 @@ Dynamic: license-file
 </p>
 <p align="center">
-    <a href="https://scrapling.readthedocs.io/en/latest/#installation">
-        Installation
-    </a>
-    ·
-    <a href="https://scrapling.readthedocs.io/en/latest/overview/">
-        Overview
-    </a>
-    ·
     <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
         Selection methods
     </a>
@@ -130,6 +122,14 @@ Dynamic: license-file
         Choosing a fetcher
     </a>
     ·
+    <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
+        CLI
+    </a>
+    ·
+    <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
+        MCP mode
+    </a>
+    ·
     <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
         Migrating from Beautifulsoup
     </a>
@@ -159,6 +159,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
 <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>

{scrapling-0.3.3 → scrapling-0.3.4}/README.md RENAMED Viewed

@@ -24,14 +24,6 @@
 </p>
 <p align="center">
-    <a href="https://scrapling.readthedocs.io/en/latest/#installation">
-        Installation
-    </a>
-    ·
-    <a href="https://scrapling.readthedocs.io/en/latest/overview/">
-        Overview
-    </a>
-    ·
     <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
         Selection methods
     </a>
@@ -40,6 +32,14 @@
         Choosing a fetcher
     </a>
     ·
+    <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
+        CLI
+    </a>
+    ·
+    <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
+        MCP mode
+    </a>
+    ·
     <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
         Migrating from Beautifulsoup
     </a>
@@ -69,6 +69,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
 <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>

{scrapling-0.3.3 → scrapling-0.3.4}/scrapling/__init__.py RENAMED Viewed

@@ -1,5 +1,5 @@
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.3.3"
+__version__ = "0.3.4"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"

{scrapling-0.3.3 → scrapling-0.3.4}/scrapling/core/shell.py RENAMED Viewed

@@ -317,7 +317,7 @@ def show_page_in_browser(page: Selector):  # pragma: no cover
     try:
         fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
-        with open(fd, "w", encoding=page.encoding) as f:
+        with open(fd, "wb") as f:
             f.write(page.body)
         open_in_browser(f"file://{fname}")
@@ -335,15 +335,25 @@ class CustomShell:
         from scrapling.fetchers import (
             Fetcher as __Fetcher,
             AsyncFetcher as __AsyncFetcher,
+            FetcherSession as __FetcherSession,
             DynamicFetcher as __DynamicFetcher,
+            DynamicSession as __DynamicSession,
+            AsyncDynamicSession as __AsyncDynamicSession,
             StealthyFetcher as __StealthyFetcher,
+            StealthySession as __StealthySession,
+            AsyncStealthySession as __AsyncStealthySession,
         )
         self.__InteractiveShellEmbed = __InteractiveShellEmbed
         self.__Fetcher = __Fetcher
         self.__AsyncFetcher = __AsyncFetcher
+        self.__FetcherSession = __FetcherSession
         self.__DynamicFetcher = __DynamicFetcher
+        self.__DynamicSession = __DynamicSession
+        self.__AsyncDynamicSession = __AsyncDynamicSession
         self.__StealthyFetcher = __StealthyFetcher
+        self.__StealthySession = __StealthySession
+        self.__AsyncStealthySession = __AsyncStealthySession
         self.code = code
         self.page = None
         self.pages = Selectors([])
@@ -379,9 +389,9 @@ class CustomShell:
         """Create a custom banner for the shell"""
         return f"""
 -> Available Scrapling objects:
-   - Fetcher/AsyncFetcher
-   - DynamicFetcher
-   - StealthyFetcher
+   - Fetcher/AsyncFetcher/FetcherSession
+   - DynamicFetcher/DynamicSession/AsyncDynamicSession
+   - StealthyFetcher/StealthySession/AsyncStealthySession
    - Selector
 -> Useful shortcuts:
@@ -449,6 +459,11 @@ Type 'exit' or press Ctrl+D to exit.
             "delete": delete,
             "Fetcher": self.__Fetcher,
             "AsyncFetcher": self.__AsyncFetcher,
+            "FetcherSession": self.__FetcherSession,
+            "DynamicSession": self.__DynamicSession,
+            "AsyncDynamicSession": self.__AsyncDynamicSession,
+            "StealthySession": self.__StealthySession,
+            "AsyncStealthySession": self.__AsyncStealthySession,
             "fetch": dynamic_fetch,
             "DynamicFetcher": self.__DynamicFetcher,
             "stealthy_fetch": stealthy_fetch,

{scrapling-0.3.3 → scrapling-0.3.4}/scrapling/engines/_browsers/_base.py RENAMED Viewed

@@ -31,7 +31,7 @@ class SyncSession:
     def __init__(self, max_pages: int = 1):
         self.max_pages = max_pages
         self.page_pool = PagePool(max_pages)
-        self.__max_wait_for_page = 60
+        self._max_wait_for_page = 60
         self.playwright: Optional[Playwright] = None
         self.context: Optional[BrowserContext] = None
         self._closed = False
@@ -50,7 +50,7 @@ class SyncSession:
         # If we're at max capacity after cleanup, wait for busy pages to finish
         if self.page_pool.pages_count >= self.max_pages:
             start_time = time()
-            while time() - start_time < self.__max_wait_for_page:
+            while time() - start_time < self._max_wait_for_page:
                 # Wait for any pages to finish, then clean them up
                 sleep(0.05)
                 self.page_pool.close_all_finished_pages()
@@ -58,7 +58,7 @@ class SyncSession:
                     break
             else:
                 raise TimeoutError(
-                    f"No pages finished to clear place in the pool within the {self.__max_wait_for_page}s timeout period"
+                    f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
                 )
         page = self.context.new_page()
@@ -111,7 +111,7 @@ class AsyncSession(SyncSession):
             # If we're at max capacity after cleanup, wait for busy pages to finish
             if self.page_pool.pages_count >= self.max_pages:
                 start_time = time()
-                while time() - start_time < self.__max_wait_for_page:
+                while time() - start_time < self._max_wait_for_page:
                     # Wait for any pages to finish, then clean them up
                     await asyncio_sleep(0.05)
                     await self.page_pool.aclose_all_finished_pages()
@@ -119,7 +119,7 @@ class AsyncSession(SyncSession):
                         break
                 else:
                     raise TimeoutError(
-                        f"No pages finished to clear place in the pool within the {self.__max_wait_for_page}s timeout period"
+                        f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
                     )
             page = await self.context.new_page()

{scrapling-0.3.3 → scrapling-0.3.4}/scrapling/engines/_browsers/_camoufox.py RENAMED Viewed

@@ -14,6 +14,7 @@ from playwright.async_api import (
     Locator as AsyncLocator,
     Page as async_Page,
 )
+from playwright._impl._errors import Error as PlaywrightError
 from ._validators import validate, CamoufoxConfig
 from ._base import SyncSession, AsyncSession, StealthySessionMixin
@@ -201,20 +202,34 @@ class StealthySession(StealthySessionMixin, SyncSession):
         self._closed = True
+    @staticmethod
+    def _get_page_content(page: Page) -> str | None:
+        """
+        A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return page.content() or ""
+            except PlaywrightError:
+                page.wait_for_timeout(1000)
+                continue
     def _solve_cloudflare(self, page: Page) -> None:  # pragma: no cover
         """Solve the cloudflare challenge displayed on the playwright page passed
         :param page: The targeted page
         :return:
         """
-        challenge_type = self._detect_cloudflare(page.content())
+        challenge_type = self._detect_cloudflare(self._get_page_content(page))
         if not challenge_type:
             log.error("No Cloudflare challenge found.")
             return
         else:
             log.info(f'The turnstile version discovered is "{challenge_type}"')
             if challenge_type == "non-interactive":
-                while "<title>Just a moment...</title>" in (page.content()):
+                while "<title>Just a moment...</title>" in (self._get_page_content(page)):
                     log.info("Waiting for Cloudflare wait page to disappear.")
                     page.wait_for_timeout(1000)
                     page.wait_for_load_state()
@@ -222,7 +237,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
                 return
             else:
-                while "Verifying you are human." in page.content():
+                while "Verifying you are human." in self._get_page_content(page):
                     # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
                     page.wait_for_timeout(500)
@@ -506,20 +521,34 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
         self._closed = True
+    @staticmethod
+    async def _get_page_content(page: async_Page) -> str | None:
+        """
+        A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return (await page.content()) or ""
+            except PlaywrightError:
+                await page.wait_for_timeout(1000)
+                continue
     async def _solve_cloudflare(self, page: async_Page):
         """Solve the cloudflare challenge displayed on the playwright page passed. The async version
         :param page: The async targeted page
         :return:
         """
-        challenge_type = self._detect_cloudflare(await page.content())
+        challenge_type = self._detect_cloudflare(await self._get_page_content(page))
         if not challenge_type:
             log.error("No Cloudflare challenge found.")
             return
         else:
             log.info(f'The turnstile version discovered is "{challenge_type}"')
             if challenge_type == "non-interactive":  # pragma: no cover
-                while "<title>Just a moment...</title>" in (await page.content()):
+                while "<title>Just a moment...</title>" in (await self._get_page_content(page)):
                     log.info("Waiting for Cloudflare wait page to disappear.")
                     await page.wait_for_timeout(1000)
                     await page.wait_for_load_state()
@@ -527,7 +556,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
                 return
             else:
-                while "Verifying you are human." in (await page.content()):
+                while "Verifying you are human." in (await self._get_page_content(page)):
                     # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
                     await page.wait_for_timeout(500)

{scrapling-0.3.3 → scrapling-0.3.4}/scrapling/parser.py RENAMED Viewed

@@ -339,7 +339,10 @@ class Selector(SelectorsGeneration):
     @property
     def html_content(self) -> TextHandler:
         """Return the inner HTML code of the element"""
-        return TextHandler(tostring(self._root, encoding=self.encoding, method="html", with_tail=False))
+        content = tostring(self._root, encoding=self.encoding, method="html", with_tail=False)
+        if isinstance(content, bytes):
+            content = content.decode("utf-8")
+        return TextHandler(content)
     @property
     def body(self):
@@ -348,15 +351,16 @@ class Selector(SelectorsGeneration):
     def prettify(self) -> TextHandler:
         """Return a prettified version of the element's inner html-code"""
-        return TextHandler(
-            tostring(
-                self._root,
-                encoding=self.encoding,
-                pretty_print=True,
-                method="html",
-                with_tail=False,
-            )
+        content = tostring(
+            self._root,
+            encoding=self.encoding,
+            pretty_print=True,
+            method="html",
+            with_tail=False,
         )
+        if isinstance(content, bytes):
+            content = content.decode("utf-8")
+        return TextHandler(content)
     def has_class(self, class_name: str) -> bool:
         """Check if the element has a specific class

{scrapling-0.3.3 → scrapling-0.3.4/scrapling.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapling
-Version: 0.3.3
+Version: 0.3.4
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -114,14 +114,6 @@ Dynamic: license-file
 </p>
 <p align="center">
-    <a href="https://scrapling.readthedocs.io/en/latest/#installation">
-        Installation
-    </a>
-    ·
-    <a href="https://scrapling.readthedocs.io/en/latest/overview/">
-        Overview
-    </a>
-    ·
     <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
         Selection methods
     </a>
@@ -130,6 +122,14 @@ Dynamic: license-file
         Choosing a fetcher
     </a>
     ·
+    <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
+        CLI
+    </a>
+    ·
+    <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
+        MCP mode
+    </a>
+    ·
     <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
         Migrating from Beautifulsoup
     </a>
@@ -159,6 +159,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
 <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>

{scrapling-0.3.3 → scrapling-0.3.4}/setup.cfg RENAMED Viewed

@@ -1,6 +1,6 @@
 [metadata]
 name = scrapling
-version = 0.3.3
+version = 0.3.4
 author = Karim Shoair
 author_email = karim.shoair@pm.me
 description = Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!