PlaywrightCapture 1.21.8__tar.gz → 1.21.10__tar.gz

This diff shows the changes between the two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: PlaywrightCapture
- Version: 1.21.8
+ Version: 1.21.10
  Summary: A simple library to capture websites using playwright
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
  License: BSD-3-Clause
@@ -22,9 +22,9 @@ Provides-Extra: recaptcha
  Requires-Dist: SpeechRecognition (>=3.10.0,<4.0.0) ; extra == "recaptcha"
  Requires-Dist: beautifulsoup4[lxml] (>=4.12.2,<5.0.0)
  Requires-Dist: dateparser (>=1.1.8,<2.0.0)
- Requires-Dist: playwright (>=1.37.0,<2.0.0)
+ Requires-Dist: playwright (>=1.38.0,<2.0.0)
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
- Requires-Dist: pytz (>=2023.3,<2024.0) ; python_version < "3.9"
+ Requires-Dist: pytz (>=2023.3.post1,<2024.0) ; python_version < "3.9"
  Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
  Requires-Dist: tzdata (>=2023.3,<2024.0)
  Requires-Dist: w3lib (>=2.1.2,<3.0.0)
@@ -20,7 +20,7 @@ import requests

  from bs4 import BeautifulSoup
  from charset_normalizer import from_bytes
- from playwright.async_api import async_playwright, Frame, Error, Page
+ from playwright.async_api import async_playwright, Frame, Error, Page, Download
  from playwright.async_api import TimeoutError as PlaywrightTimeoutError
  from w3lib.html import strip_html5_whitespace
  from w3lib.url import canonicalize_url, safe_url_string
@@ -429,21 +429,42 @@ class Capture():
                             rendered_hostname_only: bool=True,
                             with_favicon: bool=False
                             ) -> CaptureResponse:
+
          to_return: CaptureResponse = {}
-         try:
-             if page:
-                 capturing_sub = True
-             else:
-                 capturing_sub = False
-                 page = await self.context.new_page()
-                 page.set_default_timeout(self._capture_timeout * 1000)

+         self.wait_for_download = False
+
+         async def handle_download(download: Download) -> None:
+             # This method is called when a download event is triggered from JS in a page that also renders
+             try:
+                 self.wait_for_download = True
+                 self.logger.info('Got a download triggered from JS.')
+                 tmp_f = NamedTemporaryFile(delete=False)
+                 await download.save_as(tmp_f.name)
+                 to_return["downloaded_filename"] = download.suggested_filename
+                 with open(tmp_f.name, "rb") as f:
+                     to_return["downloaded_file"] = f.read()
+                 os.unlink(tmp_f.name)
+                 self.logger.info('Done with download.')
+             except Exception as e:
+                 self.logger.warning(f'Unable to finish download triggered from JS: {e}')
+             finally:
+                 self.wait_for_download = False
+
+         if page is not None:
+             capturing_sub = True
+         else:
+             capturing_sub = False
+             page = await self.context.new_page()
+             page.set_default_timeout(self._capture_timeout * 1000)
+         try:
              # Parse the URL. If there is a fragment, we need to scroll to it manually
              parsed_url = urlparse(url, allow_fragments=True)

              try:
                  # NOTE 2022-12-02: allow 15s less than the general timeout to get a DOM
                  await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
+                 page.on("download", handle_download)
              except Error as initial_error:
                  self._update_exceptions(initial_error)
                  # So this one is really annoying: chromium raises a net::ERR_ABORTED when it hits a download
@@ -547,6 +568,10 @@ class Capture():
              if 'html' in to_return and to_return['html'] is not None and with_favicon:
                  to_return['potential_favicons'] = self.get_favicons(page.url, to_return['html'])

+             if self.wait_for_download:
+                 self.logger.info('Waiting for download to finish...')
+                 await self._safe_wait(page)
+
              if depth > 0 and to_return.get('html') and to_return['html']:
                  if child_urls := self._get_links_from_rendered_page(page.url, to_return['html'], rendered_hostname_only):
                      to_return['children'] = []
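
The two hunks above add JS-triggered download support: a `handle_download` callback is registered on Playwright's `download` page event right after the initial `page.goto()`, it saves the payload with `Download.save_as()` and records `Download.suggested_filename`, and a `wait_for_download` flag lets the capture wait (via `_safe_wait`) until the handler is done before returning. A minimal standalone sketch of the same event/save pattern, outside this library (the URL and the fixed timeout are placeholders, not taken from PlaywrightCapture):

    import asyncio
    from tempfile import NamedTemporaryFile

    from playwright.async_api import Download, async_playwright


    async def main() -> None:
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()

            async def on_download(download: Download) -> None:
                # Save the payload to a temporary file and keep the name the site suggested.
                tmp_f = NamedTemporaryFile(delete=False)
                tmp_f.close()
                await download.save_as(tmp_f.name)
                print(f'Saved "{download.suggested_filename}" to {tmp_f.name}')

            # Register the handler so downloads triggered later by JS are caught.
            page.on("download", on_download)
            await page.goto("https://example.com")  # placeholder URL
            await page.wait_for_timeout(5000)  # crude stand-in for the library's wait_for_download flag
            await browser.close()


    asyncio.run(main())
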
@@ -816,7 +841,8 @@ class Capture():
                  name, _ = name.split(' at ', maxsplit=1)
              elif '; ' in name:
                  name, _ = name.split('; ', maxsplit=1)
-             exception.name = name.strip()
+             # This is kinda dirty.
+             exception._name = name.strip()

      def _exception_is_network_error(self, exception: Error) -> bool:
          if exception.name in [
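
The hunk above swaps `exception.name = name.strip()` for `exception._name = name.strip()` (the new comment admits it is "kinda dirty"). This is consistent with newer Playwright releases exposing `Error.name` as a read-only property, so the cleaned-up name has to be written to the private backing attribute instead. A simplified stand-in class, not Playwright's actual implementation, showing why the direct assignment stops working:

    from typing import Optional


    class FakeError(Exception):
        # Stand-in for playwright.async_api.Error: `name` is a read-only
        # property backed by a private attribute.
        def __init__(self, message: str) -> None:
            super().__init__(message)
            self._name: Optional[str] = None

        @property
        def name(self) -> Optional[str]:
            return self._name


    err = FakeError("net::ERR_ABORTED at https://example.com")

    try:
        err.name = "net::ERR_ABORTED"  # read-only property: raises AttributeError
    except AttributeError as e:
        print(e)

    err._name = "net::ERR_ABORTED"  # writing the backing attribute works, but is brittle
    print(err.name)
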
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "PlaywrightCapture"
- version = "1.21.8"
+ version = "1.21.10"
  description = "A simple library to capture websites using playwright"
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
  license = "BSD-3-Clause"
@@ -21,14 +21,14 @@ include = ['README.md']

  [tool.poetry.dependencies]
  python = "^3.8"
- playwright = "^1.37.0"
+ playwright = "^1.38.0"
  dateparser = "^1.1.8"
  beautifulsoup4 = {version= "^4.12.2", extras = ["lxml"]}
  w3lib = "^2.1.2"
  requests = {extras = ["socks"], version = "^2.31.0"}
  pydub = {version = "^0.25.1", optional = true}
  SpeechRecognition = {version = "^3.10.0", optional = true}
- pytz = {"version" = "^2023.3", python = "<3.9"}
+ pytz = {"version" = "^2023.3.post1", python = "<3.9"}
  tzdata = "^2023.3"

  [tool.poetry.extras]
@@ -39,11 +39,11 @@ optional = true

  [tool.poetry.group.dev.dependencies]
  types-beautifulsoup4 = "^4.12.0.6"
- pytest = "^7.4.0"
+ pytest = "^7.4.2"
  mypy = "^1.5.1"
  types-dateparser = "^1.1.4.10"
- types-requests = "^2.31.0.2"
- types-pytz = "^2023.3.0.1"
+ types-requests = "^2.31.0.6"
+ types-pytz = "^2023.3.1.1"


  [build-system]