PlaywrightCapture 1.22.4__tar.gz → 1.22.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: PlaywrightCapture
- Version: 1.22.4
+ Version: 1.22.5
  Summary: A simple library to capture websites using playwright
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
  License: BSD-3-Clause
@@ -28,7 +28,7 @@ Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
  Requires-Dist: pytz (>=2023.3.post1,<2024.0) ; python_version < "3.9"
  Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
- Requires-Dist: setuptools (>=69.0.2,<70.0.0)
+ Requires-Dist: setuptools (>=69.0.3,<70.0.0)
  Requires-Dist: tzdata (>=2023.3,<2024.0)
  Requires-Dist: w3lib (>=2.1.2,<3.0.0)
  Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
@@ -11,9 +11,11 @@ import sys
  import time

  from base64 import b64decode
+ from io import BytesIO
  from tempfile import NamedTemporaryFile
  from typing import Optional, Dict, List, Union, Any, TypedDict, Literal, TYPE_CHECKING, Set, Tuple
  from urllib.parse import urlparse, unquote, urljoin
+ from zipfile import ZipFile

  import dateparser
  import requests
@@ -459,23 +461,29 @@ class Capture():

  to_return: CaptureResponse = {}

- self.wait_for_download = False
+ # We don't need to be super strict on the lock, as it simply triggers a wait for network idle before stopping the capture,
+ # but we still need it to be an integer in case we have more than one download triggered and one finished when the others haven't
+ self.wait_for_download = 0
+
+ # We may have multiple downloads triggered via JS
+ multiple_downloads: List[Tuple[str, bytes]] = []

  async def handle_download(download: Download) -> None:
  # This method is called when a download event is triggered from JS in a page that also renders
  try:
- self.wait_for_download = True
+ self.wait_for_download += 1
  with NamedTemporaryFile() as tmp_f:
  self.logger.info('Got a download triggered from JS.')
  await download.save_as(tmp_f.name)
- to_return["downloaded_filename"] = download.suggested_filename
+ filename = download.suggested_filename
  with open(tmp_f.name, "rb") as f:
- to_return["downloaded_file"] = f.read()
+ file_content = f.read()
+ multiple_downloads.append((filename, file_content))
  self.logger.info('Done with download.')
  except Exception as e:
  self.logger.warning(f'Unable to finish download triggered from JS: {e}')
  finally:
- self.wait_for_download = False
+ self.wait_for_download -= 1

  if page is not None:
  capturing_sub = True
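
The hunk above replaces the boolean wait_for_download flag with an integer counter and appends every finished download to the shared multiple_downloads list, so two overlapping JS-triggered downloads no longer clear each other's "still downloading" state or overwrite each other's result. Below is a minimal, self-contained sketch of that pattern; fake_download, downloads_in_flight and collected are illustrative names and are not part of PlaywrightCapture or Playwright.

import asyncio
from typing import List, Tuple

downloads_in_flight = 0                   # integer counter instead of a True/False flag
collected: List[Tuple[str, bytes]] = []   # every finished download lands here

async def fake_download(name: str, payload: bytes) -> None:
    """Stand-in for the real download handler: count the in-flight download, store the result."""
    global downloads_in_flight
    downloads_in_flight += 1              # one more download in flight
    try:
        await asyncio.sleep(0.1)          # pretend to save the file to disk
        collected.append((name, payload))
    finally:
        downloads_in_flight -= 1          # decrement even on failure, so the counter never sticks

async def main() -> None:
    # Two overlapping downloads: with a boolean flag, the first one to finish
    # would clear the flag while the second is still running.
    await asyncio.gather(fake_download('a.bin', b'AAA'), fake_download('b.bin', b'BBB'))
    assert downloads_in_flight == 0
    print(collected)

asyncio.run(main())
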
@@ -504,13 +512,13 @@
  await page.goto(url, referer=referer if referer else '')
  except Exception:
  pass
- tmp_f = NamedTemporaryFile(delete=False)
- download = await download_info.value
- await download.save_as(tmp_f.name)
- to_return["downloaded_filename"] = download.suggested_filename
- with open(tmp_f.name, "rb") as f:
- to_return["downloaded_file"] = f.read()
- os.unlink(tmp_f.name)
+ with NamedTemporaryFile() as tmp_f:
+ download = await download_info.value
+ await download.save_as(tmp_f.name)
+ filename = download.suggested_filename
+ with open(tmp_f.name, "rb") as f:
+ file_content = f.read()
+ multiple_downloads.append((filename, file_content))
  except PlaywrightTimeoutError:
  self.logger.debug('No download has been triggered.')
  raise initial_error
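
This hunk also swaps the manually managed NamedTemporaryFile(delete=False) plus os.unlink() for a context manager, so the temporary file is removed even if saving or reading raises, and the result is appended to the same multiple_downloads list. A short sketch of that cleanup pattern outside of Playwright; save_download is a hypothetical stand-in for download.save_as(), and re-opening the temporary file by name while it is still open assumes a POSIX platform (as the code above does).

from tempfile import NamedTemporaryFile

def save_download(path: str) -> None:
    # hypothetical stand-in for Playwright's download.save_as()
    with open(path, 'wb') as f:
        f.write(b'downloaded payload')

with NamedTemporaryFile() as tmp_f:
    save_download(tmp_f.name)             # write into the temp file by name
    with open(tmp_f.name, 'rb') as f:
        file_content = f.read()           # read it back as bytes
# the temp file is deleted here automatically; no os.unlink() needed,
# and no leftover file if an exception was raised in between
print(len(file_content))
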
@@ -597,10 +605,24 @@
  if 'html' in to_return and to_return['html'] is not None and with_favicon:
  to_return['potential_favicons'] = self.get_favicons(page.url, to_return['html'])

- if self.wait_for_download:
+ if self.wait_for_download > 0:
  self.logger.info('Waiting for download to finish...')
  await self._safe_wait(page)

+ if multiple_downloads:
+ if len(multiple_downloads) == 1:
+ to_return["downloaded_filename"] = multiple_downloads[0][0]
+ to_return["downloaded_file"] = multiple_downloads[0][1]
+ else:
+ # we have multiple downloads, making it a zip
+ mem_zip = BytesIO()
+ to_return["downloaded_filename"] = 'multiple_downloads.zip'
+ with ZipFile(mem_zip, 'w') as z:
+ for i, f_details in enumerate(multiple_downloads):
+ filename, file_content = f_details
+ z.writestr(f'{i}_{filename}', file_content)
+ to_return["downloaded_file"] = mem_zip.getvalue()
+
  if depth > 0 and to_return.get('html') and to_return['html']:
  if child_urls := self._get_links_from_rendered_page(page.url, to_return['html'], rendered_hostname_only):
  to_return['children'] = []
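
When more than one download was collected, the new code above bundles them into a single in-memory zip via BytesIO and ZipFile (the two new imports), so the caller still receives one downloaded_filename/downloaded_file pair. A runnable sketch of that packaging step, with made-up sample data:

from io import BytesIO
from zipfile import ZipFile

# Sample (filename, bytes) pairs standing in for collected downloads.
multiple_downloads = [('report.pdf', b'%PDF-1.7 ...'), ('data.csv', b'a,b,c\n1,2,3\n')]

mem_zip = BytesIO()
with ZipFile(mem_zip, 'w') as z:
    for i, (filename, file_content) in enumerate(multiple_downloads):
        # the index prefix keeps archive entries unique even if two downloads share a name
        z.writestr(f'{i}_{filename}', file_content)

zipped = mem_zip.getvalue()   # bytes of the whole archive, built without touching disk
print(len(zipped))
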
@@ -630,10 +652,10 @@
  except (TimeoutError, asyncio.exceptions.TimeoutError):
  self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
  except Exception as e:
- self.logger.warning(f'Error while capturing child "{url}": {e}. {total_urls - index - 1} more to go.')
+ self.logger.warning(f'Error while capturing child "{url}": {e}. {len(child_urls) - index - 1} more to go.')
  else:
  runtime = int(time.time() - start_time)
- self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {total_urls - index - 1} to go.')
+ self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {len(child_urls) - index - 1} to go.')
  try:
  await page.go_back()
  except PlaywrightTimeoutError:
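
The last code change only touches the progress messages: the remaining-children count is now derived directly from child_urls, so with enumerate() the number still to capture after the current one is len(child_urls) - index - 1. A tiny illustration with placeholder URLs:

child_urls = ['https://example.com/a', 'https://example.com/b', 'https://example.com/c']
for index, url in enumerate(child_urls):
    remaining = len(child_urls) - index - 1   # children left after this one
    print(f'Captured child URL: {url}. {remaining} to go.')
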
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "PlaywrightCapture"
- version = "1.22.4"
+ version = "1.22.5"
  description = "A simple library to capture websites using playwright"
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
  license = "BSD-3-Clause"
@@ -29,7 +29,7 @@ SpeechRecognition = {version = "^3.10.1", optional = true}
  pytz = {"version" = "^2023.3.post1", python = "<3.9"}
  tzdata = "^2023.3"
  playwright-stealth = "^1.0.6"
- setuptools = "^69.0.2"
+ setuptools = "^69.0.3"

  [tool.poetry.extras]
  recaptcha = ["requests", "pydub", "SpeechRecognition"]
@@ -40,7 +40,7 @@ optional = true
  [tool.poetry.group.dev.dependencies]
  types-beautifulsoup4 = "^4.12.0.7"
  pytest = "^7.4.3"
- mypy = "^1.7.1"
+ mypy = "^1.8.0"
  types-dateparser = "^1.1.4.10"
  types-requests = "^2.31.0.10"
  types-pytz = "^2023.3.1.1"