PlaywrightCapture 1.28.4.tar.gz → 1.28.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: PlaywrightCapture
-Version: 1.28.4
+Version: 1.28.6
 Summary: A simple library to capture websites using playwright
 License: BSD-3-Clause
 Author: Raphaël Vinot
@@ -159,8 +159,9 @@ class Capture():
         master_logger = logging.getLogger('playwrightcapture')
         master_logger.setLevel(loglevel)
         self.logger: Logger | PlaywrightCaptureLogAdapter
-        if uuid is not None:
-            self.logger = PlaywrightCaptureLogAdapter(master_logger, {'uuid': uuid})
+        self.uuid = uuid
+        if self.uuid is not None:
+            self.logger = PlaywrightCaptureLogAdapter(master_logger, {'uuid': self.uuid})
         else:
             self.logger = master_logger
         self.browser_name: BROWSER = browser if browser else 'chromium'
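
The constructor now keeps the capture UUID on the instance, so it can be reused later in the diff to make the downloads archive name unique. PlaywrightCaptureLogAdapter itself is not shown here; as a rough sketch, a logging.LoggerAdapter keyed on a UUID usually looks like the following (the class name UuidLogAdapter and the message format are assumptions, not the package's actual code):

    import logging

    class UuidLogAdapter(logging.LoggerAdapter):
        # Prefix every message with the capture UUID so log lines from
        # concurrent captures can be told apart in a shared stream.
        def process(self, msg, kwargs):
            return f"[{self.extra['uuid']}] {msg}", kwargs

    logging.basicConfig(level=logging.INFO)
    master_logger = logging.getLogger('playwrightcapture')
    logger = UuidLogAdapter(master_logger, {'uuid': 'deadbeef-0000'})
    logger.warning('capture started')  # -> "[deadbeef-0000] capture started"
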
@@ -934,6 +935,7 @@ class Capture():
     ) -> CaptureResponse:
 
         to_return: CaptureResponse = {}
+        errors: list[str] = []
         got_favicons = False
 
         # We don't need to be super strict on the lock, as it simply triggers a wait for network idle before stoping the capture
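
Most of the remaining hunks replace the old single to_return['error'] slot, which each new failure silently overwrote, with this errors list: handlers append as failures occur, and the strings are joined into 'error' once at the end. (The early-return path in the next hunk still writes to_return['error'] directly, since it returns before the final join is reached.) A minimal sketch of the pattern, with made-up messages:

    errors: list[str] = []
    to_return: dict[str, str] = {}

    # handlers append as failures occur, instead of overwriting one slot
    errors.append('Error while downloading: net::ERR_ABORTED')
    errors.append('The capture took too long - Timeout 90000ms exceeded')

    # joined exactly once, when the capture finishes
    if errors:
        to_return['error'] = '\n'.join(errors)
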
@@ -996,6 +998,7 @@ class Capture():
         except Error as e:
             self.logger.warning(f'Unable to create new page, the context is in a broken state: {e}')
             self.should_retry = True
+            to_return['error'] = f'Unable to create new page: {e}'
             return to_return
 
         if allow_tracking:
@@ -1049,8 +1052,8 @@ class Capture():
                 error_msg = download.failure()
                 if not error_msg:
                     raise e
-                to_return['error'] = f"Error while downloading: {error_msg}"
-                self.logger.info(to_return['error'])
+                errors.append(f"Error while downloading: {error_msg}")
+                self.logger.info(f'Error while downloading: {error_msg}')
                 self.should_retry = True
             except Exception:
                 raise e
@@ -1073,20 +1076,6 @@ class Capture():
             except Exception as e:
                 self.logger.exception(f'Error during instrumentation: {e}')
 
-            if multiple_downloads:
-                if len(multiple_downloads) == 1:
-                    to_return["downloaded_filename"] = multiple_downloads[0][0]
-                    to_return["downloaded_file"] = multiple_downloads[0][1]
-                else:
-                    # we have multiple downloads, making it a zip
-                    mem_zip = BytesIO()
-                    to_return["downloaded_filename"] = 'multiple_downloads.zip'
-                    with ZipFile(mem_zip, 'w') as z:
-                        for i, f_details in enumerate(multiple_downloads):
-                            filename, file_content = f_details
-                            z.writestr(f'{i}_{filename}', file_content)
-                    to_return["downloaded_file"] = mem_zip.getvalue()
-
             if content := await self._failsafe_get_content(page):
                 to_return['html'] = content
 
@@ -1150,7 +1139,7 @@ class Capture():
                     if consecutive_errors >= 5:
                         # if we have more than 5 consecutive errors, the capture is most probably broken, breaking.
                         self.logger.warning('Got more than 5 consecutive errors while capturing children, breaking.')
-                        to_return['error'] = "Got more than 5 consecutive errors while capturing children"
+                        errors.append("Got more than 5 consecutive errors while capturing children")
                         self.should_retry = True
                         break
 
@@ -1162,19 +1151,19 @@ class Capture():
                         self.logger.info(f'Unable to go back: {e}.')
 
         except PlaywrightTimeoutError as e:
-            to_return['error'] = f"The capture took too long - {e.message}"
+            errors.append(f"The capture took too long - {e.message}")
             self.should_retry = True
         except (asyncio.TimeoutError, TimeoutError):
-            to_return['error'] = "Something in the capture took too long"
+            errors.append("Something in the capture took too long")
             self.should_retry = True
         except TargetClosedError as e:
-            to_return['error'] = f"The target was closed - {e}"
+            errors.append(f"The target was closed - {e}")
             self.should_retry = True
         except Error as e:
-            # NOTE: there are a lot of errors that look like duplicates and they are trggered at different times in the process.
-            # it is tricky to figure our which one whouls (and should not) trigger a retry. Below is our best guess and it will change over time.
+            # NOTE: there are a lot of errors that look like duplicates and they are triggered at different times in the process.
+            # it is tricky to figure our which one should (and should not) trigger a retry. Below is our best guess and it will change over time.
             self._update_exceptions(e)
-            to_return['error'] = e.message
+            errors.append(e.message)
             to_return['error_name'] = e.name
             # TODO: check e.message and figure out if it is worth retrying or not.
             # NOTE: e.name is generally (always?) "Error"
@@ -1183,6 +1172,7 @@ class Capture():
             elif self._retry_network_error(e) or self._retry_browser_error(e):
                 # this one sounds like something we can retry...
                 self.logger.info(f'Issue with {url} (retrying): {e.message}')
+                errors.append(f'Issue with {url}: {e.message}')
                 self.should_retry = True
             else:
                 # Unexpected ones
@@ -1190,25 +1180,56 @@ class Capture():
         except Exception as e:
             # we may get a non-playwright exception to.
             # The ones we try to handle here should be treated as if they were.
-            to_return['error'] = str(e)
-            if to_return['error'] in ['Connection closed while reading from the driver']:
+            errors.append(str(e))
+            if str(e) in ['Connection closed while reading from the driver']:
                 self.logger.info(f'Issue with {url} (retrying): {e}')
+                errors.append(f'Issue with {url}: {e}')
                 self.should_retry = True
             else:
                 raise e
         finally:
             self.logger.debug('Finishing up capture.')
             if not capturing_sub:
+                if multiple_downloads:
+                    if len(multiple_downloads) == 1:
+                        to_return["downloaded_filename"] = multiple_downloads[0][0]
+                        to_return["downloaded_file"] = multiple_downloads[0][1]
+                    else:
+                        # we have multiple downloads, making it a zip, make sure the filename is unique
+                        mem_zip = BytesIO()
+                        to_return["downloaded_filename"] = f'{self.uuid}_multiple_downloads.zip'
+                        with ZipFile(mem_zip, 'w') as z:
+                            for i, f_details in enumerate(multiple_downloads):
+                                filename, file_content = f_details
+                                z.writestr(f'{i}_{filename}', file_content)
+                        to_return["downloaded_file"] = mem_zip.getvalue()
+
                 try:
-                    to_return['storage'] = await self._failsafe_get_storage()
-                    to_return['cookies'] = await self._failsafe_get_cookies()
-                    self.logger.debug('Done with cookies and storage.')
-                except Exception as e:
-                    if 'error' not in to_return:
-                        to_return['error'] = f'Unable to get the storage: {e}'
+                    async with timeout(15):
+                        to_return['cookies'] = await self.context.cookies()
+                except (TimeoutError, asyncio.TimeoutError):
+                    self.logger.warning("Unable to get cookies (timeout).")
+                    errors.append("Unable to get the cookies (timeout).")
+                    self.should_retry = True
+                except Error as e:
+                    self.logger.warning(f"Unable to get cookies: {e}")
+                    errors.append(f'Unable to get the cookies: {e}')
+                    self.should_retry = True
+
+                try:
+                    async with timeout(15):
+                        to_return['storage'] = await self.context.storage_state(indexed_db=True)
+                except (TimeoutError, asyncio.TimeoutError):
+                    self.logger.warning("Unable to get storage (timeout).")
+                    errors.append("Unable to get the storage (timeout).")
+                    self.should_retry = True
+                except Error as e:
+                    self.logger.warning(f"Unable to get the storage: {e}")
+                    errors.append(f'Unable to get the storage: {e}')
+                    self.should_retry = True
             # frames_tree = self.make_frame_tree(page.main_frame)
             try:
-                async with timeout(60):
+                async with timeout(30):
                     page.remove_listener("requestfinished", store_request)
                     await page.close(reason="Closing the page because the capture finished.")
                     self.logger.debug('Page closed.')
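
The download-packaging block removed earlier in the diff reappears here inside finally, so downloads are bundled even when the capture body raised, and the archive name now carries the capture UUID to keep it unique. A minimal, self-contained sketch of the in-memory zip technique (the sample downloads are made up):

    from io import BytesIO
    from zipfile import ZipFile

    downloads: list[tuple[str, bytes]] = [
        ('invoice.pdf', b'%PDF-1.7 ...'),
        ('invoice.pdf', b'same name, different content'),
    ]

    mem_zip = BytesIO()
    with ZipFile(mem_zip, 'w') as z:
        for i, (filename, file_content) in enumerate(downloads):
            # the index prefix keeps entries unique even when filenames collide
            z.writestr(f'{i}_{filename}', file_content)

    zipped: bytes = mem_zip.getvalue()  # what ends up in to_return["downloaded_file"]
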
@@ -1219,30 +1240,16 @@ class Capture():
                     self.logger.debug('Got HAR.')
             except (TimeoutError, asyncio.TimeoutError):
                 self.logger.warning("Unable to close page and context at the end of the capture.")
+                errors.append("Unable to close page and context at the end of the capture.")
                 self.should_retry = True
             except Exception as e:
                 self.logger.warning(f"Other exception while finishing up the capture: {e}.")
-                if 'error' not in to_return:
-                    to_return['error'] = f'Unable to generate HAR file: {e}'
+                errors.append(f'Unable to generate HAR file: {e}')
             self.logger.debug('Capture done')
+        if errors:
+            to_return['error'] = '\n'.join(errors)
         return to_return
 
-    async def _failsafe_get_cookies(self) -> list[Cookie] | None:
-        try:
-            async with timeout(15):
-                return await self.context.cookies()
-        except (TimeoutError, asyncio.TimeoutError):
-            self.logger.warning("Unable to get cookies (timeout).")
-            return None
-
-    async def _failsafe_get_storage(self) -> StorageState | None:
-        try:
-            async with timeout(15):
-                return await self.context.storage_state(indexed_db=True)
-        except (TimeoutError, asyncio.TimeoutError):
-            self.logger.warning("Unable to get storage (timeout).")
-            return None
-
     async def _failsafe_get_screenshot(self, page: Page) -> bytes:
         self.logger.debug("Capturing a screenshot of the full page.")
         try:
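
The two _failsafe_get_* helpers are gone: their bodies now live inline in the finally block above, where a timeout or a Playwright Error is reported through errors and flags a retry, instead of being silently swallowed as None. The timeout() guard is the standard asyncio pattern; a minimal sketch, assuming Python 3.11+ where asyncio.timeout() is an async context manager (the coroutine name and cookie value are illustrative):

    import asyncio

    async def get_cookies_with_deadline() -> list[dict] | None:
        try:
            async with asyncio.timeout(15):
                await asyncio.sleep(30)  # stand-in for `await context.cookies()`
                return [{'name': 'session', 'value': 'abc'}]
        except TimeoutError:
            # on 3.11+, asyncio.TimeoutError is an alias of TimeoutError
            return None

    print(asyncio.run(get_cookies_with_deadline()))  # -> None after ~15s
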
@@ -1293,7 +1300,7 @@ class Capture():
         tries = 3
         while tries:
             try:
-                async with timeout(30):
+                async with timeout(15):
                     return await page.content()
             except (Error, TimeoutError, asyncio.TimeoutError):
                 self.logger.debug('Unable to get page content, trying again.')
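
Together with the 60s → 30s change above, this halves the per-attempt budget in _failsafe_get_content. Its bounded retry loop looks roughly like this; the decrement and final return fall outside the hunk, so they are assumptions, and the sketch catches only the timeout errors (the original also catches Playwright's Error):

    import asyncio

    async def failsafe_get_content(page) -> str | None:
        tries = 3
        while tries:
            try:
                async with asyncio.timeout(15):  # per-attempt deadline
                    return await page.content()
            except (TimeoutError, asyncio.TimeoutError):
                pass  # fall through and try again
            tries -= 1
        return None  # all three attempts timed out
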
@@ -1,6 +1,6 @@
 [project]
 name = "PlaywrightCapture"
-version = "1.28.4"
+version = "1.28.6"
 description = "A simple library to capture websites using playwright"
 authors = [
     {name="Raphaël Vinot", email= "raphael.vinot@circl.lu"}
@@ -49,7 +49,7 @@ recaptcha = [
 types-beautifulsoup4 = "^4.12.0.20250204"
 pytest = "^8.3.5"
 mypy = "^1.15.0"
-types-dateparser = "^1.2.0.20250208"
+types-dateparser = "^1.2.0.20250408"
 types-pytz = "^2025.2.0.20250326"
 
 