PyPI - PlaywrightCapture - Versions diffs - 1.25.3__tar.gz → 1.25.5__tar.gz - Mend

PlaywrightCapture 1.25.3__tar.gz → 1.25.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

{playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: PlaywrightCapture
-Version: 1.25.3
+Version: 1.25.5
 Summary: A simple library to capture websites using playwright
 Home-page: https://github.com/Lookyloo/PlaywrightCapture
 License: BSD-3-Clause

{playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/playwrightcapture/capture.py RENAMED Viewed

@@ -165,10 +165,15 @@ class Capture():
         if proxy:
             if isinstance(proxy, str):
                 self.proxy = {'server': proxy}
-            else:
+            elif isinstance(proxy, dict):
                 self.proxy = {'server': proxy['server'], 'bypass': proxy.get('bypass', ''),
                               'username': proxy.get('username', ''),
                               'password': proxy.get('password', '')}
+            elif isinstance(proxy, int):
+                # This is clearly a mistake, just ignoring it
+                self.logger.warning('Proxy is an integer, this is a mistake, ignoring it.')
+            else:
+                raise InvalidPlaywrightParameter(f'Invalid proxy parameter: "{proxy}" ({type(proxy)})')
         self.should_retry: bool = False
         self.__network_not_idle: int = 2  # makes sure we do not wait for network idle the max amount of time the capture is allowed to take
@@ -682,8 +687,7 @@ class Capture():
             capturing_sub = False
             try:
                 page = await self.context.new_page()
-                await page.clock.install()
-                page.on("dialog", lambda dialog: dialog.accept())
+                # await page.clock.install()
             except Error as e:
                 self.logger.warning(f'The context is in a broken state: {e}')
                 self.should_retry = True
@@ -707,6 +711,7 @@ class Capture():
             page.set_default_timeout((self._capture_timeout - 2) * 1000)
             # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
             page.on("requestfinished", store_request)
+            page.on("dialog", lambda dialog: dialog.accept())
         try:
             # Parse the URL. If there is a fragment, we need to scroll to it manually
@@ -762,31 +767,31 @@ class Capture():
                 await self._wait_for_random_timeout(page, 5)  # Wait 5 sec after document loaded
                 self.logger.debug('Start instrumentation.')
-                # ==== recaptcha
-                # Same technique as: https://github.com/NikolaiT/uncaptcha3
-                if CAN_SOLVE_CAPTCHA:
-                    try:
-                        if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
-                                and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
-                            self.logger.info('Found a captcha')
-                            await self._recaptcha_solver(page)
-                    except PlaywrightTimeoutError as e:
-                        self.logger.info(f'Captcha on {url} is not ready: {e}')
-                    except TargetClosedError as e:
-                        self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
-                    except Error as e:
-                        self.logger.warning(f'Error while resolving captcha on {url}: {e}')
-                    except Exception as e:
-                        self.logger.exception(f'General error with captcha solving on {url}: {e}')
-                # ======
-                # NOTE: testing
-                # await self.__cloudflare_bypass_attempt(page)
-                self.logger.debug('Done with captcha.')
                 # check if we have anything on the page. If we don't, the page is not working properly.
                 if await self._failsafe_get_content(page):
                     self.logger.debug('Got rendered content')
+                    # ==== recaptcha
+                    # Same technique as: https://github.com/NikolaiT/uncaptcha3
+                    if CAN_SOLVE_CAPTCHA:
+                        try:
+                            if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
+                                    and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
+                                self.logger.info('Found a captcha')
+                                await self._recaptcha_solver(page)
+                        except PlaywrightTimeoutError as e:
+                            self.logger.info(f'Captcha on {url} is not ready: {e}')
+                        except TargetClosedError as e:
+                            self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
+                        except Error as e:
+                            self.logger.warning(f'Error while resolving captcha on {url}: {e}')
+                        except Exception as e:
+                            self.logger.exception(f'General error with captcha solving on {url}: {e}')
+                    # ======
+                    # NOTE: testing
+                    # await self.__cloudflare_bypass_attempt(page)
+                    self.logger.debug('Done with captcha.')
                     # move mouse
                     await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
                     self.logger.debug('Moved mouse.')
@@ -812,7 +817,7 @@ class Capture():
                         try:
                             await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
                             await self._wait_for_random_timeout(page, 2)
-                            async with timeout(3):
+                            async with timeout(5):
                                 await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
                             self.logger.debug('Jumped to fragment.')
                         except PlaywrightTimeoutError as e:
@@ -821,20 +826,24 @@ class Capture():
                             self.logger.warning(f'Target closed, unable to go to fragment "{fragment}": {e}')
                         except Error as e:
                             self.logger.exception(f'Unable to go to fragment "{fragment}": {e}')
-                        except TimeoutError:
+                        except (asyncio.TimeoutError, TimeoutError):
                             self.logger.debug('Unable to scroll due to timeout')
+                        except (asyncio.CancelledError):
+                            self.logger.debug('Unable to scroll due to timeout, call canceled')
                     else:
                         # scroll more
                         try:
                             # NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes.
                             #   2024-07-08: Also, it sometimes get stuck.
-                            async with timeout(3):
+                            async with timeout(5):
                                 await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
                             self.logger.debug('Scrolled down.')
                         except Error as e:
                             self.logger.debug(f'Unable to scroll: {e}')
-                        except TimeoutError:
+                        except (TimeoutError, asyncio.TimeoutError):
                             self.logger.debug('Unable to scroll due to timeout')
+                        except (asyncio.CancelledError):
+                            self.logger.debug('Unable to scroll due to timeout, call canceled')
                     await self._wait_for_random_timeout(page, 3)
                     self.logger.debug('Keep going after moving on page.')
@@ -866,8 +875,12 @@ class Capture():
                         to_return["downloaded_file"] = mem_zip.getvalue()
                 # fast forward 30s
-                await page.clock.run_for("30")
-                self.logger.debug('Moved time forward.')
+                # try:
+                #    async with timeout(3):
+                #        await page.clock.run_for("47")
+                #        self.logger.debug('Moved time forward.')
+                # except (TimeoutError, asyncio.TimeoutError):
+                #    self.logger.warning('Unable to move time forward.')
                 self.logger.debug('Done with instrumentation, waiting for network idle.')
                 await self._wait_for_random_timeout(page, 5)  # Wait 5 sec after instrumentation
@@ -915,7 +928,7 @@ class Capture():
                                         rendered_hostname_only=rendered_hostname_only,
                                         max_depth_capture_time=max_capture_time)
                                     to_return['children'].append(child_capture)  # type: ignore[union-attr]
-                            except (TimeoutError, asyncio.exceptions.TimeoutError, asyncio.TimeoutError):
+                            except (TimeoutError, asyncio.TimeoutError):
                                 self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
                                 consecutive_errors += 1
                             except Exception as e:
@@ -943,6 +956,9 @@ class Capture():
         except PlaywrightTimeoutError as e:
             to_return['error'] = f"The capture took too long - {e.message}"
             self.should_retry = True
+        except (asyncio.TimeoutError, TimeoutError):
+            to_return['error'] = "Something in the capture took too long"
+            self.should_retry = True
         except TargetClosedError as e:
             to_return['error'] = f"The target was closed - {e}"
             self.should_retry = True
@@ -1078,8 +1094,9 @@ class Capture():
         tries = 3
         while tries:
             try:
-                return await page.content()
-            except Error:
+                async with timeout(30):
+                    return await page.content()
+            except (Error, TimeoutError):
                 self.logger.debug('Unable to get page content, trying again.')
                 tries -= 1
                 await self._wait_for_random_timeout(page, 1)
@@ -1225,6 +1242,11 @@ class Capture():
             if ': ' in name:
                 _, name = name.split(': ', maxsplit=1)
             exception._name = name.strip()
+        else:
+            # The format changed in Playwright 1.43.0, the name of the method that failed is set before the exception itself.
+            if ': ' in exception.message:
+                _, name = exception.message.split(': ', maxsplit=1)
+            exception._name = name.strip()
     def _exception_is_network_error(self, exception: Error) -> bool:
         if exception.name in [

{playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "PlaywrightCapture"
-version = "1.25.3"
+version = "1.25.5"
 description = "A simple library to capture websites using playwright"
 authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
 license = "BSD-3-Clause"

{playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/LICENSE RENAMED Viewed

File without changes

{playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/README.md RENAMED Viewed

File without changes

{playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/playwrightcapture/__init__.py RENAMED Viewed

File without changes

{playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/playwrightcapture/exceptions.py RENAMED Viewed

File without changes

{playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/playwrightcapture/helpers.py RENAMED Viewed

File without changes

{playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/playwrightcapture/py.typed RENAMED Viewed

File without changes

PlaywrightCapture 1.25.3__tar.gz → 1.25.5__tar.gz

PlaywrightCapture 1.25.3__tar.gz → 1.25.5__tar.gz