PlaywrightCapture 1.28.5.tar.gz → 1.29.0.tar.gz

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: PlaywrightCapture
- Version: 1.28.5
+ Version: 1.29.0
  Summary: A simple library to capture websites using playwright
  License: BSD-3-Clause
  Author: Raphaël Vinot
@@ -20,15 +20,17 @@ Classifier: Topic :: Security
  Provides-Extra: recaptcha
  Requires-Dist: SpeechRecognition (>=3.14.2) ; extra == "recaptcha"
  Requires-Dist: aiohttp-socks (>=0.10.1)
- Requires-Dist: aiohttp[speedups] (>=3.11.16)
+ Requires-Dist: aiohttp[speedups] (>=3.11.18)
  Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
- Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.3)
+ Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)
  Requires-Dist: dateparser (>=1.2.1)
+ Requires-Dist: dnspython (>=2.7.0,<3.0.0)
  Requires-Dist: playwright (>=1.51.0)
  Requires-Dist: playwright-stealth (>=1.0.6)
  Requires-Dist: puremagic (>=1.28)
  Requires-Dist: pydub (>=0.25.1) ; extra == "recaptcha"
- Requires-Dist: setuptools (>=78.1.0)
+ Requires-Dist: python-socks (>=2.7.1,<3.0.0)
+ Requires-Dist: setuptools (>=79.0.0)
  Requires-Dist: typing-extensions (>=4.12.2,<5.0.0) ; python_version < "3.12"
  Requires-Dist: tzdata (>=2025.2)
  Requires-Dist: w3lib (>=2.3.1)
@@ -37,6 +37,7 @@ from w3lib.html import strip_html5_whitespace
  from w3lib.url import canonicalize_url, safe_url_string

  from .exceptions import UnknownPlaywrightBrowser, UnknownPlaywrightDevice, InvalidPlaywrightParameter
+ from .socks5dnslookup import Socks5Resolver

  from zoneinfo import available_timezones
  all_timezones_set = available_timezones()
@@ -144,6 +145,7 @@ class Capture():

      def __init__(self, browser: BROWSER | None=None, device_name: str | None=None,
                         proxy: str | dict[str, str] | None=None,
+                        socks5_dns_resolver: str | list[str] | None=None,
                         general_timeout_in_sec: int | None=None, loglevel: str | int='INFO',
                         uuid: str | None=None, headless: bool=True):
          """Captures a page with Playwright.
@@ -151,6 +153,7 @@ class Capture():
          :param browser: The browser to use for the capture.
          :param device_name: The pre-defined device to use for the capture (from playwright).)
          :param proxy: The external proxy to use for the capture.
+         :param socks5_dns_resolver: DNS resolver to use for the socks5 proxy and fill the HAR file.
          :param general_timeout_in_sec: The general timeout for the capture, including children.
          :param loglevel: Python loglevel
          :param uuid: The UUID of the capture.
@@ -177,6 +180,7 @@ class Capture():
          self.device_name: str | None = device_name
          self.headless: bool = headless
          self.proxy: ProxySettings = {}
+         self.socks5_dns_resolver: str | list[str] | None = socks5_dns_resolver
          if proxy:
              if isinstance(proxy, str):
                  self.proxy = self.__prepare_proxy_playwright(proxy)
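For orientation, this is how the new parameter fits the constructor: a minimal, hypothetical usage sketch. The proxy URL and resolver address are placeholders, and using Capture as an async context manager with capture_page() as the entry point is assumed from the library's public API rather than shown in this diff:

    import asyncio

    from playwrightcapture import Capture


    async def main() -> None:
        # Hypothetical values: a local socks5 proxy (e.g. Tor) and Quad9 as the
        # resolver used to fill in serverIPAddress entries in the HAR file.
        async with Capture(browser='chromium',
                           proxy='socks5://127.0.0.1:9050',
                           socks5_dns_resolver=['9.9.9.9']) as capture:
            # capture_page() and its exact signature are assumed here;
            # optional keyword arguments (timeouts, referer, ...) are omitted.
            result = await capture.capture_page('https://example.com')
        if result.get('error'):
            print(result['error'])


    asyncio.run(main())
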
@@ -935,6 +939,7 @@ class Capture():
                              ) -> CaptureResponse:

          to_return: CaptureResponse = {}
+         errors: list[str] = []
          got_favicons = False

          # We don't need to be super strict on the lock, as it simply triggers a wait for network idle before stoping the capture
@@ -997,6 +1002,7 @@ class Capture():
          except Error as e:
              self.logger.warning(f'Unable to create new page, the context is in a broken state: {e}')
              self.should_retry = True
+             to_return['error'] = f'Unable to create new page: {e}'
              return to_return

          if allow_tracking:
@@ -1050,8 +1056,8 @@ class Capture():
                      error_msg = download.failure()
                      if not error_msg:
                          raise e
-                     to_return['error'] = f"Error while downloading: {error_msg}"
-                     self.logger.info(to_return['error'])
+                     errors.append(f"Error while downloading: {error_msg}")
+                     self.logger.info(f'Error while downloading: {error_msg}')
                      self.should_retry = True
                  except Exception:
                      raise e
@@ -1137,7 +1143,7 @@ class Capture():
                  if consecutive_errors >= 5:
                      # if we have more than 5 consecutive errors, the capture is most probably broken, breaking.
                      self.logger.warning('Got more than 5 consecutive errors while capturing children, breaking.')
-                     to_return['error'] = "Got more than 5 consecutive errors while capturing children"
+                     errors.append("Got more than 5 consecutive errors while capturing children")
                      self.should_retry = True
                      break

@@ -1149,19 +1155,19 @@ class Capture():
                      self.logger.info(f'Unable to go back: {e}.')

          except PlaywrightTimeoutError as e:
-             to_return['error'] = f"The capture took too long - {e.message}"
+             errors.append(f"The capture took too long - {e.message}")
              self.should_retry = True
          except (asyncio.TimeoutError, TimeoutError):
-             to_return['error'] = "Something in the capture took too long"
+             errors.append("Something in the capture took too long")
              self.should_retry = True
          except TargetClosedError as e:
-             to_return['error'] = f"The target was closed - {e}"
+             errors.append(f"The target was closed - {e}")
              self.should_retry = True
          except Error as e:
-             # NOTE: there are a lot of errors that look like duplicates and they are trggered at different times in the process.
-             # it is tricky to figure our which one whouls (and should not) trigger a retry. Below is our best guess and it will change over time.
+             # NOTE: there are a lot of errors that look like duplicates and they are triggered at different times in the process.
+             # it is tricky to figure our which one should (and should not) trigger a retry. Below is our best guess and it will change over time.
              self._update_exceptions(e)
-             to_return['error'] = e.message
+             errors.append(e.message)
              to_return['error_name'] = e.name
              # TODO: check e.message and figure out if it is worth retrying or not.
              # NOTE: e.name is generally (always?) "Error"
@@ -1170,6 +1176,7 @@ class Capture():
              elif self._retry_network_error(e) or self._retry_browser_error(e):
                  # this one sounds like something we can retry...
                  self.logger.info(f'Issue with {url} (retrying): {e.message}')
+                 errors.append(f'Issue with {url}: {e.message}')
                  self.should_retry = True
              else:
                  # Unexpected ones
@@ -1177,9 +1184,10 @@ class Capture():
          except Exception as e:
              # we may get a non-playwright exception to.
              # The ones we try to handle here should be treated as if they were.
-             to_return['error'] = str(e)
-             if to_return['error'] in ['Connection closed while reading from the driver']:
+             errors.append(str(e))
+             if str(e) in ['Connection closed while reading from the driver']:
                  self.logger.info(f'Issue with {url} (retrying): {e}')
+                 errors.append(f'Issue with {url}: {e}')
                  self.should_retry = True
              else:
                  raise e
@@ -1201,15 +1209,31 @@ class Capture():
                  to_return["downloaded_file"] = mem_zip.getvalue()

          try:
-             to_return['storage'] = await self._failsafe_get_storage()
-             to_return['cookies'] = await self._failsafe_get_cookies()
-             self.logger.debug('Done with cookies and storage.')
-         except Exception as e:
-             if 'error' not in to_return:
-                 to_return['error'] = f'Unable to get the storage: {e}'
+             async with timeout(15):
+                 to_return['cookies'] = await self.context.cookies()
+         except (TimeoutError, asyncio.TimeoutError):
+             self.logger.warning("Unable to get cookies (timeout).")
+             errors.append("Unable to get the cookies (timeout).")
+             self.should_retry = True
+         except Error as e:
+             self.logger.warning(f"Unable to get cookies: {e}")
+             errors.append(f'Unable to get the cookies: {e}')
+             self.should_retry = True
+
+         try:
+             async with timeout(15):
+                 to_return['storage'] = await self.context.storage_state(indexed_db=True)
+         except (TimeoutError, asyncio.TimeoutError):
+             self.logger.warning("Unable to get storage (timeout).")
+             errors.append("Unable to get the storage (timeout).")
+             self.should_retry = True
+         except Error as e:
+             self.logger.warning(f"Unable to get the storage: {e}")
+             errors.append(f'Unable to get the storage: {e}')
+             self.should_retry = True
          # frames_tree = self.make_frame_tree(page.main_frame)
          try:
-             async with timeout(60):
+             async with timeout(30):
                  page.remove_listener("requestfinished", store_request)
                  await page.close(reason="Closing the page because the capture finished.")
                  self.logger.debug('Page closed.')
@@ -1218,34 +1242,24 @@ class Capture():
                  with open(self._temp_harfile.name) as _har:
                      to_return['har'] = json.load(_har)
                  self.logger.debug('Got HAR.')
+                 if (to_return.get('har') and self.proxy and self.proxy.get('server')
+                         and self.proxy['server'].startswith('socks5')):
+                     # Only if the capture was not done via a socks5 proxy
+                     if har := to_return['har']:  # Could be None
+                         async with timeout(30):
+                             await self.socks5_resolver(har)
          except (TimeoutError, asyncio.TimeoutError):
              self.logger.warning("Unable to close page and context at the end of the capture.")
+             errors.append("Unable to close page and context at the end of the capture.")
              self.should_retry = True
          except Exception as e:
              self.logger.warning(f"Other exception while finishing up the capture: {e}.")
-             if 'error' not in to_return:
-                 to_return['error'] = f'Unable to generate HAR file: {e}'
+             errors.append(f'Unable to generate HAR file: {e}')
          self.logger.debug('Capture done')
+         if errors:
+             to_return['error'] = '\n'.join(errors)
          return to_return

-     async def _failsafe_get_cookies(self) -> list[Cookie] | None:
-         try:
-             async with timeout(15):
-                 return await self.context.cookies()
-         except (TimeoutError, asyncio.TimeoutError):
-             self.logger.warning("Unable to get cookies (timeout).")
-             return None
-
-     async def _failsafe_get_storage(self) -> StorageState | None:
-         try:
-             async with timeout(15):
-                 return await self.context.storage_state(indexed_db=True)
-         except (TimeoutError, asyncio.TimeoutError):
-             self.logger.warning("Unable to get storage (timeout).")
-         except Error as e:
-             self.logger.warning(f"Unable to get storage: {e}")
-         return None
-
      async def _failsafe_get_screenshot(self, page: Page) -> bytes:
          self.logger.debug("Capturing a screenshot of the full page.")
          try:
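A note on the recurring change in the hunks above: instead of overwriting the single to_return['error'] string (where each new failure silently replaced the previous one), 1.29.0 appends every failure to an errors list and joins it once just before returning. A stripped-down sketch of that pattern, with hypothetical step functions standing in for the capture stages:

    def step_one() -> None:
        raise RuntimeError('first failure')


    def step_two() -> None:
        raise RuntimeError('second failure')


    def run_capture() -> dict[str, str]:
        to_return: dict[str, str] = {}
        errors: list[str] = []
        for step in (step_one, step_two):
            try:
                step()
            except Exception as e:
                # Keep every failure, not just the last one.
                errors.append(f'{step.__name__}: {e}')
        if errors:
            # Same join as at the end of the capture method above.
            to_return['error'] = '\n'.join(errors)
        return to_return


    print(run_capture()['error'])  # both failures are reported
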
@@ -1296,7 +1310,7 @@ class Capture():
          tries = 3
          while tries:
              try:
-                 async with timeout(30):
+                 async with timeout(15):
                      return await page.content()
              except (Error, TimeoutError, asyncio.TimeoutError):
                  self.logger.debug('Unable to get page content, trying again.')
@@ -1709,3 +1723,31 @@ class Capture():
          return to_return

      # END FAVICON EXTRACTOR
+
+     # ##### Run DNS resolution over socks5 proxy #####
+     # This is only use when the capture is done over a socks5 proxy, and not on a .onion
+     # We get the HAR file, iterate over the entries an update the IPs
+
+     async def socks5_resolver(self, harfile: dict[str, Any]) -> None:
+         resolver = Socks5Resolver(logger=self.logger, socks5_proxy=self.proxy['server'],
+                                   dns_resolver=self.socks5_dns_resolver)
+         # get all the hostnames from the HAR file
+         hostnames = set()
+         for entry in harfile['log']['entries']:
+             if entry['request']['url']:
+                 parsed = urlparse(entry['request']['url'])
+                 if parsed.netloc and not parsed.netloc.endswith('onion'):
+                     hostnames.add(parsed.netloc)
+         # use the same technique as in lookyloo to resolve many domains in parallel
+         semaphore = asyncio.Semaphore(20)
+         all_requests = [resolver.resolve(hostname, semaphore) for hostname in hostnames]
+         await asyncio.gather(*all_requests)
+         self.logger.debug('Resolved all domains through the proxy.')
+         for entry in harfile['log']['entries']:
+             if entry['request']['url']:
+                 parsed = urlparse(entry['request']['url'])
+                 if parsed.netloc and not parsed.netloc.endswith('onion'):
+                     answer = resolver.get_cache(parsed.netloc)
+                     if answer:
+                         entry['serverIPAddress'] = {str(b) for b in answer}.pop()
+         self.logger.debug('Done updating HAR file')
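To make the HAR rewrite concrete: socks5_resolver only reads request.url and writes serverIPAddress on each entry, and skips .onion hosts. A hand-built, hypothetical HAR fragment showing the shape it expects:

    # Hypothetical minimal HAR fragment; only the fields socks5_resolver touches.
    har = {
        'log': {
            'entries': [
                {'request': {'url': 'https://example.com/index.html'}},
                {'request': {'url': 'http://something.onion/'}},  # left untouched
            ]
        }
    }
    # After `await self.socks5_resolver(har)`, the first entry would carry the
    # address resolved through the proxy, e.g. (illustrative value only):
    # har['log']['entries'][0]['serverIPAddress'] == '93.184.216.34'
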
@@ -0,0 +1,111 @@
+ #!/usr/bin/env python3
+
+ from __future__ import annotations
+
+ import asyncio
+ import socket
+
+ from typing import TYPE_CHECKING
+
+ import dns
+
+ from dns.asyncresolver import Resolver
+ from dns.resolver import Cache
+ from dns._asyncio_backend import _maybe_wait_for, StreamSocket, Backend
+ from python_socks.async_.asyncio import Proxy
+
+
+ if TYPE_CHECKING:
+     from .capture import PlaywrightCaptureLogAdapter
+     from logging import Logger
+
+
+ class Socks5Backend(Backend):
+
+     def __init__(self, socks5_proxy_url: str):
+         super().__init__()
+         self.proxy = Proxy.from_url(socks5_proxy_url)
+
+     def name(self) -> str:
+         return "asyncio socks5"
+
+     async def make_socket(  # type: ignore[no-untyped-def]
+         self,
+         af,
+         socktype,
+         proto=0,
+         source=None,
+         destination=None,
+         timeout=None,
+         ssl_context=None,
+         server_hostname=None,
+     ):
+         if socktype == socket.SOCK_STREAM:
+             if destination is None:
+                 # This shouldn't happen, but we check to make code analysis software
+                 # happier.
+                 raise ValueError("destination required for stream sockets")
+             sock = await self.proxy.connect(dest_host=destination[0], dest_port=destination[1])
+             (r, w) = await _maybe_wait_for(  # type: ignore[no-untyped-call]
+                 asyncio.open_connection(
+                     None,
+                     None,
+                     sock=sock,
+                     ssl=ssl_context,
+                     family=af,
+                     proto=proto,
+                     local_addr=source,
+                     server_hostname=server_hostname,
+                 ),
+                 timeout,
+             )
+             return StreamSocket(af, r, w)  # type: ignore[no-untyped-call]
+         raise NotImplementedError(
+             "unsupported socket " + f"type {socktype}"
+         )  # pragma: no cover
+
+
+ class Socks5Resolver:
+
+     def __init__(self, logger: Logger | PlaywrightCaptureLogAdapter, socks5_proxy: str, dns_resolver: str | list[str] | None=None):
+         self.logger = logger
+         # configure set to false means we don't want to load resolv.conf
+         self.resolver = Resolver(configure=False)
+         self.resolver.cache = Cache(900)
+         self.resolver.timeout = 2
+         self.resolver.lifetime = 4
+
+         if not dns_resolver:
+             # Fallback to 1.1.1.1
+             dns_resolver = ['1.1.1.1']
+         elif isinstance(dns_resolver, str):
+             dns_resolver = [dns_resolver]
+         self.resolver.nameservers = dns_resolver
+
+         self.backend = Socks5Backend(socks5_proxy_url=socks5_proxy)
+
+     def get_cache(self, domain: str, rdatatype: dns.rdatatype.RdataType=dns.rdatatype.A) -> dns.resolver.Answer | None:
+         # Get domain from cache
+         return self.resolver.cache.get((dns.name.from_text(domain), rdatatype, dns.rdataclass.IN))
+
+     async def resolve(self, domain: str, semaphore: asyncio.Semaphore, rdatatype: dns.rdatatype.RdataType=dns.rdatatype.A) -> dns.resolver.Answer | None:
+         # Resolve the A record only for the domain, might want to do AAAA instead.
+         async with semaphore:
+             max_retries = 3
+             while max_retries > 0:
+                 try:
+                     response = await self.resolver.resolve(domain, rdatatype,
+                                                            tcp=True, backend=self.backend)
+                     return response
+                 except dns.resolver.LifetimeTimeout:
+                     # Retry a few times on timeout, it happens.
+                     max_retries -= 1
+                     if max_retries > 0:
+                         self.logger.debug(f"[Socks5] Timeout resolving {domain}, retrying.")
+                         await asyncio.sleep(1)
+                     else:
+                         self.logger.info(f"[Socks5] Timeout resolving {domain}.")
+                 except Exception as e:
+                     self.logger.info(f"[Socks5] Error resolving {domain}: {e}")
+                     break
+             return None
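The new module (imported in the capture code as .socks5dnslookup) is self-contained enough to exercise on its own. A minimal sketch, assuming the module path playwrightcapture.socks5dnslookup and a socks5 proxy listening on a placeholder local address:

    import asyncio
    import logging

    from playwrightcapture.socks5dnslookup import Socks5Resolver  # module path assumed


    async def main() -> None:
        resolver = Socks5Resolver(logger=logging.getLogger('socks5-demo'),
                                  socks5_proxy='socks5://127.0.0.1:9050',  # placeholder
                                  dns_resolver='9.9.9.9')
        semaphore = asyncio.Semaphore(20)  # same concurrency cap capture.py uses
        answer = await resolver.resolve('example.com', semaphore)
        if answer:
            # A dns.resolver.Answer iterates over its rdata records.
            print([str(rdata) for rdata in answer])
        # A second lookup within 900 seconds is served from the resolver cache:
        print(resolver.get_cache('example.com'))


    asyncio.run(main())
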
@@ -1,6 +1,6 @@
  [project]
  name = "PlaywrightCapture"
- version = "1.28.5"
+ version = "1.29.0"
  description = "A simple library to capture websites using playwright"
  authors = [
      {name="Raphaël Vinot", email= "raphael.vinot@circl.lu"}
@@ -14,16 +14,18 @@ dynamic = [ "classifiers" ]
  dependencies = [
      "playwright (>=1.51.0)",
      "dateparser (>=1.2.1)",
-     "beautifulsoup4[charset-normalizer,lxml] (>=4.13.3)",
+     "beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)",
      "w3lib (>=2.3.1)",
      "tzdata (>=2025.2)",
      "playwright-stealth (>=1.0.6)",
-     "setuptools (>=78.1.0)",
+     "setuptools (>=79.0.0)",
      "puremagic (>=1.28)",
      "async-timeout (>=5.0.1) ; python_version < \"3.11\"",
-     "aiohttp[speedups] (>=3.11.16)",
+     "aiohttp[speedups] (>=3.11.18)",
      "aiohttp-socks (>=0.10.1)",
-     "typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\""
+     "typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\"",
+     "dnspython (>=2.7.0,<3.0.0)",
+     "python-socks (>=2.7.1,<3.0.0)"
  ]

  [project.urls]
@@ -49,7 +51,7 @@ recaptcha = [
  types-beautifulsoup4 = "^4.12.0.20250204"
  pytest = "^8.3.5"
  mypy = "^1.15.0"
- types-dateparser = "^1.2.0.20250208"
+ types-dateparser = "^1.2.0.20250408"
  types-pytz = "^2025.2.0.20250326"
