PlaywrightCapture 1.28.6__tar.gz → 1.29.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: PlaywrightCapture
3
- Version: 1.28.6
3
+ Version: 1.29.0
4
4
  Summary: A simple library to capture websites using playwright
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
@@ -20,15 +20,17 @@ Classifier: Topic :: Security
20
20
  Provides-Extra: recaptcha
21
21
  Requires-Dist: SpeechRecognition (>=3.14.2) ; extra == "recaptcha"
22
22
  Requires-Dist: aiohttp-socks (>=0.10.1)
23
- Requires-Dist: aiohttp[speedups] (>=3.11.16)
23
+ Requires-Dist: aiohttp[speedups] (>=3.11.18)
24
24
  Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
25
- Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.3)
25
+ Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)
26
26
  Requires-Dist: dateparser (>=1.2.1)
27
+ Requires-Dist: dnspython (>=2.7.0,<3.0.0)
27
28
  Requires-Dist: playwright (>=1.51.0)
28
29
  Requires-Dist: playwright-stealth (>=1.0.6)
29
30
  Requires-Dist: puremagic (>=1.28)
30
31
  Requires-Dist: pydub (>=0.25.1) ; extra == "recaptcha"
31
- Requires-Dist: setuptools (>=78.1.0)
32
+ Requires-Dist: python-socks (>=2.7.1,<3.0.0)
33
+ Requires-Dist: setuptools (>=79.0.0)
32
34
  Requires-Dist: typing-extensions (>=4.12.2,<5.0.0) ; python_version < "3.12"
33
35
  Requires-Dist: tzdata (>=2025.2)
34
36
  Requires-Dist: w3lib (>=2.3.1)
@@ -37,6 +37,7 @@ from w3lib.html import strip_html5_whitespace
37
37
  from w3lib.url import canonicalize_url, safe_url_string
38
38
 
39
39
  from .exceptions import UnknownPlaywrightBrowser, UnknownPlaywrightDevice, InvalidPlaywrightParameter
40
+ from .socks5dnslookup import Socks5Resolver
40
41
 
41
42
  from zoneinfo import available_timezones
42
43
  all_timezones_set = available_timezones()
@@ -144,6 +145,7 @@ class Capture():
144
145
 
145
146
  def __init__(self, browser: BROWSER | None=None, device_name: str | None=None,
146
147
  proxy: str | dict[str, str] | None=None,
148
+ socks5_dns_resolver: str | list[str] | None=None,
147
149
  general_timeout_in_sec: int | None=None, loglevel: str | int='INFO',
148
150
  uuid: str | None=None, headless: bool=True):
149
151
  """Captures a page with Playwright.
@@ -151,6 +153,7 @@ class Capture():
151
153
  :param browser: The browser to use for the capture.
152
154
  :param device_name: The pre-defined device to use for the capture (from playwright).
153
155
  :param proxy: The external proxy to use for the capture.
156
+ :param socks5_dns_resolver: DNS resolver(s) to query through the socks5 proxy in order to fill the server IPs in the HAR file.
154
157
  :param general_timeout_in_sec: The general timeout for the capture, including children.
155
158
  :param loglevel: Python loglevel
156
159
  :param uuid: The UUID of the capture.
@@ -177,6 +180,7 @@ class Capture():
177
180
  self.device_name: str | None = device_name
178
181
  self.headless: bool = headless
179
182
  self.proxy: ProxySettings = {}
183
+ self.socks5_dns_resolver: str | list[str] | None = socks5_dns_resolver
180
184
  if proxy:
181
185
  if isinstance(proxy, str):
182
186
  self.proxy = self.__prepare_proxy_playwright(proxy)
@@ -1238,6 +1242,12 @@ class Capture():
1238
1242
  with open(self._temp_harfile.name) as _har:
1239
1243
  to_return['har'] = json.load(_har)
1240
1244
  self.logger.debug('Got HAR.')
1245
+ if (to_return.get('har') and self.proxy and self.proxy.get('server')
1246
+ and self.proxy['server'].startswith('socks5')):
1247
+ # Only if the capture was done via a socks5 proxy
1248
+ if har := to_return['har']: # Could be None
1249
+ async with timeout(30):
1250
+ await self.socks5_resolver(har)
1241
1251
  except (TimeoutError, asyncio.TimeoutError):
1242
1252
  self.logger.warning("Unable to close page and context at the end of the capture.")
1243
1253
  errors.append("Unable to close page and context at the end of the capture.")
@@ -1713,3 +1723,31 @@ class Capture():
1713
1723
  return to_return
1714
1724
 
1715
1725
  # END FAVICON EXTRACTOR
1726
+
1727
+ # ##### Run DNS resolution over socks5 proxy #####
1728
+ # This is only used when the capture is done over a socks5 proxy, and not on a .onion
1729
+ # We get the HAR file, iterate over the entries and update the IPs
1730
+
1731
async def socks5_resolver(self, harfile: dict[str, Any]) -> None:
    """Update the server IPs of a HAR file with lookups made through the socks5 proxy.

    The hostname of every request URL in ``harfile['log']['entries']`` is
    resolved (A record) with a ``Socks5Resolver`` built from
    ``self.proxy['server']`` and ``self.socks5_dns_resolver``. Each entry
    whose hostname resolved gets its ``serverIPAddress`` key overwritten;
    entries that could not be resolved are left untouched.

    :param harfile: The HAR content as a dict; it is modified in place.
    """

    def _hostname(entry: dict[str, Any]) -> str | None:
        # Return the hostname to look up for a HAR entry, or None to skip it.
        url = entry['request']['url']
        if not url:
            return None
        # Use .hostname and not .netloc: netloc keeps the port and userinfo
        # ("user@example.com:8443"), which is not a name a DNS query can answer.
        host = urlparse(url).hostname
        if not host or host.endswith('.onion'):
            # No host in the URL, or a .onion name (excluded from the lookup).
            return None
        return host

    resolver = Socks5Resolver(logger=self.logger, socks5_proxy=self.proxy['server'],
                              dns_resolver=self.socks5_dns_resolver)
    entries = harfile['log']['entries']

    # Collect every distinct hostname once; sorted so it can be zipped with the answers.
    hostnames = sorted({host for entry in entries if (host := _hostname(entry))})

    # use the same technique as in lookyloo to resolve many domains in parallel
    semaphore = asyncio.Semaphore(20)
    answers = await asyncio.gather(*(resolver.resolve(hostname, semaphore)
                                     for hostname in hostnames))
    self.logger.debug('Resolved all domains through the proxy.')

    # Use the answers returned by the lookups directly instead of reading them
    # back from the resolver cache: a cached answer with a short TTL may
    # already be expired by the time all the lookups are done.
    # min() makes the choice between several A records stable across runs.
    resolved = {hostname: min(str(rdata) for rdata in answer)
                for hostname, answer in zip(hostnames, answers) if answer}

    for entry in entries:
        host = _hostname(entry)
        if host in resolved:
            entry['serverIPAddress'] = resolved[host]
    self.logger.debug('Done updating HAR file')
@@ -0,0 +1,111 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import socket
7
+
8
+ from typing import TYPE_CHECKING
9
+
10
+ import dns
11
+
12
+ from dns.asyncresolver import Resolver
13
+ from dns.resolver import Cache
14
+ from dns._asyncio_backend import _maybe_wait_for, StreamSocket, Backend
15
+ from python_socks.async_.asyncio import Proxy
16
+
17
+
18
+ if TYPE_CHECKING:
19
+ from .capture import PlaywrightCaptureLogAdapter
20
+ from logging import Logger
21
+
22
+
23
class Socks5Backend(Backend):
    """dnspython asyncio backend that opens its stream sockets through a socks5 proxy.

    Only ``socket.SOCK_STREAM`` sockets are supported; any other socket type
    raises ``NotImplementedError``.
    """

    def __init__(self, socks5_proxy_url: str):
        """Build the proxy object from its URL.

        :param socks5_proxy_url: URL of the socks5 proxy, handed to ``Proxy.from_url``.
        """
        super().__init__()
        self.proxy = Proxy.from_url(socks5_proxy_url)

    def name(self) -> str:
        """Return the name of this backend."""
        return "asyncio socks5"

    async def make_socket(  # type: ignore[no-untyped-def]
        self,
        af,
        socktype,
        proto=0,
        source=None,
        destination=None,
        timeout=None,
        ssl_context=None,
        server_hostname=None,
    ):
        """Connect to ``destination`` through the proxy and wrap the result in a ``StreamSocket``.

        :param destination: Sequence whose first two items are the host and
            the port to reach through the proxy.
        :param timeout: Passed to ``_maybe_wait_for`` around the stream setup;
            it is not applied to the proxy connection itself.
        :raises ValueError: if no destination is given for a stream socket.
        :raises NotImplementedError: if ``socktype`` is not ``SOCK_STREAM``.
        """
        if socktype != socket.SOCK_STREAM:
            raise NotImplementedError(
                "unsupported socket " + f"type {socktype}"
            )  # pragma: no cover

        if destination is None:
            # This shouldn't happen, but the check keeps code analysis
            # software happy.
            raise ValueError("destination required for stream sockets")

        # First get a socket connected to the destination via the proxy...
        tunnel = await self.proxy.connect(dest_host=destination[0], dest_port=destination[1])
        # ...then let asyncio build the reader/writer pair on top of it.
        connecting = asyncio.open_connection(
            None,
            None,
            sock=tunnel,
            ssl=ssl_context,
            family=af,
            proto=proto,
            local_addr=source,
            server_hostname=server_hostname,
        )
        reader, writer = await _maybe_wait_for(connecting, timeout)  # type: ignore[no-untyped-call]
        return StreamSocket(af, reader, writer)  # type: ignore[no-untyped-call]
66
+
67
+
68
class Socks5Resolver:
    """Async DNS resolver that sends its queries over TCP through a socks5 proxy."""

    def __init__(self, logger: Logger | PlaywrightCaptureLogAdapter, socks5_proxy: str, dns_resolver: str | list[str] | None=None):
        """Configure the resolver, its cache and the socks5 backend.

        :param logger: Logger used to report timeouts and resolution errors.
        :param socks5_proxy: URL of the socks5 proxy used for the queries.
        :param dns_resolver: Nameserver, or list of nameservers, to query.
            Falls back to 1.1.1.1 when empty or not set.
        """
        self.logger = logger

        # Normalise the nameservers to a list.
        nameservers: list[str]
        if not dns_resolver:
            # Fallback to 1.1.1.1
            nameservers = ['1.1.1.1']
        elif isinstance(dns_resolver, str):
            nameservers = [dns_resolver]
        else:
            nameservers = dns_resolver

        # configure=False means we don't want to load resolv.conf
        self.resolver = Resolver(configure=False)
        # NOTE(review): 900 is presumably the cache cleaning interval in seconds - confirm.
        self.resolver.cache = Cache(900)
        self.resolver.timeout = 2
        self.resolver.lifetime = 4
        self.resolver.nameservers = nameservers

        self.backend = Socks5Backend(socks5_proxy_url=socks5_proxy)

    def get_cache(self, domain: str, rdatatype: dns.rdatatype.RdataType=dns.rdatatype.A) -> dns.resolver.Answer | None:
        """Return what the resolver cache holds for ``domain`` and ``rdatatype`` (class IN)."""
        cache_key = (dns.name.from_text(domain), rdatatype, dns.rdataclass.IN)
        return self.resolver.cache.get(cache_key)

    async def resolve(self, domain: str, semaphore: asyncio.Semaphore, rdatatype: dns.rdatatype.RdataType=dns.rdatatype.A) -> dns.resolver.Answer | None:
        """Resolve ``domain`` through the proxy, trying up to three times on timeout.

        Only the A record is resolved by default, might want to do AAAA instead.

        :param domain: The name to resolve.
        :param semaphore: Held for the whole resolution, retries included.
        :param rdatatype: The record type to query.
        :return: The answer, or None after three timeouts or any other error.
        """
        async with semaphore:
            # Three attempts in total; attempts_left counts the ones remaining after the current one.
            for attempts_left in (2, 1, 0):
                try:
                    return await self.resolver.resolve(domain, rdatatype,
                                                       tcp=True, backend=self.backend)
                except dns.resolver.LifetimeTimeout:
                    # Retry a few times on timeout, it happens.
                    if attempts_left:
                        self.logger.debug(f"[Socks5] Timeout resolving {domain}, retrying.")
                        await asyncio.sleep(1)
                    else:
                        self.logger.info(f"[Socks5] Timeout resolving {domain}.")
                except Exception as e:
                    # Any other failure is not retried.
                    self.logger.info(f"[Socks5] Error resolving {domain}: {e}")
                    break
            return None
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "PlaywrightCapture"
3
- version = "1.28.6"
3
+ version = "1.29.0"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = [
6
6
  {name="Raphaël Vinot", email= "raphael.vinot@circl.lu"}
@@ -14,16 +14,18 @@ dynamic = [ "classifiers" ]
14
14
  dependencies = [
15
15
  "playwright (>=1.51.0)",
16
16
  "dateparser (>=1.2.1)",
17
- "beautifulsoup4[charset-normalizer,lxml] (>=4.13.3)",
17
+ "beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)",
18
18
  "w3lib (>=2.3.1)",
19
19
  "tzdata (>=2025.2)",
20
20
  "playwright-stealth (>=1.0.6)",
21
- "setuptools (>=78.1.0)",
21
+ "setuptools (>=79.0.0)",
22
22
  "puremagic (>=1.28)",
23
23
  "async-timeout (>=5.0.1) ; python_version < \"3.11\"",
24
- "aiohttp[speedups] (>=3.11.16)",
24
+ "aiohttp[speedups] (>=3.11.18)",
25
25
  "aiohttp-socks (>=0.10.1)",
26
- "typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\""
26
+ "typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\"",
27
+ "dnspython (>=2.7.0,<3.0.0)",
28
+ "python-socks (>=2.7.1,<3.0.0)"
27
29
  ]
28
30
 
29
31
  [project.urls]