PlaywrightCapture 1.28.5.tar.gz → 1.29.0.tar.gz

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: PlaywrightCapture
- Version: 1.28.5
+ Version: 1.29.0
  Summary: A simple library to capture websites using playwright
  License: BSD-3-Clause
  Author: Raphaël Vinot
@@ -20,15 +20,17 @@ Classifier: Topic :: Security
  Provides-Extra: recaptcha
  Requires-Dist: SpeechRecognition (>=3.14.2) ; extra == "recaptcha"
  Requires-Dist: aiohttp-socks (>=0.10.1)
- Requires-Dist: aiohttp[speedups] (>=3.11.16)
+ Requires-Dist: aiohttp[speedups] (>=3.11.18)
  Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
- Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.3)
+ Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)
  Requires-Dist: dateparser (>=1.2.1)
+ Requires-Dist: dnspython (>=2.7.0,<3.0.0)
  Requires-Dist: playwright (>=1.51.0)
  Requires-Dist: playwright-stealth (>=1.0.6)
  Requires-Dist: puremagic (>=1.28)
  Requires-Dist: pydub (>=0.25.1) ; extra == "recaptcha"
- Requires-Dist: setuptools (>=78.1.0)
+ Requires-Dist: python-socks (>=2.7.1,<3.0.0)
+ Requires-Dist: setuptools (>=79.0.0)
  Requires-Dist: typing-extensions (>=4.12.2,<5.0.0) ; python_version < "3.12"
  Requires-Dist: tzdata (>=2025.2)
  Requires-Dist: w3lib (>=2.3.1)
@@ -37,6 +37,7 @@ from w3lib.html import strip_html5_whitespace
  from w3lib.url import canonicalize_url, safe_url_string

  from .exceptions import UnknownPlaywrightBrowser, UnknownPlaywrightDevice, InvalidPlaywrightParameter
+ from .socks5dnslookup import Socks5Resolver

  from zoneinfo import available_timezones
  all_timezones_set = available_timezones()
@@ -144,6 +145,7 @@ class Capture():

      def __init__(self, browser: BROWSER | None=None, device_name: str | None=None,
                         proxy: str | dict[str, str] | None=None,
+                        socks5_dns_resolver: str | list[str] | None=None,
                         general_timeout_in_sec: int | None=None, loglevel: str | int='INFO',
                         uuid: str | None=None, headless: bool=True):
          """Captures a page with Playwright.
@@ -151,6 +153,7 @@ class Capture():
          :param browser: The browser to use for the capture.
          :param device_name: The pre-defined device to use for the capture (from playwright).)
          :param proxy: The external proxy to use for the capture.
+         :param socks5_dns_resolver: DNS resolver to use for the socks5 proxy and fill the HAR file.
          :param general_timeout_in_sec: The general timeout for the capture, including children.
          :param loglevel: Python loglevel
          :param uuid: The UUID of the capture.
@@ -177,6 +180,7 @@ class Capture():
          self.device_name: str | None = device_name
          self.headless: bool = headless
          self.proxy: ProxySettings = {}
+         self.socks5_dns_resolver: str | list[str] | None = socks5_dns_resolver
          if proxy:
              if isinstance(proxy, str):
                  self.proxy = self.__prepare_proxy_playwright(proxy)
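For orientation, this is how the new parameter fits the constructor: a minimal, hypothetical usage sketch. The proxy URL and resolver address are placeholders, and using Capture as an async context manager with capture_page() as the entry point is assumed from the library's public API rather than shown in this diff:

    import asyncio

    from playwrightcapture import Capture


    async def main() -> None:
        # Hypothetical values: a local socks5 proxy (e.g. Tor) and Quad9 as the
        # resolver used to fill in serverIPAddress entries in the HAR file.
        async with Capture(browser='chromium',
                           proxy='socks5://127.0.0.1:9050',
                           socks5_dns_resolver=['9.9.9.9']) as capture:
            # capture_page() and its exact signature are assumed here;
            # optional keyword arguments (timeouts, referer, ...) are omitted.
            result = await capture.capture_page('https://example.com')
        if result.get('error'):
            print(result['error'])


    asyncio.run(main())
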
@@ -935,6 +939,7 @@ class Capture():
                              ) -> CaptureResponse:

          to_return: CaptureResponse = {}
+         errors: list[str] = []
          got_favicons = False

          # We don't need to be super strict on the lock, as it simply triggers a wait for network idle before stoping the capture
@@ -997,6 +1002,7 @@ class Capture():
          except Error as e:
              self.logger.warning(f'Unable to create new page, the context is in a broken state: {e}')
              self.should_retry = True
+             to_return['error'] = f'Unable to create new page: {e}'
              return to_return

          if allow_tracking:
@@ -1050,8 +1056,8 @@ class Capture():
                      error_msg = download.failure()
                      if not error_msg:
                          raise e
-                     to_return['error'] = f"Error while downloading: {error_msg}"
-                     self.logger.info(to_return['error'])
+                     errors.append(f"Error while downloading: {error_msg}")
+                     self.logger.info(f'Error while downloading: {error_msg}')
                      self.should_retry = True
                  except Exception:
                      raise e
@@ -1137,7 +1143,7 @@ class Capture():
                  if consecutive_errors >= 5:
                      # if we have more than 5 consecutive errors, the capture is most probably broken, breaking.
                      self.logger.warning('Got more than 5 consecutive errors while capturing children, breaking.')
-                     to_return['error'] = "Got more than 5 consecutive errors while capturing children"
+                     errors.append("Got more than 5 consecutive errors while capturing children")
                      self.should_retry = True
                      break

@@ -1149,19 +1155,19 @@ class Capture():
                      self.logger.info(f'Unable to go back: {e}.')

          except PlaywrightTimeoutError as e:
-             to_return['error'] = f"The capture took too long - {e.message}"
+             errors.append(f"The capture took too long - {e.message}")
              self.should_retry = True
          except (asyncio.TimeoutError, TimeoutError):
-             to_return['error'] = "Something in the capture took too long"
+             errors.append("Something in the capture took too long")
              self.should_retry = True
          except TargetClosedError as e:
-             to_return['error'] = f"The target was closed - {e}"
+             errors.append(f"The target was closed - {e}")
              self.should_retry = True
          except Error as e:
-             # NOTE: there are a lot of errors that look like duplicates and they are trggered at different times in the process.
-             # it is tricky to figure our which one whouls (and should not) trigger a retry. Below is our best guess and it will change over time.
+             # NOTE: there are a lot of errors that look like duplicates and they are triggered at different times in the process.
+             # it is tricky to figure our which one should (and should not) trigger a retry. Below is our best guess and it will change over time.
              self._update_exceptions(e)
-             to_return['error'] = e.message
+             errors.append(e.message)
              to_return['error_name'] = e.name
              # TODO: check e.message and figure out if it is worth retrying or not.
              # NOTE: e.name is generally (always?) "Error"
@@ -1170,6 +1176,7 @@ class Capture():
              elif self._retry_network_error(e) or self._retry_browser_error(e):
                  # this one sounds like something we can retry...
                  self.logger.info(f'Issue with {url} (retrying): {e.message}')
+                 errors.append(f'Issue with {url}: {e.message}')
                  self.should_retry = True
              else:
                  # Unexpected ones
@@ -1177,9 +1184,10 @@ class Capture():
          except Exception as e:
              # we may get a non-playwright exception to.
              # The ones we try to handle here should be treated as if they were.
-             to_return['error'] = str(e)
-             if to_return['error'] in ['Connection closed while reading from the driver']:
+             errors.append(str(e))
+             if str(e) in ['Connection closed while reading from the driver']:
                  self.logger.info(f'Issue with {url} (retrying): {e}')
+                 errors.append(f'Issue with {url}: {e}')
                  self.should_retry = True
              else:
                  raise e
@@ -1201,15 +1209,31 @@ class Capture():
                  to_return["downloaded_file"] = mem_zip.getvalue()

          try:
-             to_return['storage'] = await self._failsafe_get_storage()
-             to_return['cookies'] = await self._failsafe_get_cookies()
-             self.logger.debug('Done with cookies and storage.')
-         except Exception as e:
-             if 'error' not in to_return:
-                 to_return['error'] = f'Unable to get the storage: {e}'
+             async with timeout(15):
+                 to_return['cookies'] = await self.context.cookies()
+         except (TimeoutError, asyncio.TimeoutError):
+             self.logger.warning("Unable to get cookies (timeout).")
+             errors.append("Unable to get the cookies (timeout).")
+             self.should_retry = True
+         except Error as e:
+             self.logger.warning(f"Unable to get cookies: {e}")
+             errors.append(f'Unable to get the cookies: {e}')
+             self.should_retry = True
+
+         try:
+             async with timeout(15):
+                 to_return['storage'] = await self.context.storage_state(indexed_db=True)
+         except (TimeoutError, asyncio.TimeoutError):
+             self.logger.warning("Unable to get storage (timeout).")
+             errors.append("Unable to get the storage (timeout).")
+             self.should_retry = True
+         except Error as e:
+             self.logger.warning(f"Unable to get the storage: {e}")
+             errors.append(f'Unable to get the storage: {e}')
+             self.should_retry = True
          # frames_tree = self.make_frame_tree(page.main_frame)
          try:
-             async with timeout(60):
+             async with timeout(30):
                  page.remove_listener("requestfinished", store_request)
                  await page.close(reason="Closing the page because the capture finished.")
                  self.logger.debug('Page closed.')
@@ -1218,34 +1242,24 @@ class Capture():
                  with open(self._temp_harfile.name) as _har:
                      to_return['har'] = json.load(_har)
                  self.logger.debug('Got HAR.')
+                 if (to_return.get('har') and self.proxy and self.proxy.get('server')
+                         and self.proxy['server'].startswith('socks5')):
+                     # Only if the capture was not done via a socks5 proxy
+                     if har := to_return['har']:  # Could be None
+                         async with timeout(30):
+                             await self.socks5_resolver(har)
          except (TimeoutError, asyncio.TimeoutError):
              self.logger.warning("Unable to close page and context at the end of the capture.")
+             errors.append("Unable to close page and context at the end of the capture.")
              self.should_retry = True
          except Exception as e:
              self.logger.warning(f"Other exception while finishing up the capture: {e}.")
-             if 'error' not in to_return:
-                 to_return['error'] = f'Unable to generate HAR file: {e}'
+             errors.append(f'Unable to generate HAR file: {e}')
          self.logger.debug('Capture done')
+         if errors:
+             to_return['error'] = '\n'.join(errors)
          return to_return

-     async def _failsafe_get_cookies(self) -> list[Cookie] | None:
-         try:
-             async with timeout(15):
-                 return await self.context.cookies()
-         except (TimeoutError, asyncio.TimeoutError):
-             self.logger.warning("Unable to get cookies (timeout).")
-             return None
-
-     async def _failsafe_get_storage(self) -> StorageState | None:
-         try:
-             async with timeout(15):
-                 return await self.context.storage_state(indexed_db=True)
-         except (TimeoutError, asyncio.TimeoutError):
-             self.logger.warning("Unable to get storage (timeout).")
-         except Error as e:
-             self.logger.warning(f"Unable to get storage: {e}")
-         return None
-
      async def _failsafe_get_screenshot(self, page: Page) -> bytes:
          self.logger.debug("Capturing a screenshot of the full page.")
          try:
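A note on the recurring change in the hunks above: instead of overwriting the single to_return['error'] string (where each new failure silently replaced the previous one), 1.29.0 appends every failure to an errors list and joins it once just before returning. A stripped-down sketch of that pattern, with hypothetical step functions standing in for the capture stages:

    def step_one() -> None:
        raise RuntimeError('first failure')


    def step_two() -> None:
        raise RuntimeError('second failure')


    def run_capture() -> dict[str, str]:
        to_return: dict[str, str] = {}
        errors: list[str] = []
        for step in (step_one, step_two):
            try:
                step()
            except Exception as e:
                # Keep every failure, not just the last one.
                errors.append(f'{step.__name__}: {e}')
        if errors:
            # Same join as at the end of the capture method above.
            to_return['error'] = '\n'.join(errors)
        return to_return


    print(run_capture()['error'])  # both failures are reported
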
@@ -1296,7 +1310,7 @@ class Capture():
          tries = 3
          while tries:
              try:
-                 async with timeout(30):
+                 async with timeout(15):
                      return await page.content()
              except (Error, TimeoutError, asyncio.TimeoutError):
                  self.logger.debug('Unable to get page content, trying again.')
@@ -1709,3 +1723,31 @@ class Capture():
          return to_return

      # END FAVICON EXTRACTOR
+
+     # ##### Run DNS resolution over socks5 proxy #####
+     # This is only use when the capture is done over a socks5 proxy, and not on a .onion
+     # We get the HAR file, iterate over the entries an update the IPs
+
+     async def socks5_resolver(self, harfile: dict[str, Any]) -> None:
+         resolver = Socks5Resolver(logger=self.logger, socks5_proxy=self.proxy['server'],
+                                   dns_resolver=self.socks5_dns_resolver)
+         # get all the hostnames from the HAR file
+         hostnames = set()
+         for entry in harfile['log']['entries']:
+             if entry['request']['url']:
+                 parsed = urlparse(entry['request']['url'])
+                 if parsed.netloc and not parsed.netloc.endswith('onion'):
+                     hostnames.add(parsed.netloc)
+         # use the same technique as in lookyloo to resolve many domains in parallel
+         semaphore = asyncio.Semaphore(20)
+         all_requests = [resolver.resolve(hostname, semaphore) for hostname in hostnames]
+         await asyncio.gather(*all_requests)
+         self.logger.debug('Resolved all domains through the proxy.')
+         for entry in harfile['log']['entries']:
+             if entry['request']['url']:
+                 parsed = urlparse(entry['request']['url'])
+                 if parsed.netloc and not parsed.netloc.endswith('onion'):
+                     answer = resolver.get_cache(parsed.netloc)
+                     if answer:
+                         entry['serverIPAddress'] = {str(b) for b in answer}.pop()
+         self.logger.debug('Done updating HAR file')
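To make the HAR rewrite concrete: socks5_resolver only reads request.url and writes serverIPAddress on each entry, and skips .onion hosts. A hand-built, hypothetical HAR fragment showing the shape it expects:

    # Hypothetical minimal HAR fragment; only the fields socks5_resolver touches.
    har = {
        'log': {
            'entries': [
                {'request': {'url': 'https://example.com/index.html'}},
                {'request': {'url': 'http://something.onion/'}},  # left untouched
            ]
        }
    }
    # After `await self.socks5_resolver(har)`, the first entry would carry the
    # address resolved through the proxy, e.g. (illustrative value only):
    # har['log']['entries'][0]['serverIPAddress'] == '93.184.216.34'
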
@@ -0,0 +1,111 @@
+ #!/usr/bin/env python3
+
+ from __future__ import annotations
+
+ import asyncio
+ import socket
+
+ from typing import TYPE_CHECKING
+
+ import dns
+
+ from dns.asyncresolver import Resolver
+ from dns.resolver import Cache
+ from dns._asyncio_backend import _maybe_wait_for, StreamSocket, Backend
+ from python_socks.async_.asyncio import Proxy
+
+
+ if TYPE_CHECKING:
+     from .capture import PlaywrightCaptureLogAdapter
+     from logging import Logger
+
+
+ class Socks5Backend(Backend):
+
+     def __init__(self, socks5_proxy_url: str):
+         super().__init__()
+         self.proxy = Proxy.from_url(socks5_proxy_url)
+
+     def name(self) -> str:
+         return "asyncio socks5"
+
+     async def make_socket(  # type: ignore[no-untyped-def]
+         self,
+         af,
+         socktype,
+         proto=0,
+         source=None,
+         destination=None,
+         timeout=None,
+         ssl_context=None,
+         server_hostname=None,
+     ):
+         if socktype == socket.SOCK_STREAM:
+             if destination is None:
+                 # This shouldn't happen, but we check to make code analysis software
+                 # happier.
+                 raise ValueError("destination required for stream sockets")
+             sock = await self.proxy.connect(dest_host=destination[0], dest_port=destination[1])
+             (r, w) = await _maybe_wait_for(  # type: ignore[no-untyped-call]
+                 asyncio.open_connection(
+                     None,
+                     None,
+                     sock=sock,
+                     ssl=ssl_context,
+                     family=af,
+                     proto=proto,
+                     local_addr=source,
+                     server_hostname=server_hostname,
+                 ),
+                 timeout,
+             )
+             return StreamSocket(af, r, w)  # type: ignore[no-untyped-call]
+         raise NotImplementedError(
+             "unsupported socket " + f"type {socktype}"
+         )  # pragma: no cover
+
+
+ class Socks5Resolver:
+
+     def __init__(self, logger: Logger | PlaywrightCaptureLogAdapter, socks5_proxy: str, dns_resolver: str | list[str] | None=None):
+         self.logger = logger
+         # configure set to false means we don't want to load resolv.conf
+         self.resolver = Resolver(configure=False)
+         self.resolver.cache = Cache(900)
+         self.resolver.timeout = 2
+         self.resolver.lifetime = 4
+
+         if not dns_resolver:
+             # Fallback to 1.1.1.1
+             dns_resolver = ['1.1.1.1']
+         elif isinstance(dns_resolver, str):
+             dns_resolver = [dns_resolver]
+         self.resolver.nameservers = dns_resolver
+
+         self.backend = Socks5Backend(socks5_proxy_url=socks5_proxy)
+
+     def get_cache(self, domain: str, rdatatype: dns.rdatatype.RdataType=dns.rdatatype.A) -> dns.resolver.Answer | None:
+         # Get domain from cache
+         return self.resolver.cache.get((dns.name.from_text(domain), rdatatype, dns.rdataclass.IN))
+
+     async def resolve(self, domain: str, semaphore: asyncio.Semaphore, rdatatype: dns.rdatatype.RdataType=dns.rdatatype.A) -> dns.resolver.Answer | None:
+         # Resolve the A record only for the domain, might want to do AAAA instead.
+         async with semaphore:
+             max_retries = 3
+             while max_retries > 0:
+                 try:
+                     response = await self.resolver.resolve(domain, rdatatype,
+                                                            tcp=True, backend=self.backend)
+                     return response
+                 except dns.resolver.LifetimeTimeout:
+                     # Retry a few times on timeout, it happens.
+                     max_retries -= 1
+                     if max_retries > 0:
+                         self.logger.debug(f"[Socks5] Timeout resolving {domain}, retrying.")
+                         await asyncio.sleep(1)
+                     else:
+                         self.logger.info(f"[Socks5] Timeout resolving {domain}.")
+                 except Exception as e:
+                     self.logger.info(f"[Socks5] Error resolving {domain}: {e}")
+                     break
+             return None
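The new module (imported in the capture code as .socks5dnslookup) is self-contained enough to exercise on its own. A minimal sketch, assuming the module path playwrightcapture.socks5dnslookup and a socks5 proxy listening on a placeholder local address:

    import asyncio
    import logging

    from playwrightcapture.socks5dnslookup import Socks5Resolver  # module path assumed


    async def main() -> None:
        resolver = Socks5Resolver(logger=logging.getLogger('socks5-demo'),
                                  socks5_proxy='socks5://127.0.0.1:9050',  # placeholder
                                  dns_resolver='9.9.9.9')
        semaphore = asyncio.Semaphore(20)  # same concurrency cap capture.py uses
        answer = await resolver.resolve('example.com', semaphore)
        if answer:
            # A dns.resolver.Answer iterates over its rdata records.
            print([str(rdata) for rdata in answer])
        # A second lookup within 900 seconds is served from the resolver cache:
        print(resolver.get_cache('example.com'))


    asyncio.run(main())
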
@@ -1,6 +1,6 @@
  [project]
  name = "PlaywrightCapture"
- version = "1.28.5"
+ version = "1.29.0"
  description = "A simple library to capture websites using playwright"
  authors = [
      {name="Raphaël Vinot", email= "raphael.vinot@circl.lu"}
@@ -14,16 +14,18 @@ dynamic = [ "classifiers" ]
  dependencies = [
      "playwright (>=1.51.0)",
      "dateparser (>=1.2.1)",
-     "beautifulsoup4[charset-normalizer,lxml] (>=4.13.3)",
+     "beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)",
      "w3lib (>=2.3.1)",
      "tzdata (>=2025.2)",
      "playwright-stealth (>=1.0.6)",
-     "setuptools (>=78.1.0)",
+     "setuptools (>=79.0.0)",
      "puremagic (>=1.28)",
      "async-timeout (>=5.0.1) ; python_version < \"3.11\"",
-     "aiohttp[speedups] (>=3.11.16)",
+     "aiohttp[speedups] (>=3.11.18)",
      "aiohttp-socks (>=0.10.1)",
-     "typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\""
+     "typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\"",
+     "dnspython (>=2.7.0,<3.0.0)",
+     "python-socks (>=2.7.1,<3.0.0)"
  ]

  [project.urls]
@@ -49,7 +51,7 @@ recaptcha = [
  types-beautifulsoup4 = "^4.12.0.20250204"
  pytest = "^8.3.5"
  mypy = "^1.15.0"
- types-dateparser = "^1.2.0.20250208"
+ types-dateparser = "^1.2.0.20250408"
  types-pytz = "^2025.2.0.20250326"
