PlaywrightCapture 1.32.0__py3-none-any.whl → 1.32.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  from .capture import Capture # noqa
2
2
  from .capture import CaptureResponse # noqa
3
3
  from .capture import SetCookieParam, Cookie # noqa
4
+ from .capture import TrustedTimestampSettings # noqa
4
5
  from .helpers import get_devices # noqa
5
6
  from .exceptions import (PlaywrightCaptureException, UnknownPlaywrightDeviceType, # noqa
6
7
  UnknownPlaywrightBrowser, UnknownPlaywrightDevice,
@@ -9,6 +10,7 @@ from .exceptions import (PlaywrightCaptureException, UnknownPlaywrightDeviceType
9
10
  __all__ = [
10
11
  'Capture',
11
12
  'CaptureResponse',
13
+ 'TrustedTimestampSettings',
12
14
  'SetCookieParam', 'Cookie',
13
15
  'get_devices',
14
16
  'PlaywrightCaptureException',
@@ -12,7 +12,7 @@ import re
12
12
  import sys
13
13
  import time
14
14
 
15
- from base64 import b64decode
15
+ from base64 import b64decode, b64encode
16
16
  from io import BytesIO
17
17
  from logging import LoggerAdapter, Logger
18
18
  from tempfile import NamedTemporaryFile
@@ -32,6 +32,7 @@ from playwright.async_api import async_playwright, Frame, Error, Page, Download,
32
32
  from playwright.async_api import TimeoutError as PlaywrightTimeoutError
33
33
  from playwright_stealth import Stealth, ALL_EVASIONS_DISABLED_KWARGS # type: ignore[attr-defined]
34
34
  from puremagic import PureError, from_string
35
+ from rfc3161_client import TimestampRequestBuilder, TimeStampRequest, HashAlgorithm
35
36
  from w3lib.html import strip_html5_whitespace
36
37
  from w3lib.url import canonicalize_url, safe_url_string
37
38
 
@@ -119,6 +120,10 @@ class CaptureResponse(TypedDict, total=False):
119
120
  downloaded_file: bytes | None
120
121
  children: list[CaptureResponse] | None
121
122
 
123
+ # if the capture is triggered with with_trusted_timestamps, the response contains a
124
+ # dict[<entry name>] = '<base64 encoded timestamp response>'
125
+ trusted_timestamps: dict[str, str] | None
126
+
122
127
  # One day, playwright will support getting the favicon from the capture itself
123
128
  # favicon: Optional[bytes]
124
129
  # in the meantime, we need a workaround: https://github.com/Lookyloo/PlaywrightCapture/issues/45
@@ -135,6 +140,13 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
135
140
  return msg, kwargs
136
141
 
137
142
 
143
class TrustedTimestampSettings(TypedDict, total=False):
    """Settings of the remote timestamper used to request trusted timestamps.

    Every key is optional (``total=False``).
    """

    # URL the timestamp requests are POSTed to.
    url: str
    # Hash algorithm handed to the timestamp request builder; when the key is
    # missing or None, the builder is left with its own default.
    hash_algorithm: HashAlgorithm | None
    # NOTE: can add other settings such as auth mechanism, if needed.
148
+
149
+
138
150
  # good test pages:
139
151
  # https://kaliiiiiiiiii.github.io/brotector/?crash=false
140
152
  # https://www.browserscan.net/bot-detection
@@ -155,7 +167,7 @@ class Capture():
155
167
  socks5_dns_resolver: str | list[str] | None=None,
156
168
  general_timeout_in_sec: int | None=None, loglevel: str | int='INFO',
157
169
  uuid: str | None=None, headless: bool=True,
158
- *, init_script: str | None=None):
170
+ *, init_script: str | None=None, tt_settings: TrustedTimestampSettings | None=None):
159
171
  """Captures a page with Playwright.
160
172
 
161
173
  :param browser: The browser to use for the capture.
@@ -219,6 +231,8 @@ class Capture():
219
231
 
220
232
  self._init_script = init_script
221
233
 
234
+ self.tt_settings = tt_settings
235
+
222
236
  def __prepare_proxy_playwright(self, proxy: str) -> ProxySettings:
223
237
  splitted = urlsplit(proxy)
224
238
  if splitted.username and splitted.password:
@@ -1007,6 +1021,7 @@ class Capture():
1007
1021
  with_screenshot: bool=True,
1008
1022
  with_favicon: bool=False,
1009
1023
  allow_tracking: bool=False,
1024
+ with_trusted_timestamps: bool=False,
1010
1025
  ) -> CaptureResponse:
1011
1026
 
1012
1027
  to_return: CaptureResponse = {}
@@ -1199,6 +1214,11 @@ class Capture():
1199
1214
  rendered_hostname_only=rendered_hostname_only,
1200
1215
  max_depth_capture_time=max_capture_time,
1201
1216
  with_screenshot=with_screenshot)
1217
+ if with_trusted_timestamps:
1218
+ try:
1219
+ await self._get_trusted_timestamps(child_capture)
1220
+ except Exception as e:
1221
+ self.logger.warning(f'Unable to get the trusted timestamps for the clild capture : {e}.')
1202
1222
  to_return['children'].append(child_capture) # type: ignore[union-attr]
1203
1223
  except (TimeoutError, asyncio.TimeoutError):
1204
1224
  self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
@@ -1331,8 +1351,56 @@ class Capture():
1331
1351
  self.logger.debug('Capture done')
1332
1352
  if errors:
1333
1353
  to_return['error'] = '\n'.join(errors)
1354
+ if with_trusted_timestamps:
1355
+ try:
1356
+ await self._get_trusted_timestamps(to_return)
1357
+ except Exception as e:
1358
+ self.logger.warning(f'Unable to get trusted timestamps: {e}')
1334
1359
  return to_return
1335
1360
 
1361
async def _get_trusted_timestamps(self, capture_response: CaptureResponse) -> None:
    """Request trusted timestamps for the relevant entries of a capture response.

    For every non-empty entry among ``last_redirected_url``, ``har``,
    ``storage``, ``html``, ``png``, ``downloaded_filename`` and
    ``downloaded_file``, a timestamp request is built and POSTed to the
    timestamper configured in ``self.tt_settings``. The raw replies are stored,
    base64 encoded, in ``capture_response['trusted_timestamps']`` keyed by entry
    name. Entries for which the request fails are logged and left out, the
    other ones are kept.

    :param capture_response: The capture response to timestamp, updated in place.
    """
    if not self.tt_settings:
        self.logger.warning('The remote timestamper is not configured.')
        return None
    # The settings are a total=False TypedDict: the URL may legitimately be missing.
    tsa_url = self.tt_settings.get('url')
    if not tsa_url:
        self.logger.warning('The remote timestamper has no URL configured.')
        return None

    # Collect the payload for each entry first; empty/missing entries are skipped
    # (the request builder refuses empty data).
    to_timestamp: dict[str, bytes] = {}
    if last_redirected_url := capture_response.get('last_redirected_url'):
        to_timestamp['last_redirected_url'] = last_redirected_url.encode()
    if har := capture_response.get('har'):
        to_timestamp['har'] = json.dumps(har).encode()
    if storage := capture_response.get('storage'):
        to_timestamp['storage'] = json.dumps(storage).encode()
    if html := capture_response.get('html'):
        to_timestamp['html'] = html.encode()
    if png := capture_response.get('png'):
        to_timestamp['png'] = png
    if downloaded_filename := capture_response.get('downloaded_filename'):
        to_timestamp['downloaded_filename'] = downloaded_filename.encode()
    if downloaded_file := capture_response.get('downloaded_file'):
        to_timestamp['downloaded_file'] = downloaded_file
    # NOTE: 'potential_favicons' is not timestamped (it was left disabled so far).

    if not to_timestamp:
        # Nothing to timestamp: no need to open an HTTP session.
        capture_response['trusted_timestamps'] = {}
        return None

    h_algo = self.tt_settings.get('hash_algorithm')
    tt_requests: dict[str, TimeStampRequest] = {}
    for k, data in to_timestamp.items():
        trb = TimestampRequestBuilder().data(data)
        if h_algo is not None:
            # The builder setters return a builder: the result must be kept,
            # otherwise the configured hash algorithm is silently ignored.
            trb = trb.hash_algorithm(h_algo)
        tt_requests[k] = trb.build()

    trusted_timestamps: dict[str, str] = {}
    async with aiohttp.ClientSession() as session:
        for k, tsr in tt_requests.items():
            try:
                async with session.post(tsa_url, data=tsr.as_bytes(),
                                        headers={"Content-Type": "application/timestamp-query"}) as response:
                    response.raise_for_status()
                    trusted_timestamps[k] = b64encode(await response.read()).decode()
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                # A failure on one entry (HTTP error, connection issue, timeout)
                # must not discard the timestamps already obtained for the others.
                self.logger.warning(f'Unable to get Trusted Timestamp for {k}: {e}')

    capture_response['trusted_timestamps'] = trusted_timestamps
1403
+
1336
1404
  async def _failsafe_get_screenshot(self, page: Page) -> bytes:
1337
1405
  self.logger.debug("Capturing a screenshot of the full page.")
1338
1406
  try:
@@ -1,16 +1,15 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: PlaywrightCapture
3
- Version: 1.32.0
3
+ Version: 1.32.2
4
4
  Summary: A simple library to capture websites using playwright
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
7
7
  Author-email: raphael.vinot@circl.lu
8
- Requires-Python: >=3.9,<4.0
8
+ Requires-Python: >=3.9.2,<4.0
9
9
  Classifier: Intended Audience :: Science/Research
10
10
  Classifier: Intended Audience :: Telecommunications Industry
11
11
  Classifier: License :: OSI Approved :: BSD License
12
12
  Classifier: Programming Language :: Python :: 3
13
- Classifier: Programming Language :: Python :: 3.9
14
13
  Classifier: Programming Language :: Python :: 3.10
15
14
  Classifier: Programming Language :: Python :: 3.11
16
15
  Classifier: Programming Language :: Python :: 3.12
@@ -25,12 +24,13 @@ Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
25
24
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.5)
26
25
  Requires-Dist: dateparser (>=1.2.2)
27
26
  Requires-Dist: dnspython (>=2.7.0,<3.0.0)
28
- Requires-Dist: playwright (>=1.54.0)
27
+ Requires-Dist: playwright (>=1.55.0)
29
28
  Requires-Dist: playwright-stealth (>=2)
30
29
  Requires-Dist: puremagic (>=1.30)
31
30
  Requires-Dist: pydub (>=0.25.1) ; (python_version < "3.10") and (extra == "recaptcha")
32
31
  Requires-Dist: pydub-ng (>=0.2.0) ; (python_version >= "3.10") and (extra == "recaptcha")
33
32
  Requires-Dist: python-socks (>=2.7.1,<3.0.0)
33
+ Requires-Dist: rfc3161-client (>=1.0.4,<2.0.0)
34
34
  Requires-Dist: setuptools (>=80.9.0)
35
35
  Requires-Dist: typing-extensions (>=4.12.2,<5.0.0) ; python_version < "3.12"
36
36
  Requires-Dist: tzdata (>=2025.2)
@@ -0,0 +1,10 @@
1
+ playwrightcapture/__init__.py,sha256=48NKCITflPkKB74WvDmUik4RH6ojRotsL98vf3l8MbY,748
2
+ playwrightcapture/capture.py,sha256=QoapDpppXh4dqPij7YNaTuHFURWXnZnAfjYL1PmKCbA,92852
3
+ playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
+ playwrightcapture/helpers.py,sha256=Xqs09zHhzAWnpBtQ0A9YAxg80P3Lj7aBj5M2WuEr0so,1843
5
+ playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ playwrightcapture/socks5dnslookup.py,sha256=ZpOf8tgsRQZi-WDcn9JbbG1bKz9DSfK_jz1l53UI1Ho,4058
7
+ playwrightcapture-1.32.2.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
8
+ playwrightcapture-1.32.2.dist-info/METADATA,sha256=cbSxeMK8YZ6fM0LGW-JwZ8k9Tywazm1_63zEJXEy2-Y,3284
9
+ playwrightcapture-1.32.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
+ playwrightcapture-1.32.2.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- playwrightcapture/__init__.py,sha256=NAL0-ymloDBm_ghp1PsefIwRMQmEFKPhn83WVUD7t_0,663
2
- playwrightcapture/capture.py,sha256=yPGhrhFrwKS7mlTeMw_acYIjlDHZnAR0psOdXKoi4Lw,88922
3
- playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
- playwrightcapture/helpers.py,sha256=Xqs09zHhzAWnpBtQ0A9YAxg80P3Lj7aBj5M2WuEr0so,1843
5
- playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- playwrightcapture/socks5dnslookup.py,sha256=ZpOf8tgsRQZi-WDcn9JbbG1bKz9DSfK_jz1l53UI1Ho,4058
7
- playwrightcapture-1.32.0.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
8
- playwrightcapture-1.32.0.dist-info/METADATA,sha256=2xaXTHpeA5DICsbA2k7ygsPrjOGZzLlSiYc6tl6h33E,3285
9
- playwrightcapture-1.32.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
- playwrightcapture-1.32.0.dist-info/RECORD,,