PlaywrightCapture 1.32.1__tar.gz → 1.32.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,15 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: PlaywrightCapture
3
- Version: 1.32.1
3
+ Version: 1.32.3
4
4
  Summary: A simple library to capture websites using playwright
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
7
7
  Author-email: raphael.vinot@circl.lu
8
- Requires-Python: >=3.9,<4.0
8
+ Requires-Python: >=3.9.2,<4.0
9
9
  Classifier: Intended Audience :: Science/Research
10
10
  Classifier: Intended Audience :: Telecommunications Industry
11
11
  Classifier: License :: OSI Approved :: BSD License
12
12
  Classifier: Programming Language :: Python :: 3
13
- Classifier: Programming Language :: Python :: 3.9
14
13
  Classifier: Programming Language :: Python :: 3.10
15
14
  Classifier: Programming Language :: Python :: 3.11
16
15
  Classifier: Programming Language :: Python :: 3.12
@@ -31,6 +30,7 @@ Requires-Dist: puremagic (>=1.30)
31
30
  Requires-Dist: pydub (>=0.25.1) ; (python_version < "3.10") and (extra == "recaptcha")
32
31
  Requires-Dist: pydub-ng (>=0.2.0) ; (python_version >= "3.10") and (extra == "recaptcha")
33
32
  Requires-Dist: python-socks (>=2.7.1,<3.0.0)
33
+ Requires-Dist: rfc3161-client (>=1.0.4,<2.0.0)
34
34
  Requires-Dist: setuptools (>=80.9.0)
35
35
  Requires-Dist: typing-extensions (>=4.12.2,<5.0.0) ; python_version < "3.12"
36
36
  Requires-Dist: tzdata (>=2025.2)
@@ -1,6 +1,7 @@
1
1
  from .capture import Capture # noqa
2
2
  from .capture import CaptureResponse # noqa
3
3
  from .capture import SetCookieParam, Cookie # noqa
4
+ from .capture import TrustedTimestampSettings # noqa
4
5
  from .helpers import get_devices # noqa
5
6
  from .exceptions import (PlaywrightCaptureException, UnknownPlaywrightDeviceType, # noqa
6
7
  UnknownPlaywrightBrowser, UnknownPlaywrightDevice,
@@ -9,6 +10,7 @@ from .exceptions import (PlaywrightCaptureException, UnknownPlaywrightDeviceType
9
10
  __all__ = [
10
11
  'Capture',
11
12
  'CaptureResponse',
13
+ 'TrustedTimestampSettings',
12
14
  'SetCookieParam', 'Cookie',
13
15
  'get_devices',
14
16
  'PlaywrightCaptureException',
@@ -4,6 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  import asyncio
6
6
  import binascii
7
+ # import hashlib
7
8
  import json
8
9
  import logging
9
10
  import os
@@ -12,13 +13,13 @@ import re
12
13
  import sys
13
14
  import time
14
15
 
15
- from base64 import b64decode
16
+ from base64 import b64decode, b64encode
16
17
  from io import BytesIO
17
18
  from logging import LoggerAdapter, Logger
18
19
  from tempfile import NamedTemporaryFile
19
20
  from typing import Any, Literal, TYPE_CHECKING
20
21
  from collections.abc import MutableMapping
21
- from urllib.parse import urlparse, unquote, urljoin, urlsplit, urlunsplit
22
+ from urllib.parse import urlparse, unquote, urljoin, urlsplit, urlunsplit, unquote_plus
22
23
  from zipfile import ZipFile
23
24
 
24
25
  import aiohttp
@@ -32,6 +33,7 @@ from playwright.async_api import async_playwright, Frame, Error, Page, Download,
32
33
  from playwright.async_api import TimeoutError as PlaywrightTimeoutError
33
34
  from playwright_stealth import Stealth, ALL_EVASIONS_DISABLED_KWARGS # type: ignore[attr-defined]
34
35
  from puremagic import PureError, from_string
36
+ from rfc3161_client import TimestampRequestBuilder, TimeStampRequest, HashAlgorithm
35
37
  from w3lib.html import strip_html5_whitespace
36
38
  from w3lib.url import canonicalize_url, safe_url_string
37
39
 
@@ -119,6 +121,10 @@ class CaptureResponse(TypedDict, total=False):
119
121
  downloaded_file: bytes | None
120
122
  children: list[CaptureResponse] | None
121
123
 
124
+ # if the capture is triggered with with_trusted_timestamps, the response contains a
125
+ # dict[<entry name>] = '<base64 encoded timestamp response>'
126
+ trusted_timestamps: dict[str, str] | None
127
+
122
128
  # One day, playwright will support getting the favicon from the capture itself
123
129
  # favicon: Optional[bytes]
124
130
  # in the meantime, we need a workaround: https://github.com/Lookyloo/PlaywrightCapture/issues/45
@@ -135,6 +141,13 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
135
141
  return msg, kwargs
136
142
 
137
143
 
144
class TrustedTimestampSettings(TypedDict, total=False):
    """Settings of the remote service used to request trusted timestamps.

    Every key is optional from the type checker's point of view (total=False).
    """

    # URL the timestamp requests are POSTed to.
    url: str
    # Hash algorithm to use in the timestamp requests; when unset or None,
    # nothing is configured on the request builder.
    hash_algorithm: HashAlgorithm | None
    # NOTE: can add other settings such as auth mechanism, if needed.
149
+
150
+
138
151
  # good test pages:
139
152
  # https://kaliiiiiiiiii.github.io/brotector/?crash=false
140
153
  # https://www.browserscan.net/bot-detection
@@ -155,7 +168,7 @@ class Capture():
155
168
  socks5_dns_resolver: str | list[str] | None=None,
156
169
  general_timeout_in_sec: int | None=None, loglevel: str | int='INFO',
157
170
  uuid: str | None=None, headless: bool=True,
158
- *, init_script: str | None=None):
171
+ *, init_script: str | None=None, tt_settings: TrustedTimestampSettings | None=None):
159
172
  """Captures a page with Playwright.
160
173
 
161
174
  :param browser: The browser to use for the capture.
@@ -219,6 +232,8 @@ class Capture():
219
232
 
220
233
  self._init_script = init_script
221
234
 
235
+ self.tt_settings = tt_settings
236
+
222
237
  def __prepare_proxy_playwright(self, proxy: str) -> ProxySettings:
223
238
  splitted = urlsplit(proxy)
224
239
  if splitted.username and splitted.password:
@@ -1007,6 +1022,7 @@ class Capture():
1007
1022
  with_screenshot: bool=True,
1008
1023
  with_favicon: bool=False,
1009
1024
  allow_tracking: bool=False,
1025
+ with_trusted_timestamps: bool=False,
1010
1026
  ) -> CaptureResponse:
1011
1027
 
1012
1028
  to_return: CaptureResponse = {}
@@ -1163,7 +1179,7 @@ class Capture():
1163
1179
  except Exception as e:
1164
1180
  self.logger.warning(f'Unable to get favicons: {e}')
1165
1181
 
1166
- to_return['last_redirected_url'] = page.url
1182
+ to_return['last_redirected_url'] = unquote_plus(page.url)
1167
1183
 
1168
1184
  if with_screenshot:
1169
1185
  to_return['png'] = await self._failsafe_get_screenshot(page)
@@ -1199,6 +1215,11 @@ class Capture():
1199
1215
  rendered_hostname_only=rendered_hostname_only,
1200
1216
  max_depth_capture_time=max_capture_time,
1201
1217
  with_screenshot=with_screenshot)
1218
+ if with_trusted_timestamps:
1219
+ try:
1220
+ await self._get_trusted_timestamps(child_capture)
1221
+ except Exception as e:
1222
+ self.logger.warning(f'Unable to get the trusted timestamps for the clild capture : {e}.')
1202
1223
  to_return['children'].append(child_capture) # type: ignore[union-attr]
1203
1224
  except (TimeoutError, asyncio.TimeoutError):
1204
1225
  self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
@@ -1331,8 +1352,56 @@ class Capture():
1331
1352
  self.logger.debug('Capture done')
1332
1353
  if errors:
1333
1354
  to_return['error'] = '\n'.join(errors)
1355
+ if with_trusted_timestamps:
1356
+ try:
1357
+ await self._get_trusted_timestamps(to_return)
1358
+ except Exception as e:
1359
+ self.logger.warning(f'Unable to get trusted timestamps: {e}')
1334
1360
  return to_return
1335
1361
 
1362
async def _get_trusted_timestamps(self, capture_response: CaptureResponse) -> None:
    """Request a trusted timestamp for each relevant entry of a capture.

    Every non-empty entry among ``last_redirected_url``, ``har``, ``storage``,
    ``html``, ``png``, ``downloaded_filename`` and ``downloaded_file`` is turned
    into a timestamp request and POSTed to the URL set in ``self.tt_settings``.
    The raw responses are stored, base64 encoded, in
    ``capture_response['trusted_timestamps']``, keyed by entry name.

    Nothing is requested (a warning is logged) when ``self.tt_settings`` is
    empty or has no URL. An entry whose request ends with an HTTP error status
    is logged and left out of the result; other entries are still processed.

    :param capture_response: The capture to timestamp. It is updated in place.
    """
    if not self.tt_settings:
        self.logger.warning('The remote timestamper is not configured.')
        return None
    # 'url' is optional in TrustedTimestampSettings (total=False): check it up
    # front instead of failing with a KeyError once all the requests are built.
    tsa_url = self.tt_settings.get('url')
    if not tsa_url:
        self.logger.warning('The remote timestamper has no URL configured.')
        return None

    # One builder per entry to timestamp. Missing or empty entries are skipped.
    to_timestamp: dict[str, TimestampRequestBuilder] = {}
    if last_redirected_url := capture_response.get('last_redirected_url'):
        to_timestamp['last_redirected_url'] = TimestampRequestBuilder().data(last_redirected_url.encode())
    if har := capture_response.get('har'):
        to_timestamp['har'] = TimestampRequestBuilder().data(json.dumps(har).encode())
    if storage := capture_response.get('storage'):
        to_timestamp['storage'] = TimestampRequestBuilder().data(json.dumps(storage).encode())
    if html := capture_response.get('html'):
        to_timestamp['html'] = TimestampRequestBuilder().data(html.encode())
    if png := capture_response.get('png'):
        to_timestamp['png'] = TimestampRequestBuilder().data(png)
    if downloaded_filename := capture_response.get('downloaded_filename'):
        to_timestamp['downloaded_filename'] = TimestampRequestBuilder().data(downloaded_filename.encode())
    if downloaded_file := capture_response.get('downloaded_file'):
        to_timestamp['downloaded_file'] = TimestampRequestBuilder().data(downloaded_file)
    # NOTE: 'potential_favicons' is not timestamped (yet).

    # The hash algorithm is the same for every entry: look it up once.
    h_algo = self.tt_settings.get('hash_algorithm')
    tt_requests: dict[str, TimeStampRequest] = {}
    for k, trb in to_timestamp.items():
        if h_algo is not None:
            # The builder methods hand back a builder (see .data() above), so the
            # returned object must be kept or the setting never reaches build().
            trb = trb.hash_algorithm(h_algo)
        tt_requests[k] = trb.build()

    trusted_timestamps: dict[str, bytes] = {}
    async with aiohttp.ClientSession() as session:
        for k, tsr in tt_requests.items():
            async with session.post(tsa_url, data=tsr.as_bytes(),
                                    headers={"Content-Type": "application/timestamp-query"}) as response:
                try:
                    response.raise_for_status()
                except aiohttp.ClientResponseError as e:
                    # Best effort: a refused entry must not prevent the others.
                    self.logger.warning(f'Unable to get Trusted Timestamp for {k}: {e}')
                    continue
                trusted_timestamps[k] = await response.read()

    capture_response['trusted_timestamps'] = {k: b64encode(v).decode() for k, v in trusted_timestamps.items()}
1404
+
1336
1405
  async def _failsafe_get_screenshot(self, page: Page) -> bytes:
1337
1406
  self.logger.debug("Capturing a screenshot of the full page.")
1338
1407
  try:
@@ -1,13 +1,13 @@
1
1
  [project]
2
2
  name = "PlaywrightCapture"
3
- version = "1.32.1"
3
+ version = "1.32.3"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = [
6
6
  {name="Raphaël Vinot", email= "raphael.vinot@circl.lu"}
7
7
  ]
8
8
  license = "BSD-3-Clause"
9
9
  readme = "README.md"
10
- requires-python = ">=3.9,<4.0"
10
+ requires-python = ">=3.9.2,<4.0"
11
11
 
12
12
  dynamic = [ "classifiers" ]
13
13
 
@@ -25,7 +25,8 @@ dependencies = [
25
25
  "aiohttp-socks (>=0.10.1)",
26
26
  "typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\"",
27
27
  "dnspython (>=2.7.0,<3.0.0)",
28
- "python-socks (>=2.7.1,<3.0.0)"
28
+ "python-socks (>=2.7.1,<3.0.0)",
29
+ "rfc3161-client (>=1.0.4,<2.0.0)"
29
30
  ]
30
31
 
31
32
  [project.urls]
@@ -50,7 +51,7 @@ recaptcha = [
50
51
 
51
52
  [tool.poetry.group.dev.dependencies]
52
53
  types-beautifulsoup4 = "^4.12.0.20250516"
53
- pytest = "^8.4.1"
54
+ pytest = "^8.4.2"
54
55
  mypy = "^1.17.1"
55
56
  types-dateparser = "^1.2.2.20250809"
56
57
  types-pytz = "^2025.2.0.20250809"