PlaywrightCapture 1.32.0__tar.gz → 1.32.2__tar.gz
This diff compares the contents of two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- {playwrightcapture-1.32.0 → playwrightcapture-1.32.2}/PKG-INFO +4 -4
- {playwrightcapture-1.32.0 → playwrightcapture-1.32.2}/playwrightcapture/__init__.py +2 -0
- {playwrightcapture-1.32.0 → playwrightcapture-1.32.2}/playwrightcapture/capture.py +70 -2
- {playwrightcapture-1.32.0 → playwrightcapture-1.32.2}/pyproject.toml +6 -5
- {playwrightcapture-1.32.0 → playwrightcapture-1.32.2}/LICENSE +0 -0
- {playwrightcapture-1.32.0 → playwrightcapture-1.32.2}/README.md +0 -0
- {playwrightcapture-1.32.0 → playwrightcapture-1.32.2}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.32.0 → playwrightcapture-1.32.2}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.32.0 → playwrightcapture-1.32.2}/playwrightcapture/py.typed +0 -0
- {playwrightcapture-1.32.0 → playwrightcapture-1.32.2}/playwrightcapture/socks5dnslookup.py +0 -0
@@ -1,16 +1,15 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.32.0
|
3
|
+
Version: 1.32.2
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
License: BSD-3-Clause
|
6
6
|
Author: Raphaël Vinot
|
7
7
|
Author-email: raphael.vinot@circl.lu
|
8
|
-
Requires-Python: >=3.9,<4.0
|
8
|
+
Requires-Python: >=3.9.2,<4.0
|
9
9
|
Classifier: Intended Audience :: Science/Research
|
10
10
|
Classifier: Intended Audience :: Telecommunications Industry
|
11
11
|
Classifier: License :: OSI Approved :: BSD License
|
12
12
|
Classifier: Programming Language :: Python :: 3
|
13
|
-
Classifier: Programming Language :: Python :: 3.9
|
14
13
|
Classifier: Programming Language :: Python :: 3.10
|
15
14
|
Classifier: Programming Language :: Python :: 3.11
|
16
15
|
Classifier: Programming Language :: Python :: 3.12
|
@@ -25,12 +24,13 @@ Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
|
|
25
24
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.5)
|
26
25
|
Requires-Dist: dateparser (>=1.2.2)
|
27
26
|
Requires-Dist: dnspython (>=2.7.0,<3.0.0)
|
28
|
-
Requires-Dist: playwright (>=1.
|
27
|
+
Requires-Dist: playwright (>=1.55.0)
|
29
28
|
Requires-Dist: playwright-stealth (>=2)
|
30
29
|
Requires-Dist: puremagic (>=1.30)
|
31
30
|
Requires-Dist: pydub (>=0.25.1) ; (python_version < "3.10") and (extra == "recaptcha")
|
32
31
|
Requires-Dist: pydub-ng (>=0.2.0) ; (python_version >= "3.10") and (extra == "recaptcha")
|
33
32
|
Requires-Dist: python-socks (>=2.7.1,<3.0.0)
|
33
|
+
Requires-Dist: rfc3161-client (>=1.0.4,<2.0.0)
|
34
34
|
Requires-Dist: setuptools (>=80.9.0)
|
35
35
|
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0) ; python_version < "3.12"
|
36
36
|
Requires-Dist: tzdata (>=2025.2)
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from .capture import Capture # noqa
|
2
2
|
from .capture import CaptureResponse # noqa
|
3
3
|
from .capture import SetCookieParam, Cookie # noqa
|
4
|
+
from .capture import TrustedTimestampSettings # noqa
|
4
5
|
from .helpers import get_devices # noqa
|
5
6
|
from .exceptions import (PlaywrightCaptureException, UnknownPlaywrightDeviceType, # noqa
|
6
7
|
UnknownPlaywrightBrowser, UnknownPlaywrightDevice,
|
@@ -9,6 +10,7 @@ from .exceptions import (PlaywrightCaptureException, UnknownPlaywrightDeviceType
|
|
9
10
|
__all__ = [
|
10
11
|
'Capture',
|
11
12
|
'CaptureResponse',
|
13
|
+
'TrustedTimestampSettings',
|
12
14
|
'SetCookieParam', 'Cookie',
|
13
15
|
'get_devices',
|
14
16
|
'PlaywrightCaptureException',
|
@@ -12,7 +12,7 @@ import re
|
|
12
12
|
import sys
|
13
13
|
import time
|
14
14
|
|
15
|
-
from base64 import b64decode
|
15
|
+
from base64 import b64decode, b64encode
|
16
16
|
from io import BytesIO
|
17
17
|
from logging import LoggerAdapter, Logger
|
18
18
|
from tempfile import NamedTemporaryFile
|
@@ -32,6 +32,7 @@ from playwright.async_api import async_playwright, Frame, Error, Page, Download,
|
|
32
32
|
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
33
33
|
from playwright_stealth import Stealth, ALL_EVASIONS_DISABLED_KWARGS # type: ignore[attr-defined]
|
34
34
|
from puremagic import PureError, from_string
|
35
|
+
from rfc3161_client import TimestampRequestBuilder, TimeStampRequest, HashAlgorithm
|
35
36
|
from w3lib.html import strip_html5_whitespace
|
36
37
|
from w3lib.url import canonicalize_url, safe_url_string
|
37
38
|
|
@@ -119,6 +120,10 @@ class CaptureResponse(TypedDict, total=False):
|
|
119
120
|
downloaded_file: bytes | None
|
120
121
|
children: list[CaptureResponse] | None
|
121
122
|
|
123
|
+
# if the capture is triggered with with_trusted_timestamps, the response contains a
|
124
|
+
# dict[<entry name>] = '<base64 encoded timestamp response>'
|
125
|
+
trusted_timestamps: dict[str, str] | None
|
126
|
+
|
122
127
|
# One day, playwright will support getting the favicon from the capture itself
|
123
128
|
# favicon: Optional[bytes]
|
124
129
|
# in the meantime, we need a workaround: https://github.com/Lookyloo/PlaywrightCapture/issues/45
|
@@ -135,6 +140,13 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
|
135
140
|
return msg, kwargs
|
136
141
|
|
137
142
|
|
143
|
+
class TrustedTimestampSettings(TypedDict, total=False):
|
144
|
+
|
145
|
+
url: str
|
146
|
+
hash_algorithm: HashAlgorithm | None
|
147
|
+
# NOTE: can add other settings such as auth mechanism, if needed.
|
148
|
+
|
149
|
+
|
138
150
|
# good test pages:
|
139
151
|
# https://kaliiiiiiiiii.github.io/brotector/?crash=false
|
140
152
|
# https://www.browserscan.net/bot-detection
|
@@ -155,7 +167,7 @@ class Capture():
|
|
155
167
|
socks5_dns_resolver: str | list[str] | None=None,
|
156
168
|
general_timeout_in_sec: int | None=None, loglevel: str | int='INFO',
|
157
169
|
uuid: str | None=None, headless: bool=True,
|
158
|
-
*, init_script: str | None=None):
|
170
|
+
*, init_script: str | None=None, tt_settings: TrustedTimestampSettings | None=None):
|
159
171
|
"""Captures a page with Playwright.
|
160
172
|
|
161
173
|
:param browser: The browser to use for the capture.
|
@@ -219,6 +231,8 @@ class Capture():
|
|
219
231
|
|
220
232
|
self._init_script = init_script
|
221
233
|
|
234
|
+
self.tt_settings = tt_settings
|
235
|
+
|
222
236
|
def __prepare_proxy_playwright(self, proxy: str) -> ProxySettings:
|
223
237
|
splitted = urlsplit(proxy)
|
224
238
|
if splitted.username and splitted.password:
|
@@ -1007,6 +1021,7 @@ class Capture():
|
|
1007
1021
|
with_screenshot: bool=True,
|
1008
1022
|
with_favicon: bool=False,
|
1009
1023
|
allow_tracking: bool=False,
|
1024
|
+
with_trusted_timestamps: bool=False,
|
1010
1025
|
) -> CaptureResponse:
|
1011
1026
|
|
1012
1027
|
to_return: CaptureResponse = {}
|
@@ -1199,6 +1214,11 @@ class Capture():
|
|
1199
1214
|
rendered_hostname_only=rendered_hostname_only,
|
1200
1215
|
max_depth_capture_time=max_capture_time,
|
1201
1216
|
with_screenshot=with_screenshot)
|
1217
|
+
if with_trusted_timestamps:
|
1218
|
+
try:
|
1219
|
+
await self._get_trusted_timestamps(child_capture)
|
1220
|
+
except Exception as e:
|
1221
|
+
self.logger.warning(f'Unable to get the trusted timestamps for the clild capture : {e}.')
|
1202
1222
|
to_return['children'].append(child_capture) # type: ignore[union-attr]
|
1203
1223
|
except (TimeoutError, asyncio.TimeoutError):
|
1204
1224
|
self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
|
@@ -1331,8 +1351,56 @@ class Capture():
|
|
1331
1351
|
self.logger.debug('Capture done')
|
1332
1352
|
if errors:
|
1333
1353
|
to_return['error'] = '\n'.join(errors)
|
1354
|
+
if with_trusted_timestamps:
|
1355
|
+
try:
|
1356
|
+
await self._get_trusted_timestamps(to_return)
|
1357
|
+
except Exception as e:
|
1358
|
+
self.logger.warning(f'Unable to get trusted timestamps: {e}')
|
1334
1359
|
return to_return
|
1335
1360
|
|
1361
|
+
async def _get_trusted_timestamps(self, capture_response: CaptureResponse) -> None:
|
1362
|
+
"""Get trusted timestamps for the relevant values in the response"""
|
1363
|
+
if not self.tt_settings:
|
1364
|
+
self.logger.warning('The remote timestamper is not configured.')
|
1365
|
+
return None
|
1366
|
+
to_timestamp: dict[str, TimestampRequestBuilder] = {}
|
1367
|
+
if last_redirected_url := capture_response.get('last_redirected_url'):
|
1368
|
+
to_timestamp['last_redirected_url'] = TimestampRequestBuilder().data(last_redirected_url.encode())
|
1369
|
+
if har := capture_response.get('har'):
|
1370
|
+
to_timestamp['har'] = TimestampRequestBuilder().data(json.dumps(har).encode())
|
1371
|
+
if storage := capture_response.get('storage'):
|
1372
|
+
to_timestamp['storage'] = TimestampRequestBuilder().data(json.dumps(storage).encode())
|
1373
|
+
if html := capture_response.get('html'):
|
1374
|
+
to_timestamp['html'] = TimestampRequestBuilder().data(html.encode())
|
1375
|
+
if png := capture_response.get('png'):
|
1376
|
+
to_timestamp['png'] = TimestampRequestBuilder().data(png)
|
1377
|
+
if downloaded_filename := capture_response.get('downloaded_filename'):
|
1378
|
+
to_timestamp['downloaded_filename'] = TimestampRequestBuilder().data(downloaded_filename.encode())
|
1379
|
+
if downloaded_file := capture_response.get('downloaded_file'):
|
1380
|
+
to_timestamp['downloaded_file'] = TimestampRequestBuilder().data(downloaded_file)
|
1381
|
+
# if potential_favicons := capture_response.get('potential_favicons'):
|
1382
|
+
# to_timestamp['potential_favicons'] = TimestampRequestBuilder().data(potential_favicons)
|
1383
|
+
|
1384
|
+
tt_requests: dict[str, TimeStampRequest] = {}
|
1385
|
+
for k, trb in to_timestamp.items():
|
1386
|
+
if h_algo := self.tt_settings.get('hash_algorithm'):
|
1387
|
+
trb.hash_algorithm(h_algo)
|
1388
|
+
tt_requests[k] = trb.build()
|
1389
|
+
|
1390
|
+
trusted_timestamps: dict[str, bytes] = {}
|
1391
|
+
async with aiohttp.ClientSession() as session:
|
1392
|
+
for k, tsr in tt_requests.items():
|
1393
|
+
async with session.post(self.tt_settings['url'], data=tsr.as_bytes(),
|
1394
|
+
headers={"Content-Type": "application/timestamp-query"}) as response:
|
1395
|
+
try:
|
1396
|
+
response.raise_for_status()
|
1397
|
+
except aiohttp.ClientResponseError as e:
|
1398
|
+
self.logger.warning(f'Unable to get Trusted Timestamp for {k}: {e}')
|
1399
|
+
continue
|
1400
|
+
trusted_timestamps[k] = await response.read()
|
1401
|
+
|
1402
|
+
capture_response['trusted_timestamps'] = {k: b64encode(v).decode() for k, v in trusted_timestamps.items()}
|
1403
|
+
|
1336
1404
|
async def _failsafe_get_screenshot(self, page: Page) -> bytes:
|
1337
1405
|
self.logger.debug("Capturing a screenshot of the full page.")
|
1338
1406
|
try:
|
@@ -1,18 +1,18 @@
|
|
1
1
|
[project]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.32.0"
|
3
|
+
version = "1.32.2"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = [
|
6
6
|
{name="Raphaël Vinot", email= "raphael.vinot@circl.lu"}
|
7
7
|
]
|
8
8
|
license = "BSD-3-Clause"
|
9
9
|
readme = "README.md"
|
10
|
-
requires-python = ">=3.9,<4.0"
|
10
|
+
requires-python = ">=3.9.2,<4.0"
|
11
11
|
|
12
12
|
dynamic = [ "classifiers" ]
|
13
13
|
|
14
14
|
dependencies = [
|
15
|
-
"playwright (>=1.
|
15
|
+
"playwright (>=1.55.0)",
|
16
16
|
"dateparser (>=1.2.2)",
|
17
17
|
"beautifulsoup4[charset-normalizer,lxml] (>=4.13.5)",
|
18
18
|
"w3lib (>=2.3.1)",
|
@@ -25,7 +25,8 @@ dependencies = [
|
|
25
25
|
"aiohttp-socks (>=0.10.1)",
|
26
26
|
"typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\"",
|
27
27
|
"dnspython (>=2.7.0,<3.0.0)",
|
28
|
-
"python-socks (>=2.7.1,<3.0.0)"
|
28
|
+
"python-socks (>=2.7.1,<3.0.0)",
|
29
|
+
"rfc3161-client (>=1.0.4,<2.0.0)"
|
29
30
|
]
|
30
31
|
|
31
32
|
[project.urls]
|
@@ -50,7 +51,7 @@ recaptcha = [
|
|
50
51
|
|
51
52
|
[tool.poetry.group.dev.dependencies]
|
52
53
|
types-beautifulsoup4 = "^4.12.0.20250516"
|
53
|
-
pytest = "^8.4.
|
54
|
+
pytest = "^8.4.2"
|
54
55
|
mypy = "^1.17.1"
|
55
56
|
types-dateparser = "^1.2.2.20250809"
|
56
57
|
types-pytz = "^2025.2.0.20250809"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|