PlaywrightCapture 1.32.1__tar.gz → 1.32.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.32.1 → playwrightcapture-1.32.3}/PKG-INFO +3 -3
- {playwrightcapture-1.32.1 → playwrightcapture-1.32.3}/playwrightcapture/__init__.py +2 -0
- {playwrightcapture-1.32.1 → playwrightcapture-1.32.3}/playwrightcapture/capture.py +73 -4
- {playwrightcapture-1.32.1 → playwrightcapture-1.32.3}/pyproject.toml +5 -4
- {playwrightcapture-1.32.1 → playwrightcapture-1.32.3}/LICENSE +0 -0
- {playwrightcapture-1.32.1 → playwrightcapture-1.32.3}/README.md +0 -0
- {playwrightcapture-1.32.1 → playwrightcapture-1.32.3}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.32.1 → playwrightcapture-1.32.3}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.32.1 → playwrightcapture-1.32.3}/playwrightcapture/py.typed +0 -0
- {playwrightcapture-1.32.1 → playwrightcapture-1.32.3}/playwrightcapture/socks5dnslookup.py +0 -0
@@ -1,16 +1,15 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.32.1
|
3
|
+
Version: 1.32.3
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
License: BSD-3-Clause
|
6
6
|
Author: Raphaël Vinot
|
7
7
|
Author-email: raphael.vinot@circl.lu
|
8
|
-
Requires-Python: >=3.9,<4.0
|
8
|
+
Requires-Python: >=3.9.2,<4.0
|
9
9
|
Classifier: Intended Audience :: Science/Research
|
10
10
|
Classifier: Intended Audience :: Telecommunications Industry
|
11
11
|
Classifier: License :: OSI Approved :: BSD License
|
12
12
|
Classifier: Programming Language :: Python :: 3
|
13
|
-
Classifier: Programming Language :: Python :: 3.9
|
14
13
|
Classifier: Programming Language :: Python :: 3.10
|
15
14
|
Classifier: Programming Language :: Python :: 3.11
|
16
15
|
Classifier: Programming Language :: Python :: 3.12
|
@@ -31,6 +30,7 @@ Requires-Dist: puremagic (>=1.30)
|
|
31
30
|
Requires-Dist: pydub (>=0.25.1) ; (python_version < "3.10") and (extra == "recaptcha")
|
32
31
|
Requires-Dist: pydub-ng (>=0.2.0) ; (python_version >= "3.10") and (extra == "recaptcha")
|
33
32
|
Requires-Dist: python-socks (>=2.7.1,<3.0.0)
|
33
|
+
Requires-Dist: rfc3161-client (>=1.0.4,<2.0.0)
|
34
34
|
Requires-Dist: setuptools (>=80.9.0)
|
35
35
|
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0) ; python_version < "3.12"
|
36
36
|
Requires-Dist: tzdata (>=2025.2)
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from .capture import Capture # noqa
|
2
2
|
from .capture import CaptureResponse # noqa
|
3
3
|
from .capture import SetCookieParam, Cookie # noqa
|
4
|
+
from .capture import TrustedTimestampSettings # noqa
|
4
5
|
from .helpers import get_devices # noqa
|
5
6
|
from .exceptions import (PlaywrightCaptureException, UnknownPlaywrightDeviceType, # noqa
|
6
7
|
UnknownPlaywrightBrowser, UnknownPlaywrightDevice,
|
@@ -9,6 +10,7 @@ from .exceptions import (PlaywrightCaptureException, UnknownPlaywrightDeviceType
|
|
9
10
|
__all__ = [
|
10
11
|
'Capture',
|
11
12
|
'CaptureResponse',
|
13
|
+
'TrustedTimestampSettings',
|
12
14
|
'SetCookieParam', 'Cookie',
|
13
15
|
'get_devices',
|
14
16
|
'PlaywrightCaptureException',
|
@@ -4,6 +4,7 @@ from __future__ import annotations
|
|
4
4
|
|
5
5
|
import asyncio
|
6
6
|
import binascii
|
7
|
+
# import hashlib
|
7
8
|
import json
|
8
9
|
import logging
|
9
10
|
import os
|
@@ -12,13 +13,13 @@ import re
|
|
12
13
|
import sys
|
13
14
|
import time
|
14
15
|
|
15
|
-
from base64 import b64decode
|
16
|
+
from base64 import b64decode, b64encode
|
16
17
|
from io import BytesIO
|
17
18
|
from logging import LoggerAdapter, Logger
|
18
19
|
from tempfile import NamedTemporaryFile
|
19
20
|
from typing import Any, Literal, TYPE_CHECKING
|
20
21
|
from collections.abc import MutableMapping
|
21
|
-
from urllib.parse import urlparse, unquote, urljoin, urlsplit, urlunsplit
|
22
|
+
from urllib.parse import urlparse, unquote, urljoin, urlsplit, urlunsplit, unquote_plus
|
22
23
|
from zipfile import ZipFile
|
23
24
|
|
24
25
|
import aiohttp
|
@@ -32,6 +33,7 @@ from playwright.async_api import async_playwright, Frame, Error, Page, Download,
|
|
32
33
|
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
33
34
|
from playwright_stealth import Stealth, ALL_EVASIONS_DISABLED_KWARGS # type: ignore[attr-defined]
|
34
35
|
from puremagic import PureError, from_string
|
36
|
+
from rfc3161_client import TimestampRequestBuilder, TimeStampRequest, HashAlgorithm
|
35
37
|
from w3lib.html import strip_html5_whitespace
|
36
38
|
from w3lib.url import canonicalize_url, safe_url_string
|
37
39
|
|
@@ -119,6 +121,10 @@ class CaptureResponse(TypedDict, total=False):
|
|
119
121
|
downloaded_file: bytes | None
|
120
122
|
children: list[CaptureResponse] | None
|
121
123
|
|
124
|
+
# if the capture is triggered with with_trusted_timestamps, the response contains a
|
125
|
+
# dict[<entry name>] = '<base64 encoded timestamp response>'
|
126
|
+
trusted_timestamps: dict[str, str] | None
|
127
|
+
|
122
128
|
# One day, playwright will support getting the favicon from the capture itself
|
123
129
|
# favicon: Optional[bytes]
|
124
130
|
# in the meantime, we need a workaround: https://github.com/Lookyloo/PlaywrightCapture/issues/45
|
@@ -135,6 +141,13 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
|
135
141
|
return msg, kwargs
|
136
142
|
|
137
143
|
|
144
|
+
class TrustedTimestampSettings(TypedDict, total=False):
|
145
|
+
|
146
|
+
url: str
|
147
|
+
hash_algorithm: HashAlgorithm | None
|
148
|
+
# NOTE: can add other settings such as auth mechanism, if needed.
|
149
|
+
|
150
|
+
|
138
151
|
# good test pages:
|
139
152
|
# https://kaliiiiiiiiii.github.io/brotector/?crash=false
|
140
153
|
# https://www.browserscan.net/bot-detection
|
@@ -155,7 +168,7 @@ class Capture():
|
|
155
168
|
socks5_dns_resolver: str | list[str] | None=None,
|
156
169
|
general_timeout_in_sec: int | None=None, loglevel: str | int='INFO',
|
157
170
|
uuid: str | None=None, headless: bool=True,
|
158
|
-
*, init_script: str | None=None):
|
171
|
+
*, init_script: str | None=None, tt_settings: TrustedTimestampSettings | None=None):
|
159
172
|
"""Captures a page with Playwright.
|
160
173
|
|
161
174
|
:param browser: The browser to use for the capture.
|
@@ -219,6 +232,8 @@ class Capture():
|
|
219
232
|
|
220
233
|
self._init_script = init_script
|
221
234
|
|
235
|
+
self.tt_settings = tt_settings
|
236
|
+
|
222
237
|
def __prepare_proxy_playwright(self, proxy: str) -> ProxySettings:
|
223
238
|
splitted = urlsplit(proxy)
|
224
239
|
if splitted.username and splitted.password:
|
@@ -1007,6 +1022,7 @@ class Capture():
|
|
1007
1022
|
with_screenshot: bool=True,
|
1008
1023
|
with_favicon: bool=False,
|
1009
1024
|
allow_tracking: bool=False,
|
1025
|
+
with_trusted_timestamps: bool=False,
|
1010
1026
|
) -> CaptureResponse:
|
1011
1027
|
|
1012
1028
|
to_return: CaptureResponse = {}
|
@@ -1163,7 +1179,7 @@ class Capture():
|
|
1163
1179
|
except Exception as e:
|
1164
1180
|
self.logger.warning(f'Unable to get favicons: {e}')
|
1165
1181
|
|
1166
|
-
to_return['last_redirected_url'] = page.url
|
1182
|
+
to_return['last_redirected_url'] = unquote_plus(page.url)
|
1167
1183
|
|
1168
1184
|
if with_screenshot:
|
1169
1185
|
to_return['png'] = await self._failsafe_get_screenshot(page)
|
@@ -1199,6 +1215,11 @@ class Capture():
|
|
1199
1215
|
rendered_hostname_only=rendered_hostname_only,
|
1200
1216
|
max_depth_capture_time=max_capture_time,
|
1201
1217
|
with_screenshot=with_screenshot)
|
1218
|
+
if with_trusted_timestamps:
|
1219
|
+
try:
|
1220
|
+
await self._get_trusted_timestamps(child_capture)
|
1221
|
+
except Exception as e:
|
1222
|
+
self.logger.warning(f'Unable to get the trusted timestamps for the clild capture : {e}.')
|
1202
1223
|
to_return['children'].append(child_capture) # type: ignore[union-attr]
|
1203
1224
|
except (TimeoutError, asyncio.TimeoutError):
|
1204
1225
|
self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
|
@@ -1331,8 +1352,56 @@ class Capture():
|
|
1331
1352
|
self.logger.debug('Capture done')
|
1332
1353
|
if errors:
|
1333
1354
|
to_return['error'] = '\n'.join(errors)
|
1355
|
+
if with_trusted_timestamps:
|
1356
|
+
try:
|
1357
|
+
await self._get_trusted_timestamps(to_return)
|
1358
|
+
except Exception as e:
|
1359
|
+
self.logger.warning(f'Unable to get trusted timestamps: {e}')
|
1334
1360
|
return to_return
|
1335
1361
|
|
1362
|
+
async def _get_trusted_timestamps(self, capture_response: CaptureResponse) -> None:
|
1363
|
+
"""Get trusted timestamps for the relevant values in the response"""
|
1364
|
+
if not self.tt_settings:
|
1365
|
+
self.logger.warning('The remote timestamper is not configured.')
|
1366
|
+
return None
|
1367
|
+
to_timestamp: dict[str, TimestampRequestBuilder] = {}
|
1368
|
+
if last_redirected_url := capture_response.get('last_redirected_url'):
|
1369
|
+
to_timestamp['last_redirected_url'] = TimestampRequestBuilder().data(last_redirected_url.encode())
|
1370
|
+
if har := capture_response.get('har'):
|
1371
|
+
to_timestamp['har'] = TimestampRequestBuilder().data(json.dumps(har).encode())
|
1372
|
+
if storage := capture_response.get('storage'):
|
1373
|
+
to_timestamp['storage'] = TimestampRequestBuilder().data(json.dumps(storage).encode())
|
1374
|
+
if html := capture_response.get('html'):
|
1375
|
+
to_timestamp['html'] = TimestampRequestBuilder().data(html.encode())
|
1376
|
+
if png := capture_response.get('png'):
|
1377
|
+
to_timestamp['png'] = TimestampRequestBuilder().data(png)
|
1378
|
+
if downloaded_filename := capture_response.get('downloaded_filename'):
|
1379
|
+
to_timestamp['downloaded_filename'] = TimestampRequestBuilder().data(downloaded_filename.encode())
|
1380
|
+
if downloaded_file := capture_response.get('downloaded_file'):
|
1381
|
+
to_timestamp['downloaded_file'] = TimestampRequestBuilder().data(downloaded_file)
|
1382
|
+
# if potential_favicons := capture_response.get('potential_favicons'):
|
1383
|
+
# to_timestamp['potential_favicons'] = TimestampRequestBuilder().data(potential_favicons)
|
1384
|
+
|
1385
|
+
tt_requests: dict[str, TimeStampRequest] = {}
|
1386
|
+
for k, trb in to_timestamp.items():
|
1387
|
+
if h_algo := self.tt_settings.get('hash_algorithm'):
|
1388
|
+
trb.hash_algorithm(h_algo)
|
1389
|
+
tt_requests[k] = trb.build()
|
1390
|
+
|
1391
|
+
trusted_timestamps: dict[str, bytes] = {}
|
1392
|
+
async with aiohttp.ClientSession() as session:
|
1393
|
+
for k, tsr in tt_requests.items():
|
1394
|
+
async with session.post(self.tt_settings['url'], data=tsr.as_bytes(),
|
1395
|
+
headers={"Content-Type": "application/timestamp-query"}) as response:
|
1396
|
+
try:
|
1397
|
+
response.raise_for_status()
|
1398
|
+
except aiohttp.ClientResponseError as e:
|
1399
|
+
self.logger.warning(f'Unable to get Trusted Timestamp for {k}: {e}')
|
1400
|
+
continue
|
1401
|
+
trusted_timestamps[k] = await response.read()
|
1402
|
+
|
1403
|
+
capture_response['trusted_timestamps'] = {k: b64encode(v).decode() for k, v in trusted_timestamps.items()}
|
1404
|
+
|
1336
1405
|
async def _failsafe_get_screenshot(self, page: Page) -> bytes:
|
1337
1406
|
self.logger.debug("Capturing a screenshot of the full page.")
|
1338
1407
|
try:
|
@@ -1,13 +1,13 @@
|
|
1
1
|
[project]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.32.1"
|
3
|
+
version = "1.32.3"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = [
|
6
6
|
{name="Raphaël Vinot", email= "raphael.vinot@circl.lu"}
|
7
7
|
]
|
8
8
|
license = "BSD-3-Clause"
|
9
9
|
readme = "README.md"
|
10
|
-
requires-python = ">=3.9,<4.0"
|
10
|
+
requires-python = ">=3.9.2,<4.0"
|
11
11
|
|
12
12
|
dynamic = [ "classifiers" ]
|
13
13
|
|
@@ -25,7 +25,8 @@ dependencies = [
|
|
25
25
|
"aiohttp-socks (>=0.10.1)",
|
26
26
|
"typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\"",
|
27
27
|
"dnspython (>=2.7.0,<3.0.0)",
|
28
|
-
"python-socks (>=2.7.1,<3.0.0)"
|
28
|
+
"python-socks (>=2.7.1,<3.0.0)",
|
29
|
+
"rfc3161-client (>=1.0.4,<2.0.0)"
|
29
30
|
]
|
30
31
|
|
31
32
|
[project.urls]
|
@@ -50,7 +51,7 @@ recaptcha = [
|
|
50
51
|
|
51
52
|
[tool.poetry.group.dev.dependencies]
|
52
53
|
types-beautifulsoup4 = "^4.12.0.20250516"
|
53
|
-
pytest = "^8.4.1"
|
54
|
+
pytest = "^8.4.2"
|
54
55
|
mypy = "^1.17.1"
|
55
56
|
types-dateparser = "^1.2.2.20250809"
|
56
57
|
types-pytz = "^2025.2.0.20250809"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|