PlaywrightCapture 1.26.3__tar.gz → 1.27.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.26.3 → playwrightcapture-1.27.1}/PKG-INFO +5 -9
- {playwrightcapture-1.26.3 → playwrightcapture-1.27.1}/playwrightcapture/capture.py +23 -17
- {playwrightcapture-1.26.3 → playwrightcapture-1.27.1}/pyproject.toml +5 -12
- {playwrightcapture-1.26.3 → playwrightcapture-1.27.1}/LICENSE +0 -0
- {playwrightcapture-1.26.3 → playwrightcapture-1.27.1}/README.md +0 -0
- {playwrightcapture-1.26.3 → playwrightcapture-1.27.1}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.26.3 → playwrightcapture-1.27.1}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.26.3 → playwrightcapture-1.27.1}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.26.3 → playwrightcapture-1.27.1}/playwrightcapture/py.typed +0 -0
@@ -1,18 +1,17 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.26.3
|
3
|
+
Version: 1.27.1
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
7
7
|
Author: Raphaël Vinot
|
8
8
|
Author-email: raphael.vinot@circl.lu
|
9
|
-
Requires-Python: >=3.8,<4.0
|
9
|
+
Requires-Python: >=3.9,<4.0
|
10
10
|
Classifier: Environment :: Console
|
11
11
|
Classifier: Intended Audience :: Science/Research
|
12
12
|
Classifier: Intended Audience :: Telecommunications Industry
|
13
13
|
Classifier: License :: OSI Approved :: BSD License
|
14
14
|
Classifier: Programming Language :: Python :: 3
|
15
|
-
Classifier: Programming Language :: Python :: 3.8
|
16
15
|
Classifier: Programming Language :: Python :: 3.9
|
17
16
|
Classifier: Programming Language :: Python :: 3.10
|
18
17
|
Classifier: Programming Language :: Python :: 3.11
|
@@ -21,11 +20,9 @@ Classifier: Programming Language :: Python :: 3.13
|
|
21
20
|
Classifier: Topic :: Internet
|
22
21
|
Classifier: Topic :: Security
|
23
22
|
Provides-Extra: recaptcha
|
24
|
-
Requires-Dist: SpeechRecognition (<3.11) ; (python_version < "3.9") and (extra == "recaptcha")
|
25
|
-
Requires-Dist: SpeechRecognition (>=3.11) ; (python_version >= "3.9") and (extra == "recaptcha")
|
23
|
+
Requires-Dist: SpeechRecognition (>=3.11.0) ; extra == "recaptcha"
|
26
24
|
Requires-Dist: aiohttp-socks (>=0.9,<0.10)
|
27
|
-
Requires-Dist: aiohttp[speedups] (<3.11) ; python_version < "3.9"
|
28
|
-
Requires-Dist: aiohttp[speedups] (>=3.10.10,<4.0.0) ; python_version >= "3.9"
|
25
|
+
Requires-Dist: aiohttp[speedups] (>=3.10.10,<4.0.0)
|
29
26
|
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
30
27
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
|
31
28
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
@@ -33,8 +30,7 @@ Requires-Dist: playwright (>=1.48.0,<2.0.0)
|
|
33
30
|
Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
34
31
|
Requires-Dist: puremagic (>=1.28,<2.0)
|
35
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
36
|
-
Requires-Dist: pytz (>=2024.2,<2025.0) ; python_version < "3.9"
|
37
|
-
Requires-Dist: setuptools (>=75.2.0,<76.0.0)
|
33
|
+
Requires-Dist: setuptools (>=75.3.0,<76.0.0)
|
38
34
|
Requires-Dist: tzdata (>=2024.2,<2025.0)
|
39
35
|
Requires-Dist: w3lib (>=2.2.1,<3.0.0)
|
40
36
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
@@ -17,7 +17,8 @@ from dataclasses import dataclass
|
|
17
17
|
from io import BytesIO
|
18
18
|
from logging import LoggerAdapter, Logger
|
19
19
|
from tempfile import NamedTemporaryFile
|
20
|
-
from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping, Generator
|
20
|
+
from typing import Any, TypedDict, Literal, TYPE_CHECKING
|
21
|
+
from collections.abc import MutableMapping, Generator
|
21
22
|
from urllib.parse import urlparse, unquote, urljoin, urlsplit, urlunsplit
|
22
23
|
from zipfile import ZipFile
|
23
24
|
|
@@ -38,11 +39,8 @@ from w3lib.url import canonicalize_url, safe_url_string
|
|
38
39
|
|
39
40
|
from .exceptions import UnknownPlaywrightBrowser, UnknownPlaywrightDevice, InvalidPlaywrightParameter
|
40
41
|
|
41
|
-
|
42
|
-
|
43
|
-
else:
|
44
|
-
from zoneinfo import available_timezones
|
45
|
-
all_timezones_set = available_timezones()
|
42
|
+
from zoneinfo import available_timezones
|
43
|
+
all_timezones_set = available_timezones()
|
46
44
|
|
47
45
|
if sys.version_info < (3, 11):
|
48
46
|
from async_timeout import timeout
|
@@ -104,7 +102,7 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
|
104
102
|
class PCStealthConfig(StealthConfig): # type: ignore[misc]
|
105
103
|
|
106
104
|
@property
|
107
|
-
def enabled_scripts(self) -> Generator[str, None, None]:
|
105
|
+
def enabled_scripts(self) -> Generator[str]:
|
108
106
|
self.chrome_app = True
|
109
107
|
self.chrome_csi = True
|
110
108
|
self.chrome_runtime = True
|
@@ -734,7 +732,7 @@ class Capture():
|
|
734
732
|
await frame.get_by_label(label).click(timeout=2000)
|
735
733
|
break
|
736
734
|
except (TimeoutError, asyncio.TimeoutError) as e:
|
737
|
-
self.logger.warning(f'
|
735
|
+
self.logger.warning(f'Consent timeout (label {label}) : {e}')
|
738
736
|
|
739
737
|
try:
|
740
738
|
async with timeout(5):
|
@@ -744,9 +742,9 @@ class Capture():
|
|
744
742
|
await frame.get_by_role("button", name=label).click(timeout=2000)
|
745
743
|
break
|
746
744
|
except (TimeoutError, asyncio.TimeoutError) as e:
|
747
|
-
self.logger.warning(f'Frame consent timeout: {e}')
|
745
|
+
self.logger.warning(f'Frame consent timeout (button {label}): {e}')
|
748
746
|
except Exception as e:
|
749
|
-
self.logger.info(f'Issue with
|
747
|
+
self.logger.info(f'Issue with consent validation: {e}')
|
750
748
|
return got_button
|
751
749
|
|
752
750
|
async def _move_time_forward(self, page: Page, time: int) -> None:
|
@@ -827,12 +825,18 @@ class Capture():
|
|
827
825
|
capturing_sub = False
|
828
826
|
try:
|
829
827
|
page = await self.context.new_page()
|
830
|
-
await page.clock.install()
|
831
828
|
except Error as e:
|
832
|
-
self.logger.warning(f'
|
829
|
+
self.logger.warning(f'Unable to create new page, the context is in a broken state: {e}')
|
833
830
|
self.should_retry = True
|
834
831
|
return to_return
|
835
832
|
|
833
|
+
try:
|
834
|
+
await page.clock.install()
|
835
|
+
clock_set = True
|
836
|
+
except Error as e:
|
837
|
+
self.logger.warning(f'Unable to install the clock: {e}')
|
838
|
+
clock_set = False
|
839
|
+
|
836
840
|
if allow_tracking:
|
837
841
|
# Add authorization clickthroughs
|
838
842
|
await self.__dialog_didomi_clickthrough(page)
|
@@ -900,8 +904,7 @@ class Capture():
|
|
900
904
|
await page.bring_to_front()
|
901
905
|
self.logger.debug('Page moved to front.')
|
902
906
|
except Error as e:
|
903
|
-
self.logger.warning('
|
904
|
-
raise e
|
907
|
+
self.logger.warning(f'Unable to bring the page to the front: {e}.')
|
905
908
|
|
906
909
|
# page instrumentation
|
907
910
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
|
@@ -971,7 +974,8 @@ class Capture():
|
|
971
974
|
self.logger.debug('Got button on main frame')
|
972
975
|
await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
|
973
976
|
|
974
|
-
|
977
|
+
if clock_set:
|
978
|
+
await self._move_time_forward(page, 10)
|
975
979
|
|
976
980
|
if parsed_url.fragment:
|
977
981
|
# We got a fragment, make sure we go to it and scroll only a little bit.
|
@@ -1039,8 +1043,9 @@ class Capture():
|
|
1039
1043
|
z.writestr(f'{i}_{filename}', file_content)
|
1040
1044
|
to_return["downloaded_file"] = mem_zip.getvalue()
|
1041
1045
|
|
1042
|
-
|
1043
|
-
|
1046
|
+
if clock_set:
|
1047
|
+
# fast forward ~30s
|
1048
|
+
await self._move_time_forward(page, 30)
|
1044
1049
|
|
1045
1050
|
self.logger.debug('Done with instrumentation, waiting for network idle.')
|
1046
1051
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
|
@@ -1462,6 +1467,7 @@ class Capture():
|
|
1462
1467
|
'net::ERR_CONNECTION_TIMED_OUT',
|
1463
1468
|
'net::ERR_HTTP_RESPONSE_CODE_FAILURE',
|
1464
1469
|
'net::ERR_HTTP2_PROTOCOL_ERROR',
|
1470
|
+
'net::ERR_INVALID_HTTP_RESPONSE',
|
1465
1471
|
'net::ERR_INVALID_REDIRECT',
|
1466
1472
|
'net::ERR_NAME_NOT_RESOLVED',
|
1467
1473
|
'net::ERR_NETWORK_ACCESS_DENIED',
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.26.3"
|
3
|
+
version = "1.27.1"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -18,26 +18,19 @@ classifiers=[
|
|
18
18
|
]
|
19
19
|
|
20
20
|
[tool.poetry.dependencies]
|
21
|
-
python = "^3.8"
|
21
|
+
python = "^3.9"
|
22
22
|
playwright = "^1.48.0"
|
23
23
|
dateparser = "^1.2.0"
|
24
24
|
beautifulsoup4 = {version= "^4.12.3", extras = ["lxml", "charset_normalizer"]}
|
25
25
|
w3lib = "^2.2.1"
|
26
26
|
pydub = {version = "^0.25.1", optional = true}
|
27
|
-
SpeechRecognition = [
|
28
|
-
{version = "<3.11", python = "<3.9", optional = true},
|
29
|
-
{version = ">=3.11", python = ">=3.9", optional = true}
|
30
|
-
]
|
31
|
-
pytz = {"version" = "^2024.2", python = "<3.9"}
|
27
|
+
SpeechRecognition = {version = ">=3.11.0", optional = true}
|
32
28
|
tzdata = "^2024.2"
|
33
29
|
playwright-stealth = "^1.0.6"
|
34
|
-
setuptools = "^75.2.0"
|
30
|
+
setuptools = "^75.3.0"
|
35
31
|
puremagic = "^1.28"
|
36
32
|
async-timeout = {version = "^4.0.3", python = "<3.11"}
|
37
|
-
aiohttp = [
|
38
|
-
{extras = ["speedups"], version = "<3.11", python = "<3.9"},
|
39
|
-
{extras = ["speedups"], version = "^3.10.10", python = ">=3.9"}
|
40
|
-
]
|
33
|
+
aiohttp = {version = "^3.10.10", extras = ["speedups"]}
|
41
34
|
aiohttp-socks = "^0.9"
|
42
35
|
|
43
36
|
[tool.poetry.extras]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|