PlaywrightCapture 1.25.0__tar.gz → 1.25.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.1}/PKG-INFO +2 -2
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.1}/playwrightcapture/capture.py +67 -18
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.1}/pyproject.toml +2 -2
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.1}/LICENSE +0 -0
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.1}/README.md +0 -0
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.1}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.1}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.1}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.1}/playwrightcapture/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.25.0
|
3
|
+
Version: 1.25.1
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -31,7 +31,7 @@ Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
|
31
31
|
Requires-Dist: puremagic (>=1.25,<2.0)
|
32
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
33
33
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
34
|
-
Requires-Dist: setuptools (>=70.
|
34
|
+
Requires-Dist: setuptools (>=70.3.0,<71.0.0)
|
35
35
|
Requires-Dist: tzdata (>=2024.1,<2025.0)
|
36
36
|
Requires-Dist: w3lib (>=2.2.1,<3.0.0)
|
37
37
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
@@ -13,10 +13,11 @@ import sys
|
|
13
13
|
import time
|
14
14
|
|
15
15
|
from base64 import b64decode
|
16
|
+
from dataclasses import dataclass
|
16
17
|
from io import BytesIO
|
17
18
|
from logging import LoggerAdapter, Logger
|
18
19
|
from tempfile import NamedTemporaryFile
|
19
|
-
from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping
|
20
|
+
from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping, Generator
|
20
21
|
from urllib.parse import urlparse, unquote, urljoin
|
21
22
|
from zipfile import ZipFile
|
22
23
|
|
@@ -30,7 +31,7 @@ from charset_normalizer import from_bytes
|
|
30
31
|
from playwright._impl._errors import TargetClosedError
|
31
32
|
from playwright.async_api import async_playwright, Frame, Error, Page, Download, Request
|
32
33
|
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
33
|
-
from playwright_stealth import stealth_async # type: ignore[import-untyped]
|
34
|
+
from playwright_stealth import stealth_async, StealthConfig # type: ignore[import-untyped]
|
34
35
|
from puremagic import PureError, from_string # type: ignore[import-untyped]
|
35
36
|
from w3lib.html import strip_html5_whitespace
|
36
37
|
from w3lib.url import canonicalize_url, safe_url_string
|
@@ -95,6 +96,31 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
|
95
96
|
return msg, kwargs
|
96
97
|
|
97
98
|
|
99
|
+
@dataclass
|
100
|
+
class PCStealthConfig(StealthConfig): # type: ignore[misc]
|
101
|
+
|
102
|
+
@property
|
103
|
+
def enabled_scripts(self) -> Generator[str, None, None]:
|
104
|
+
self.webdriver = True
|
105
|
+
self.webgl_vendor = True
|
106
|
+
self.chrome_app = True
|
107
|
+
self.chrome_csi = True
|
108
|
+
self.chrome_load_times = True
|
109
|
+
self.chrome_runtime = True
|
110
|
+
self.iframe_content_window = True
|
111
|
+
self.media_codecs = True
|
112
|
+
self.navigator_hardware_concurrency = 4
|
113
|
+
self.navigator_languages = False # Causes issue
|
114
|
+
self.navigator_permissions = True
|
115
|
+
self.navigator_platform = True
|
116
|
+
self.navigator_plugins = True
|
117
|
+
self.navigator_user_agent = False # Causes issues
|
118
|
+
self.navigator_vendor = False # Causes issues
|
119
|
+
self.outerdimensions = True
|
120
|
+
self.hairline = True
|
121
|
+
yield from super().enabled_scripts
|
122
|
+
|
123
|
+
|
98
124
|
class Capture():
|
99
125
|
|
100
126
|
_browsers: list[BROWSER] = ['chromium', 'firefox', 'webkit']
|
@@ -171,7 +197,8 @@ class Capture():
|
|
171
197
|
raise UnknownPlaywrightBrowser(f'Incorrect browser name {self.browser_name}, must be in {", ".join(self._browsers)}')
|
172
198
|
|
173
199
|
self.browser = await self.playwright[self.browser_name].launch(
|
174
|
-
proxy=self.proxy if self.proxy else None
|
200
|
+
proxy=self.proxy if self.proxy else None,
|
201
|
+
# headless=False
|
175
202
|
)
|
176
203
|
|
177
204
|
# Set of URLs that were captured in that context
|
@@ -385,6 +412,7 @@ class Capture():
|
|
385
412
|
self.context = await self.browser.new_context(
|
386
413
|
record_har_path=self._temp_harfile.name,
|
387
414
|
ignore_https_errors=True,
|
415
|
+
bypass_csp=True,
|
388
416
|
http_credentials=self.http_credentials if self.http_credentials else None,
|
389
417
|
user_agent=self.user_agent if self.user_agent else device_context_settings.pop('user_agent', None),
|
390
418
|
locale=self.locale if self.locale else None,
|
@@ -472,7 +500,7 @@ class Capture():
|
|
472
500
|
async def handler() -> None:
|
473
501
|
self.logger.debug('Didomi dialog found, clicking through.')
|
474
502
|
if await page.locator("#didomi-notice-agree-button").is_visible():
|
475
|
-
await page.locator("#didomi-notice-agree-button").click(timeout=
|
503
|
+
await page.locator("#didomi-notice-agree-button").click(timeout=30000)
|
476
504
|
|
477
505
|
await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler, times=1, no_wait_after=True)
|
478
506
|
self.logger.info('Didomi handler added')
|
@@ -654,6 +682,7 @@ class Capture():
|
|
654
682
|
capturing_sub = False
|
655
683
|
try:
|
656
684
|
page = await self.context.new_page()
|
685
|
+
await page.clock.install()
|
657
686
|
except Error as e:
|
658
687
|
self.logger.warning(f'The context is in a broken state: {e}')
|
659
688
|
self.should_retry = True
|
@@ -671,7 +700,9 @@ class Capture():
|
|
671
700
|
await self.__dialog_alert_dialog_clickthrough(page)
|
672
701
|
await self.__dialog_clickthrough(page)
|
673
702
|
|
674
|
-
await stealth_async(page)
|
703
|
+
await stealth_async(page, PCStealthConfig())
|
704
|
+
# await stealth_async(page)
|
705
|
+
|
675
706
|
page.set_default_timeout((self._capture_timeout - 2) * 1000)
|
676
707
|
# trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
|
677
708
|
page.on("requestfinished", store_request)
|
@@ -718,6 +749,7 @@ class Capture():
|
|
718
749
|
else:
|
719
750
|
raise initial_error
|
720
751
|
else:
|
752
|
+
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
|
721
753
|
try:
|
722
754
|
await page.bring_to_front()
|
723
755
|
self.logger.debug('Page moved to front.')
|
@@ -753,20 +785,31 @@ class Capture():
|
|
753
785
|
# check if we have anything on the page. If we don't, the page is not working properly.
|
754
786
|
if await self._failsafe_get_content(page):
|
755
787
|
self.logger.debug('Got rendered content')
|
788
|
+
|
789
|
+
# move mouse
|
790
|
+
await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
|
791
|
+
self.logger.debug('Moved mouse.')
|
792
|
+
await self._wait_for_random_timeout(page, 5)
|
793
|
+
self.logger.debug('Keep going after moving mouse.')
|
794
|
+
|
756
795
|
if allow_tracking:
|
757
|
-
await self._wait_for_random_timeout(page,
|
796
|
+
await self._wait_for_random_timeout(page, 5)
|
758
797
|
# This event is required trigger the add_locator_handler
|
759
798
|
try:
|
760
799
|
if await page.locator("body").first.is_visible():
|
761
|
-
|
800
|
+
self.logger.debug('Got body.')
|
801
|
+
await page.locator("body").first.click(button="right",
|
802
|
+
timeout=5000,
|
803
|
+
delay=50)
|
804
|
+
self.logger.debug('Clicked on body.')
|
762
805
|
except Exception as e:
|
763
806
|
self.logger.warning(f'Could not find body: {e}')
|
764
807
|
|
765
|
-
#
|
766
|
-
await page.
|
767
|
-
|
768
|
-
await self._wait_for_random_timeout(page,
|
769
|
-
self.logger.
|
808
|
+
# fast forward 30s
|
809
|
+
await page.clock.run_for(10000)
|
810
|
+
await page.clock.resume()
|
811
|
+
await self._wait_for_random_timeout(page, 5) # Wait 5 sec
|
812
|
+
self.logger.warning('Moved time forward.')
|
770
813
|
|
771
814
|
if parsed_url.fragment:
|
772
815
|
# We got a fragment, make sure we go to it and scroll only a little bit.
|
@@ -774,7 +817,8 @@ class Capture():
|
|
774
817
|
try:
|
775
818
|
await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
|
776
819
|
await self._wait_for_random_timeout(page, 2)
|
777
|
-
|
820
|
+
async with timeout(3):
|
821
|
+
await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
|
778
822
|
self.logger.debug('Jumped to fragment.')
|
779
823
|
except PlaywrightTimeoutError as e:
|
780
824
|
self.logger.info(f'Unable to go to fragment "{fragment}" (timeout): {e}')
|
@@ -782,14 +826,20 @@ class Capture():
|
|
782
826
|
self.logger.warning(f'Target closed, unable to go to fragment "{fragment}": {e}')
|
783
827
|
except Error as e:
|
784
828
|
self.logger.exception(f'Unable to go to fragment "{fragment}": {e}')
|
829
|
+
except TimeoutError:
|
830
|
+
self.logger.debug('Unable to scroll due to timeout')
|
785
831
|
else:
|
786
832
|
# scroll more
|
787
833
|
try:
|
788
|
-
# NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes
|
789
|
-
|
834
|
+
# NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes.
|
835
|
+
# 2024-07-08: Also, it sometimes get stuck.
|
836
|
+
async with timeout(3):
|
837
|
+
await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
|
790
838
|
self.logger.debug('Scrolled down.')
|
791
839
|
except Error as e:
|
792
840
|
self.logger.debug(f'Unable to scroll: {e}')
|
841
|
+
except TimeoutError:
|
842
|
+
self.logger.debug('Unable to scroll due to timeout')
|
793
843
|
|
794
844
|
await self._wait_for_random_timeout(page, 3)
|
795
845
|
self.logger.debug('Keep going after moving on page.')
|
@@ -802,7 +852,6 @@ class Capture():
|
|
802
852
|
self.logger.debug('PageDown on keyboard')
|
803
853
|
except Error as e:
|
804
854
|
self.logger.debug(f'Unable to use keyboard: {e}')
|
805
|
-
|
806
855
|
if self.wait_for_download > 0:
|
807
856
|
self.logger.info('Waiting for download to finish...')
|
808
857
|
await self._safe_wait(page, 20)
|
@@ -993,12 +1042,12 @@ class Capture():
|
|
993
1042
|
async def _failsafe_get_screenshot(self, page: Page) -> bytes:
|
994
1043
|
self.logger.debug("Capturing a screenshot of the full page.")
|
995
1044
|
try:
|
996
|
-
return await page.screenshot(full_page=True, timeout=
|
1045
|
+
return await page.screenshot(full_page=True, timeout=10000)
|
997
1046
|
except Error as e:
|
998
1047
|
self.logger.info(f"Capturing a screenshot of the full page failed, trying to scale it down: {e}")
|
999
1048
|
|
1000
1049
|
try:
|
1001
|
-
return await page.screenshot(full_page=True, scale="css", timeout=
|
1050
|
+
return await page.screenshot(full_page=True, scale="css", timeout=30000)
|
1002
1051
|
except Error as e:
|
1003
1052
|
self.logger.info(f"Capturing a screenshot of the full page failed, trying to get the current viewport only: {e}")
|
1004
1053
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.25.0"
|
3
|
+
version = "1.25.1"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -28,7 +28,7 @@ SpeechRecognition = {version = "^3.10.4", optional = true}
|
|
28
28
|
pytz = {"version" = "^2024.1", python = "<3.9"}
|
29
29
|
tzdata = "^2024.1"
|
30
30
|
playwright-stealth = "^1.0.6"
|
31
|
-
setuptools = "^70.
|
31
|
+
setuptools = "^70.3.0"
|
32
32
|
puremagic = "^1.25"
|
33
33
|
async-timeout = {version = "^4.0.3", python = "<3.11"}
|
34
34
|
aiohttp = {extras = ["speedups"], version = "^3.9.5"}
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|