PlaywrightCapture 1.25.0__tar.gz → 1.25.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.2}/PKG-INFO +2 -2
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.2}/playwrightcapture/capture.py +66 -19
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.2}/pyproject.toml +2 -2
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.2}/LICENSE +0 -0
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.2}/README.md +0 -0
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.2}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.2}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.2}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.25.0 → playwrightcapture-1.25.2}/playwrightcapture/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.25.0
|
3
|
+
Version: 1.25.2
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -31,7 +31,7 @@ Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
|
31
31
|
Requires-Dist: puremagic (>=1.25,<2.0)
|
32
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
33
33
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
34
|
-
Requires-Dist: setuptools (>=70.
|
34
|
+
Requires-Dist: setuptools (>=70.3.0,<71.0.0)
|
35
35
|
Requires-Dist: tzdata (>=2024.1,<2025.0)
|
36
36
|
Requires-Dist: w3lib (>=2.2.1,<3.0.0)
|
37
37
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
@@ -13,10 +13,11 @@ import sys
|
|
13
13
|
import time
|
14
14
|
|
15
15
|
from base64 import b64decode
|
16
|
+
from dataclasses import dataclass
|
16
17
|
from io import BytesIO
|
17
18
|
from logging import LoggerAdapter, Logger
|
18
19
|
from tempfile import NamedTemporaryFile
|
19
|
-
from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping
|
20
|
+
from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping, Generator
|
20
21
|
from urllib.parse import urlparse, unquote, urljoin
|
21
22
|
from zipfile import ZipFile
|
22
23
|
|
@@ -30,7 +31,7 @@ from charset_normalizer import from_bytes
|
|
30
31
|
from playwright._impl._errors import TargetClosedError
|
31
32
|
from playwright.async_api import async_playwright, Frame, Error, Page, Download, Request
|
32
33
|
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
33
|
-
from playwright_stealth import stealth_async # type: ignore[import-untyped]
|
34
|
+
from playwright_stealth import stealth_async, StealthConfig # type: ignore[import-untyped]
|
34
35
|
from puremagic import PureError, from_string # type: ignore[import-untyped]
|
35
36
|
from w3lib.html import strip_html5_whitespace
|
36
37
|
from w3lib.url import canonicalize_url, safe_url_string
|
@@ -95,6 +96,31 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
|
95
96
|
return msg, kwargs
|
96
97
|
|
97
98
|
|
99
|
+
@dataclass
|
100
|
+
class PCStealthConfig(StealthConfig): # type: ignore[misc]
|
101
|
+
|
102
|
+
@property
|
103
|
+
def enabled_scripts(self) -> Generator[str, None, None]:
|
104
|
+
self.webdriver = True
|
105
|
+
self.webgl_vendor = True
|
106
|
+
self.chrome_app = True
|
107
|
+
self.chrome_csi = True
|
108
|
+
self.chrome_load_times = True
|
109
|
+
self.chrome_runtime = True
|
110
|
+
self.iframe_content_window = True
|
111
|
+
self.media_codecs = True
|
112
|
+
self.navigator_hardware_concurrency = 4
|
113
|
+
self.navigator_languages = False # Causes issue
|
114
|
+
self.navigator_permissions = True
|
115
|
+
self.navigator_platform = True
|
116
|
+
self.navigator_plugins = True
|
117
|
+
self.navigator_user_agent = False # Causes issues
|
118
|
+
self.navigator_vendor = False # Causes issues
|
119
|
+
self.outerdimensions = True
|
120
|
+
self.hairline = True
|
121
|
+
yield from super().enabled_scripts
|
122
|
+
|
123
|
+
|
98
124
|
class Capture():
|
99
125
|
|
100
126
|
_browsers: list[BROWSER] = ['chromium', 'firefox', 'webkit']
|
@@ -171,7 +197,8 @@ class Capture():
|
|
171
197
|
raise UnknownPlaywrightBrowser(f'Incorrect browser name {self.browser_name}, must be in {", ".join(self._browsers)}')
|
172
198
|
|
173
199
|
self.browser = await self.playwright[self.browser_name].launch(
|
174
|
-
proxy=self.proxy if self.proxy else None
|
200
|
+
proxy=self.proxy if self.proxy else None,
|
201
|
+
# headless=False
|
175
202
|
)
|
176
203
|
|
177
204
|
# Set of URLs that were captured in that context
|
@@ -385,6 +412,7 @@ class Capture():
|
|
385
412
|
self.context = await self.browser.new_context(
|
386
413
|
record_har_path=self._temp_harfile.name,
|
387
414
|
ignore_https_errors=True,
|
415
|
+
bypass_csp=True,
|
388
416
|
http_credentials=self.http_credentials if self.http_credentials else None,
|
389
417
|
user_agent=self.user_agent if self.user_agent else device_context_settings.pop('user_agent', None),
|
390
418
|
locale=self.locale if self.locale else None,
|
@@ -472,7 +500,7 @@ class Capture():
|
|
472
500
|
async def handler() -> None:
|
473
501
|
self.logger.debug('Didomi dialog found, clicking through.')
|
474
502
|
if await page.locator("#didomi-notice-agree-button").is_visible():
|
475
|
-
await page.locator("#didomi-notice-agree-button").click(timeout=
|
503
|
+
await page.locator("#didomi-notice-agree-button").click(timeout=30000)
|
476
504
|
|
477
505
|
await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler, times=1, no_wait_after=True)
|
478
506
|
self.logger.info('Didomi handler added')
|
@@ -654,6 +682,7 @@ class Capture():
|
|
654
682
|
capturing_sub = False
|
655
683
|
try:
|
656
684
|
page = await self.context.new_page()
|
685
|
+
await page.clock.install()
|
657
686
|
except Error as e:
|
658
687
|
self.logger.warning(f'The context is in a broken state: {e}')
|
659
688
|
self.should_retry = True
|
@@ -671,7 +700,9 @@ class Capture():
|
|
671
700
|
await self.__dialog_alert_dialog_clickthrough(page)
|
672
701
|
await self.__dialog_clickthrough(page)
|
673
702
|
|
674
|
-
await stealth_async(page)
|
703
|
+
await stealth_async(page, PCStealthConfig())
|
704
|
+
# await stealth_async(page)
|
705
|
+
|
675
706
|
page.set_default_timeout((self._capture_timeout - 2) * 1000)
|
676
707
|
# trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
|
677
708
|
page.on("requestfinished", store_request)
|
@@ -718,6 +749,7 @@ class Capture():
|
|
718
749
|
else:
|
719
750
|
raise initial_error
|
720
751
|
else:
|
752
|
+
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
|
721
753
|
try:
|
722
754
|
await page.bring_to_front()
|
723
755
|
self.logger.debug('Page moved to front.')
|
@@ -753,28 +785,34 @@ class Capture():
|
|
753
785
|
# check if we have anything on the page. If we don't, the page is not working properly.
|
754
786
|
if await self._failsafe_get_content(page):
|
755
787
|
self.logger.debug('Got rendered content')
|
788
|
+
|
789
|
+
# move mouse
|
790
|
+
await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
|
791
|
+
self.logger.debug('Moved mouse.')
|
792
|
+
await self._wait_for_random_timeout(page, 5)
|
793
|
+
self.logger.debug('Keep going after moving mouse.')
|
794
|
+
|
756
795
|
if allow_tracking:
|
757
|
-
await self._wait_for_random_timeout(page,
|
796
|
+
await self._wait_for_random_timeout(page, 5)
|
758
797
|
# This event is required trigger the add_locator_handler
|
759
798
|
try:
|
760
799
|
if await page.locator("body").first.is_visible():
|
761
|
-
|
800
|
+
self.logger.debug('Got body.')
|
801
|
+
await page.locator("body").first.click(button="right",
|
802
|
+
timeout=5000,
|
803
|
+
delay=50)
|
804
|
+
self.logger.debug('Clicked on body.')
|
762
805
|
except Exception as e:
|
763
806
|
self.logger.warning(f'Could not find body: {e}')
|
764
807
|
|
765
|
-
# move mouse
|
766
|
-
await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
|
767
|
-
self.logger.debug('Moved mouse.')
|
768
|
-
await self._wait_for_random_timeout(page, 2)
|
769
|
-
self.logger.debug('Keep going after moving mouse.')
|
770
|
-
|
771
808
|
if parsed_url.fragment:
|
772
809
|
# We got a fragment, make sure we go to it and scroll only a little bit.
|
773
810
|
fragment = unquote(parsed_url.fragment)
|
774
811
|
try:
|
775
812
|
await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
|
776
813
|
await self._wait_for_random_timeout(page, 2)
|
777
|
-
|
814
|
+
async with timeout(3):
|
815
|
+
await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
|
778
816
|
self.logger.debug('Jumped to fragment.')
|
779
817
|
except PlaywrightTimeoutError as e:
|
780
818
|
self.logger.info(f'Unable to go to fragment "{fragment}" (timeout): {e}')
|
@@ -782,14 +820,20 @@ class Capture():
|
|
782
820
|
self.logger.warning(f'Target closed, unable to go to fragment "{fragment}": {e}')
|
783
821
|
except Error as e:
|
784
822
|
self.logger.exception(f'Unable to go to fragment "{fragment}": {e}')
|
823
|
+
except TimeoutError:
|
824
|
+
self.logger.debug('Unable to scroll due to timeout')
|
785
825
|
else:
|
786
826
|
# scroll more
|
787
827
|
try:
|
788
|
-
# NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes
|
789
|
-
|
828
|
+
# NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes.
|
829
|
+
# 2024-07-08: Also, it sometimes get stuck.
|
830
|
+
async with timeout(3):
|
831
|
+
await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
|
790
832
|
self.logger.debug('Scrolled down.')
|
791
833
|
except Error as e:
|
792
834
|
self.logger.debug(f'Unable to scroll: {e}')
|
835
|
+
except TimeoutError:
|
836
|
+
self.logger.debug('Unable to scroll due to timeout')
|
793
837
|
|
794
838
|
await self._wait_for_random_timeout(page, 3)
|
795
839
|
self.logger.debug('Keep going after moving on page.')
|
@@ -802,7 +846,6 @@ class Capture():
|
|
802
846
|
self.logger.debug('PageDown on keyboard')
|
803
847
|
except Error as e:
|
804
848
|
self.logger.debug(f'Unable to use keyboard: {e}')
|
805
|
-
|
806
849
|
if self.wait_for_download > 0:
|
807
850
|
self.logger.info('Waiting for download to finish...')
|
808
851
|
await self._safe_wait(page, 20)
|
@@ -821,6 +864,10 @@ class Capture():
|
|
821
864
|
z.writestr(f'{i}_{filename}', file_content)
|
822
865
|
to_return["downloaded_file"] = mem_zip.getvalue()
|
823
866
|
|
867
|
+
# fast forward 30s
|
868
|
+
await page.clock.run_for("30")
|
869
|
+
self.logger.debug('Moved time forward.')
|
870
|
+
|
824
871
|
self.logger.debug('Done with instrumentation, waiting for network idle.')
|
825
872
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
|
826
873
|
await self._safe_wait(page)
|
@@ -993,12 +1040,12 @@ class Capture():
|
|
993
1040
|
async def _failsafe_get_screenshot(self, page: Page) -> bytes:
|
994
1041
|
self.logger.debug("Capturing a screenshot of the full page.")
|
995
1042
|
try:
|
996
|
-
return await page.screenshot(full_page=True, timeout=
|
1043
|
+
return await page.screenshot(full_page=True, timeout=10000)
|
997
1044
|
except Error as e:
|
998
1045
|
self.logger.info(f"Capturing a screenshot of the full page failed, trying to scale it down: {e}")
|
999
1046
|
|
1000
1047
|
try:
|
1001
|
-
return await page.screenshot(full_page=True, scale="css", timeout=
|
1048
|
+
return await page.screenshot(full_page=True, scale="css", timeout=30000)
|
1002
1049
|
except Error as e:
|
1003
1050
|
self.logger.info(f"Capturing a screenshot of the full page failed, trying to get the current viewport only: {e}")
|
1004
1051
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.25.0"
|
3
|
+
version = "1.25.2"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -28,7 +28,7 @@ SpeechRecognition = {version = "^3.10.4", optional = true}
|
|
28
28
|
pytz = {"version" = "^2024.1", python = "<3.9"}
|
29
29
|
tzdata = "^2024.1"
|
30
30
|
playwright-stealth = "^1.0.6"
|
31
|
-
setuptools = "^70.
|
31
|
+
setuptools = "^70.3.0"
|
32
32
|
puremagic = "^1.25"
|
33
33
|
async-timeout = {version = "^4.0.3", python = "<3.11"}
|
34
34
|
aiohttp = {extras = ["speedups"], version = "^3.9.5"}
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|