PlaywrightCapture 1.25.9__py3-none-any.whl → 1.25.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- playwrightcapture/capture.py +79 -27
- {playwrightcapture-1.25.9.dist-info → playwrightcapture-1.25.11.dist-info}/METADATA +6 -6
- playwrightcapture-1.25.11.dist-info/RECORD +9 -0
- playwrightcapture-1.25.9.dist-info/RECORD +0 -9
- {playwrightcapture-1.25.9.dist-info → playwrightcapture-1.25.11.dist-info}/LICENSE +0 -0
- {playwrightcapture-1.25.9.dist-info → playwrightcapture-1.25.11.dist-info}/WHEEL +0 -0
playwrightcapture/capture.py
CHANGED
@@ -96,28 +96,40 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
|
96
96
|
return msg, kwargs
|
97
97
|
|
98
98
|
|
99
|
+
# good test pages:
|
100
|
+
# https://bot.incolumitas.com/
|
101
|
+
# https://fingerprintjs.github.io/BotD/main/
|
102
|
+
|
99
103
|
@dataclass
|
100
104
|
class PCStealthConfig(StealthConfig): # type: ignore[misc]
|
101
105
|
|
102
106
|
@property
|
103
107
|
def enabled_scripts(self) -> Generator[str, None, None]:
|
104
|
-
self.webdriver = True
|
105
|
-
self.webgl_vendor = True
|
106
108
|
self.chrome_app = True
|
107
109
|
self.chrome_csi = True
|
108
|
-
self.chrome_load_times = True
|
109
110
|
self.chrome_runtime = True
|
111
|
+
self.chrome_load_times = True
|
112
|
+
self.navigator_plugins = True
|
113
|
+
self.hairline = True
|
110
114
|
self.iframe_content_window = True
|
111
115
|
self.media_codecs = True
|
116
|
+
|
117
|
+
# permissions are handled directly in playwright
|
118
|
+
self.navigator_permissions = False
|
119
|
+
# Platform is correct now
|
120
|
+
self.navigator_platform = False
|
121
|
+
# probably useless, but it will fallback to 4 regardless
|
112
122
|
self.navigator_hardware_concurrency = 4
|
123
|
+
# Webgl vendor is correct now
|
124
|
+
self.webgl_vendor = False
|
125
|
+
# Set by the viewport
|
126
|
+
self.outerdimensions = False
|
127
|
+
|
128
|
+
# Not working with Playwright 1.45+
|
113
129
|
self.navigator_languages = False # Causes issue
|
114
|
-
self.navigator_permissions = True
|
115
|
-
self.navigator_platform = True
|
116
|
-
self.navigator_plugins = True
|
117
130
|
self.navigator_user_agent = False # Causes issues
|
118
131
|
self.navigator_vendor = False # Causes issues
|
119
|
-
|
120
|
-
self.hairline = True
|
132
|
+
|
121
133
|
yield from super().enabled_scripts
|
122
134
|
|
123
135
|
|
@@ -184,7 +196,7 @@ class Capture():
|
|
184
196
|
self._viewport: ViewportSize | None = None
|
185
197
|
self._user_agent: str = ''
|
186
198
|
self._timezone_id: str = ''
|
187
|
-
self._locale: str = ''
|
199
|
+
self._locale: str = 'en-US'
|
188
200
|
self._color_scheme: Literal['dark', 'light', 'no-preference', 'null'] | None = None
|
189
201
|
|
190
202
|
def __prepare_proxy_playwright(self, proxy: str) -> ProxySettings:
|
@@ -462,21 +474,22 @@ class Capture():
|
|
462
474
|
# NOTE: Which perms are supported by which browsers varies
|
463
475
|
# See https://github.com/microsoft/playwright/issues/16577
|
464
476
|
chromium_permissions = [
|
465
|
-
'geolocation',
|
466
|
-
'midi',
|
467
|
-
'midi-sysex',
|
468
|
-
'notifications',
|
469
|
-
'camera',
|
470
|
-
'microphone',
|
471
|
-
'background-sync',
|
472
|
-
'ambient-light-sensor',
|
473
477
|
'accelerometer',
|
474
|
-
'gyroscope',
|
475
|
-
'magnetometer',
|
476
478
|
'accessibility-events',
|
479
|
+
'ambient-light-sensor',
|
480
|
+
'background-sync',
|
481
|
+
'camera',
|
477
482
|
'clipboard-read',
|
478
483
|
'clipboard-write',
|
479
|
-
'
|
484
|
+
'geolocation',
|
485
|
+
'gyroscope',
|
486
|
+
'magnetometer',
|
487
|
+
'microphone',
|
488
|
+
'midi-sysex',
|
489
|
+
'midi',
|
490
|
+
'notifications',
|
491
|
+
'payment-handler',
|
492
|
+
'storage-access'
|
480
493
|
]
|
481
494
|
|
482
495
|
firefox_permissions = ['geolocation', 'notifications']
|
@@ -588,6 +601,8 @@ class Capture():
|
|
588
601
|
elif await page.get_by_test_id("uc-accept-all-button").is_visible():
|
589
602
|
self.logger.info('Consent window found, clicking through.')
|
590
603
|
await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
|
604
|
+
elif await page.locator('#axeptio_btn_acceptAll').is_visible():
|
605
|
+
await page.locator('#axeptio_btn_acceptAll').click(timeout=2000)
|
591
606
|
else:
|
592
607
|
self.logger.info('Consent window found (dialog), but no button to click through.')
|
593
608
|
await page.add_locator_handler(
|
@@ -633,6 +648,24 @@ class Capture():
|
|
633
648
|
)
|
634
649
|
self.logger.info('Piwik handler added')
|
635
650
|
|
651
|
+
async def __frame_consent(self, frame: Frame) -> bool:
|
652
|
+
"""Search & Click content in iframes. Cannot easily use the locator handler for this without having many many handlers.
|
653
|
+
And the iframes don't have a title or a role to easily identify them so we just try with generic locators that vary by language."""
|
654
|
+
got_button: bool = False
|
655
|
+
if await frame.get_by_label("Alle akzeptieren").is_visible():
|
656
|
+
got_button = True
|
657
|
+
await frame.get_by_label("Alle akzeptieren").click(timeout=2000)
|
658
|
+
elif await frame.get_by_label("Accept & continue").is_visible():
|
659
|
+
got_button = True
|
660
|
+
await frame.get_by_label("Accept & continue").click(timeout=2000)
|
661
|
+
elif await frame.get_by_label("Accepter et continuer").is_visible():
|
662
|
+
got_button = True
|
663
|
+
await frame.get_by_label("Accepter et continuer").click(timeout=2000)
|
664
|
+
elif await frame.get_by_label("Accepteer").is_visible():
|
665
|
+
got_button = True
|
666
|
+
await frame.get_by_label("Accepteer").click(timeout=2000)
|
667
|
+
return got_button
|
668
|
+
|
636
669
|
async def capture_page(self, url: str, *, max_depth_capture_time: int,
|
637
670
|
referer: str | None=None,
|
638
671
|
page: Page | None=None, depth: int=0,
|
@@ -720,7 +753,6 @@ class Capture():
|
|
720
753
|
await self.__dialog_clickthrough(page)
|
721
754
|
|
722
755
|
await stealth_async(page, PCStealthConfig())
|
723
|
-
# await stealth_async(page)
|
724
756
|
|
725
757
|
page.set_default_timeout((self._capture_timeout - 2) * 1000)
|
726
758
|
# trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
|
@@ -799,6 +831,8 @@ class Capture():
|
|
799
831
|
self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
|
800
832
|
except Error as e:
|
801
833
|
self.logger.warning(f'Error while resolving captcha on {url}: {e}')
|
834
|
+
except (TimeoutError, asyncio.TimeoutError) as e:
|
835
|
+
self.logger.warning(f'[Timeout] Error while resolving captcha on {url}: {e}')
|
802
836
|
except Exception as e:
|
803
837
|
self.logger.exception(f'General error with captcha solving on {url}: {e}')
|
804
838
|
# ======
|
@@ -897,16 +931,32 @@ class Capture():
|
|
897
931
|
# self.logger.warning('Unable to move time forward.')
|
898
932
|
|
899
933
|
self.logger.debug('Done with instrumentation, waiting for network idle.')
|
934
|
+
if allow_tracking:
|
935
|
+
self.logger.debug('Check iFrames for button')
|
936
|
+
for frame in page.frames:
|
937
|
+
frame_title = await frame.title()
|
938
|
+
self.logger.debug(f'Check button on {frame_title}')
|
939
|
+
if await self.__frame_consent(frame):
|
940
|
+
self.logger.debug(f'Got button on {frame_title}')
|
941
|
+
await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
|
942
|
+
self.logger.debug('Done with iFrames.')
|
943
|
+
|
900
944
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
|
901
945
|
await self._safe_wait(page)
|
946
|
+
|
902
947
|
self.logger.debug('Done with instrumentation, done with waiting.')
|
903
948
|
|
904
949
|
if content := await self._failsafe_get_content(page):
|
905
950
|
to_return['html'] = content
|
906
951
|
|
907
952
|
if 'html' in to_return and to_return['html'] is not None and with_favicon:
|
908
|
-
|
909
|
-
|
953
|
+
try:
|
954
|
+
to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html'])
|
955
|
+
got_favicons = True
|
956
|
+
except (TimeoutError, asyncio.TimeoutError) as e:
|
957
|
+
self.logger.warning(f'[Timeout] Unable to get favicons: {e}')
|
958
|
+
except Exception as e:
|
959
|
+
self.logger.warning(f'Unable to get favicons: {e}')
|
910
960
|
|
911
961
|
to_return['last_redirected_url'] = page.url
|
912
962
|
to_return['png'] = await self._failsafe_get_screenshot(page)
|
@@ -1209,7 +1259,8 @@ class Capture():
|
|
1209
1259
|
if self.proxy and self.proxy.get('server'):
|
1210
1260
|
connector = ProxyConnector.from_url(self.proxy['server'])
|
1211
1261
|
|
1212
|
-
|
1262
|
+
timeout = aiohttp.ClientTimeout(total=10)
|
1263
|
+
async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
|
1213
1264
|
while True:
|
1214
1265
|
try:
|
1215
1266
|
href = await main_frame.get_by_role("link", name="Alternatively, download audio as MP3").get_attribute("href")
|
@@ -1219,7 +1270,7 @@ class Capture():
|
|
1219
1270
|
if not href:
|
1220
1271
|
self.logger.warning('Unable to find download link for captcha.')
|
1221
1272
|
return False
|
1222
|
-
async with session.get(href,
|
1273
|
+
async with session.get(href, ssl=False) as response:
|
1223
1274
|
response.raise_for_status()
|
1224
1275
|
mp3_content = await response.read()
|
1225
1276
|
with NamedTemporaryFile() as mp3_file, NamedTemporaryFile() as wav_file:
|
@@ -1417,7 +1468,8 @@ class Capture():
|
|
1417
1468
|
return set()
|
1418
1469
|
to_fetch, to_return = extracted_favicons
|
1419
1470
|
to_fetch.add('/favicon.ico')
|
1420
|
-
|
1471
|
+
timeout = aiohttp.ClientTimeout(total=10)
|
1472
|
+
async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
|
1421
1473
|
session.headers['user-agent'] = self.user_agent
|
1422
1474
|
for u in to_fetch:
|
1423
1475
|
try:
|
@@ -1427,7 +1479,7 @@ class Capture():
|
|
1427
1479
|
if url_to_fetch in self._requests:
|
1428
1480
|
favicon = self._requests[url_to_fetch]
|
1429
1481
|
if not favicon:
|
1430
|
-
async with session.get(url_to_fetch,
|
1482
|
+
async with session.get(url_to_fetch, ssl=False) as favicon_response:
|
1431
1483
|
favicon_response.raise_for_status()
|
1432
1484
|
favicon = await favicon_response.read()
|
1433
1485
|
if favicon:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.25.9
|
3
|
+
Version: 1.25.11
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -21,17 +21,17 @@ Classifier: Topic :: Internet
|
|
21
21
|
Classifier: Topic :: Security
|
22
22
|
Provides-Extra: recaptcha
|
23
23
|
Requires-Dist: SpeechRecognition (>=3.10.4,<4.0.0) ; extra == "recaptcha"
|
24
|
-
Requires-Dist: aiohttp-socks (>=0.
|
25
|
-
Requires-Dist: aiohttp[speedups] (>=3.
|
24
|
+
Requires-Dist: aiohttp-socks (>=0.9,<0.10)
|
25
|
+
Requires-Dist: aiohttp[speedups] (>=3.10.3,<4.0.0)
|
26
26
|
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
27
27
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
|
28
28
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
29
|
-
Requires-Dist: playwright (>=1.
|
29
|
+
Requires-Dist: playwright (>=1.46.0,<2.0.0)
|
30
30
|
Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
31
|
-
Requires-Dist: puremagic (>=1.
|
31
|
+
Requires-Dist: puremagic (>=1.27,<2.0)
|
32
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
33
33
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
34
|
-
Requires-Dist: setuptools (>=
|
34
|
+
Requires-Dist: setuptools (>=72.1.0,<73.0.0)
|
35
35
|
Requires-Dist: tzdata (>=2024.1,<2025.0)
|
36
36
|
Requires-Dist: w3lib (>=2.2.1,<3.0.0)
|
37
37
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
@@ -0,0 +1,9 @@
|
|
1
|
+
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
+
playwrightcapture/capture.py,sha256=Rmo_EVRlR9btsgE2H99OtGPRZwIe8RVq-JCc2GzUWiI,74446
|
3
|
+
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
|
+
playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
|
5
|
+
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
playwrightcapture-1.25.11.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
+
playwrightcapture-1.25.11.dist-info/METADATA,sha256=nGuO6TAlz2lKM15HiIgZJ4iERLBO_AXNBBpgqo8nfhM,3172
|
8
|
+
playwrightcapture-1.25.11.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
+
playwrightcapture-1.25.11.dist-info/RECORD,,
|
@@ -1,9 +0,0 @@
|
|
1
|
-
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
-
playwrightcapture/capture.py,sha256=uS8e87-7jl8F7TgfzhKhlV4pGf8n6twu9rVzzlqIhXM,71671
|
3
|
-
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
|
-
playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
|
5
|
-
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
playwrightcapture-1.25.9.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
-
playwrightcapture-1.25.9.dist-info/METADATA,sha256=7ds0ymzTNfkYLajoJTc-t1G4wOhPHRbxmg6CCZkMtUE,3173
|
8
|
-
playwrightcapture-1.25.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
-
playwrightcapture-1.25.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|