PlaywrightCapture 1.24.10__py3-none-any.whl → 1.25.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- playwrightcapture/capture.py +61 -27
- {playwrightcapture-1.24.10.dist-info → playwrightcapture-1.25.0.dist-info}/METADATA +5 -5
- playwrightcapture-1.25.0.dist-info/RECORD +9 -0
- playwrightcapture-1.24.10.dist-info/RECORD +0 -9
- {playwrightcapture-1.24.10.dist-info → playwrightcapture-1.25.0.dist-info}/LICENSE +0 -0
- {playwrightcapture-1.24.10.dist-info → playwrightcapture-1.25.0.dist-info}/WHEEL +0 -0
playwrightcapture/capture.py
CHANGED
@@ -474,7 +474,7 @@ class Capture():
|
|
474
474
|
if await page.locator("#didomi-notice-agree-button").is_visible():
|
475
475
|
await page.locator("#didomi-notice-agree-button").click(timeout=2000)
|
476
476
|
|
477
|
-
await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler)
|
477
|
+
await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler, times=1, no_wait_after=True)
|
478
478
|
self.logger.info('Didomi handler added')
|
479
479
|
|
480
480
|
async def __dialog_onetrust_clickthrough(self, page: Page) -> None:
|
@@ -484,7 +484,8 @@ class Capture():
|
|
484
484
|
|
485
485
|
await page.add_locator_handler(
|
486
486
|
page.locator('#onetrust-banner-sdk').last,
|
487
|
-
handler
|
487
|
+
handler,
|
488
|
+
times=1, no_wait_after=True
|
488
489
|
)
|
489
490
|
self.logger.info('OT handler added')
|
490
491
|
|
@@ -495,7 +496,8 @@ class Capture():
|
|
495
496
|
|
496
497
|
await page.add_locator_handler(
|
497
498
|
page.locator('#hs-eu-cookie-confirmation').last,
|
498
|
-
handler
|
499
|
+
handler,
|
500
|
+
times=1, no_wait_after=True
|
499
501
|
)
|
500
502
|
self.logger.info('HS handler added')
|
501
503
|
|
@@ -506,7 +508,8 @@ class Capture():
|
|
506
508
|
|
507
509
|
await page.add_locator_handler(
|
508
510
|
page.locator('#CybotCookiebotDialogBody'),
|
509
|
-
handler
|
511
|
+
handler,
|
512
|
+
times=1, no_wait_after=True
|
510
513
|
)
|
511
514
|
self.logger.info('Cookiebot handler added')
|
512
515
|
|
@@ -518,11 +521,12 @@ class Capture():
|
|
518
521
|
elif await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
|
519
522
|
await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").click(timeout=1000)
|
520
523
|
else:
|
521
|
-
self.logger.info('Consent window found, but no button to click through.')
|
524
|
+
self.logger.info('Consent window found (alert dialog), but no button to click through.')
|
522
525
|
|
523
526
|
await page.add_locator_handler(
|
524
527
|
page.get_by_role("alertdialog").last,
|
525
|
-
handler
|
528
|
+
handler,
|
529
|
+
times=1, no_wait_after=True
|
526
530
|
)
|
527
531
|
self.logger.info('alert dialog handler added')
|
528
532
|
|
@@ -538,21 +542,23 @@ class Capture():
|
|
538
542
|
self.logger.info('Consent window found, clicking through.')
|
539
543
|
await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
|
540
544
|
else:
|
541
|
-
self.logger.info('Consent window found, but no button to click through.')
|
545
|
+
self.logger.info('Consent window found (dialog), but no button to click through.')
|
542
546
|
await page.add_locator_handler(
|
543
547
|
page.get_by_role("dialog").last,
|
544
|
-
handler
|
548
|
+
handler,
|
549
|
+
times=1, no_wait_after=True
|
545
550
|
)
|
546
551
|
self.logger.info('dialog handler added')
|
547
552
|
|
548
553
|
async def __dialog_complianz_clickthrough(self, page: Page) -> None:
|
549
554
|
async def handler() -> None:
|
550
|
-
if await page.locator('.cmplz-show').locator("button.cmplz-accept").is_visible():
|
551
|
-
await page.locator('.cmplz-show').locator("button.cmplz-accept").click(timeout=2000)
|
555
|
+
if await page.locator('.cmplz-show').first.locator("button.cmplz-accept").is_visible():
|
556
|
+
await page.locator('.cmplz-show').first.locator("button.cmplz-accept").click(timeout=2000)
|
552
557
|
|
553
558
|
await page.add_locator_handler(
|
554
|
-
page.locator('.cmplz-show'),
|
555
|
-
handler
|
559
|
+
page.locator('.cmplz-show').first,
|
560
|
+
handler,
|
561
|
+
times=1, no_wait_after=True
|
556
562
|
)
|
557
563
|
self.logger.info('Complianz handler added')
|
558
564
|
|
@@ -563,7 +569,8 @@ class Capture():
|
|
563
569
|
|
564
570
|
await page.add_locator_handler(
|
565
571
|
page.locator('.con-wizard'),
|
566
|
-
handler
|
572
|
+
handler,
|
573
|
+
times=1, no_wait_after=True
|
567
574
|
)
|
568
575
|
self.logger.info('Yahoo handler added')
|
569
576
|
|
@@ -574,9 +581,10 @@ class Capture():
|
|
574
581
|
|
575
582
|
await page.add_locator_handler(
|
576
583
|
page.locator('#ppms_cm_popup_overlay'),
|
577
|
-
handler
|
584
|
+
handler,
|
585
|
+
times=1, no_wait_after=True
|
578
586
|
)
|
579
|
-
self.logger.info('
|
587
|
+
self.logger.info('Piwik handler added')
|
580
588
|
|
581
589
|
async def capture_page(self, url: str, *, max_depth_capture_time: int,
|
582
590
|
referer: str | None=None,
|
@@ -644,7 +652,13 @@ class Capture():
|
|
644
652
|
capturing_sub = True
|
645
653
|
else:
|
646
654
|
capturing_sub = False
|
647
|
-
|
655
|
+
try:
|
656
|
+
page = await self.context.new_page()
|
657
|
+
except Error as e:
|
658
|
+
self.logger.warning(f'The context is in a broken state: {e}')
|
659
|
+
self.should_retry = True
|
660
|
+
return to_return
|
661
|
+
|
648
662
|
if allow_tracking:
|
649
663
|
# Add authorization clickthroughs
|
650
664
|
await self.__dialog_didomi_clickthrough(page)
|
@@ -704,8 +718,12 @@ class Capture():
|
|
704
718
|
else:
|
705
719
|
raise initial_error
|
706
720
|
else:
|
707
|
-
|
708
|
-
|
721
|
+
try:
|
722
|
+
await page.bring_to_front()
|
723
|
+
self.logger.debug('Page moved to front.')
|
724
|
+
except Error as e:
|
725
|
+
self.logger.warning('Page in a broken state.')
|
726
|
+
raise e
|
709
727
|
|
710
728
|
# page instrumentation
|
711
729
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
|
@@ -738,8 +756,11 @@ class Capture():
|
|
738
756
|
if allow_tracking:
|
739
757
|
await self._wait_for_random_timeout(page, 2)
|
740
758
|
# This event is required to trigger the add_locator_handler
|
741
|
-
|
742
|
-
await page.locator("body").first.
|
759
|
+
try:
|
760
|
+
if await page.locator("body").first.is_visible():
|
761
|
+
await page.locator("body").first.click(button="right", timeout=5000)
|
762
|
+
except Exception as e:
|
763
|
+
self.logger.warning(f'Could not find body: {e}')
|
743
764
|
|
744
765
|
# move mouse
|
745
766
|
await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
|
@@ -901,15 +922,20 @@ class Capture():
|
|
901
922
|
'Navigation interrupted by another one',
|
902
923
|
'Navigation failed because page was closed!',
|
903
924
|
'Target page, context or browser has been closed',
|
904
|
-
'
|
925
|
+
'Peer failed to perform TLS handshake: A packet with illegal or unsupported version was received.',
|
905
926
|
'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
|
906
927
|
'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
|
907
928
|
'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
|
908
|
-
'Peer sent fatal TLS alert:
|
929
|
+
'Peer sent fatal TLS alert: Handshake failed',
|
909
930
|
'Peer sent fatal TLS alert: Internal error',
|
931
|
+
'Peer sent fatal TLS alert: The server name sent was not recognized',
|
910
932
|
'Load cannot follow more than 20 redirections',
|
911
933
|
'Page crashed',
|
912
|
-
'Error receiving data: Connection reset by peer'
|
934
|
+
'Error receiving data: Connection reset by peer',
|
935
|
+
'Internal SOCKSv5 proxy server error.',
|
936
|
+
'Host unreachable through SOCKSv5 server.',
|
937
|
+
'HTTP/2 Error: NO_ERROR',
|
938
|
+
'HTTP/2 Error: PROTOCOL_ERROR']:
|
913
939
|
# Other errors, let's give it another shot
|
914
940
|
self.logger.info(f'Issue with {url} (retrying): {e.message}')
|
915
941
|
self.should_retry = True
|
@@ -919,16 +945,18 @@ class Capture():
|
|
919
945
|
self.should_retry = True
|
920
946
|
elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS',
|
921
947
|
'net::ERR_BAD_SSL_CLIENT_AUTH_CERT',
|
922
|
-
'net::
|
923
|
-
|
948
|
+
'net::ERR_CERT_DATE_INVALID',
|
949
|
+
'net::ERR_UNEXPECTED_PROXY_AUTH',
|
950
|
+
'net::ERR_UNSAFE_PORT']:
|
951
|
+
# No need to retry, the credentials/certs are wrong/missing.
|
924
952
|
pass
|
925
|
-
elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to']]):
|
953
|
+
elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to', 'Page.bringToFront']]):
|
926
954
|
self.should_retry = True
|
927
955
|
elif e.name and any([msg in e.name for msg in ['Error resolving', 'Could not connect to']]):
|
928
956
|
pass
|
929
957
|
else:
|
930
958
|
# Unexpected ones
|
931
|
-
self.logger.exception(f'Something went poorly with {url}: {e.message}')
|
959
|
+
self.logger.exception(f'Something went poorly with {url}: "{e.name}" - {e.message}')
|
932
960
|
except Exception as e:
|
933
961
|
# we may get a non-playwright exception too.
|
934
962
|
# The ones we try to handle here should be treated as if they were.
|
@@ -1155,6 +1183,7 @@ class Capture():
|
|
1155
1183
|
'NS_ERROR_ABORT',
|
1156
1184
|
'NS_ERROR_CONNECTION_REFUSED',
|
1157
1185
|
'NS_ERROR_NET_INTERRUPT',
|
1186
|
+
'NS_ERROR_NET_PARTIAL_TRANSFER',
|
1158
1187
|
'NS_ERROR_NET_RESET',
|
1159
1188
|
'NS_ERROR_NET_TIMEOUT',
|
1160
1189
|
'NS_ERROR_REDIRECT_LOOP',
|
@@ -1170,8 +1199,13 @@ class Capture():
|
|
1170
1199
|
'net::ERR_EMPTY_RESPONSE',
|
1171
1200
|
'net::ERR_HTTP_RESPONSE_CODE_FAILURE',
|
1172
1201
|
'net::ERR_HTTP2_PROTOCOL_ERROR',
|
1202
|
+
'net::ERR_INVALID_REDIRECT',
|
1173
1203
|
'net::ERR_INVALID_RESPONSE',
|
1174
1204
|
'net::ERR_NAME_NOT_RESOLVED',
|
1205
|
+
'net::ERR_NETWORK_ACCESS_DENIED',
|
1206
|
+
'net::ERR_QUIC_PROTOCOL_ERROR',
|
1207
|
+
'net::ERR_RESPONSE_HEADERS_TRUNCATED',
|
1208
|
+
'net::ERR_SOCKET_NOT_CONNECTED',
|
1175
1209
|
'net::ERR_SOCKS_CONNECTION_FAILED',
|
1176
1210
|
'net::ERR_SSL_KEY_USAGE_INCOMPATIBLE',
|
1177
1211
|
'net::ERR_SSL_PROTOCOL_ERROR',
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.24.10
|
3
|
+
Version: 1.25.0
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -26,14 +26,14 @@ Requires-Dist: aiohttp[speedups] (>=3.9.5,<4.0.0)
|
|
26
26
|
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
27
27
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
|
28
28
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
29
|
-
Requires-Dist: playwright (>=1.
|
29
|
+
Requires-Dist: playwright (>=1.45.0,<2.0.0)
|
30
30
|
Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
31
|
-
Requires-Dist: puremagic (>=1.
|
31
|
+
Requires-Dist: puremagic (>=1.25,<2.0)
|
32
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
33
33
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
34
|
-
Requires-Dist: setuptools (>=
|
34
|
+
Requires-Dist: setuptools (>=70.2.0,<71.0.0)
|
35
35
|
Requires-Dist: tzdata (>=2024.1,<2025.0)
|
36
|
-
Requires-Dist: w3lib (>=2.1
|
36
|
+
Requires-Dist: w3lib (>=2.2.1,<3.0.0)
|
37
37
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
38
38
|
Description-Content-Type: text/markdown
|
39
39
|
|
@@ -0,0 +1,9 @@
|
|
1
|
+
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
+
playwrightcapture/capture.py,sha256=zzoZQItpKDbxpfF0PqFANfeWTmQlSwnvChuz_l1Ah-I,67333
|
3
|
+
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
|
+
playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
|
5
|
+
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
playwrightcapture-1.25.0.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
+
playwrightcapture-1.25.0.dist-info/METADATA,sha256=XBYGqQxi3Qvc-ktd1lLGFBfSKRmCLkH5UkbzNPeL8kA,3173
|
8
|
+
playwrightcapture-1.25.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
+
playwrightcapture-1.25.0.dist-info/RECORD,,
|
@@ -1,9 +0,0 @@
|
|
1
|
-
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
-
playwrightcapture/capture.py,sha256=SQKfPz_PoySwvW3GCMRTTsElYVgZ5c9lB55srIxis8s,65604
|
3
|
-
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
|
-
playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
|
5
|
-
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
playwrightcapture-1.24.10.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
-
playwrightcapture-1.24.10.dist-info/METADATA,sha256=wd_znffwaPZexymxHzn2Tzl2P0UHFHcbiWeZuSpyHpg,3174
|
8
|
-
playwrightcapture-1.24.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
-
playwrightcapture-1.24.10.dist-info/RECORD,,
|
File without changes
|
File without changes
|