PlaywrightCapture 1.24.5__py3-none-any.whl → 1.24.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- playwrightcapture/capture.py +52 -23
- {playwrightcapture-1.24.5.dist-info → playwrightcapture-1.24.7.dist-info}/METADATA +3 -2
- {playwrightcapture-1.24.5.dist-info → playwrightcapture-1.24.7.dist-info}/RECORD +5 -5
- {playwrightcapture-1.24.5.dist-info → playwrightcapture-1.24.7.dist-info}/LICENSE +0 -0
- {playwrightcapture-1.24.5.dist-info → playwrightcapture-1.24.7.dist-info}/WHEEL +0 -0
playwrightcapture/capture.py
CHANGED
@@ -43,6 +43,11 @@ else:
|
|
43
43
|
from zoneinfo import available_timezones
|
44
44
|
all_timezones_set = available_timezones()
|
45
45
|
|
46
|
+
if sys.version_info < (3, 11):
|
47
|
+
from async_timeout import timeout
|
48
|
+
else:
|
49
|
+
from asyncio import timeout
|
50
|
+
|
46
51
|
if TYPE_CHECKING:
|
47
52
|
from playwright._impl._api_structures import (SetCookieParam, Geolocation,
|
48
53
|
HttpCredentials, Headers,
|
@@ -469,36 +474,33 @@ class Capture():
|
|
469
474
|
if await page.locator("#didomi-notice-agree-button").is_visible():
|
470
475
|
await page.locator("#didomi-notice-agree-button").click(timeout=2000)
|
471
476
|
|
472
|
-
await page.add_locator_handler(page.locator(".didomi-popup-view"), handler)
|
477
|
+
await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler)
|
473
478
|
self.logger.info('Didomi handler added')
|
474
479
|
|
475
480
|
async def __dialog_onetrust_clickthrough(self, page: Page) -> None:
|
476
481
|
async def handler() -> None:
|
477
|
-
self.logger.info('######## OT Dialog found, clicking through.')
|
478
482
|
if await page.locator("#onetrust-accept-btn-handler").is_visible():
|
479
483
|
await page.locator("#onetrust-accept-btn-handler").click(timeout=2000)
|
480
484
|
|
481
485
|
await page.add_locator_handler(
|
482
|
-
page.locator('#onetrust-banner-sdk'),
|
486
|
+
page.locator('#onetrust-banner-sdk').last,
|
483
487
|
handler
|
484
488
|
)
|
485
489
|
self.logger.info('OT handler added')
|
486
490
|
|
487
491
|
async def __dialog_hubspot_clickthrough(self, page: Page) -> None:
|
488
492
|
async def handler() -> None:
|
489
|
-
self.logger.info('######## HS Dialog found, clicking through.')
|
490
493
|
if await page.locator("#hs-eu-confirmation-button").is_visible():
|
491
494
|
await page.locator("#hs-eu-confirmation-button").click(timeout=2000)
|
492
495
|
|
493
496
|
await page.add_locator_handler(
|
494
|
-
page.locator('#hs-eu-cookie-confirmation'),
|
497
|
+
page.locator('#hs-eu-cookie-confirmation').last,
|
495
498
|
handler
|
496
499
|
)
|
497
500
|
self.logger.info('HS handler added')
|
498
501
|
|
499
502
|
async def __dialog_cookiebot_clickthrough(self, page: Page) -> None:
|
500
503
|
async def handler() -> None:
|
501
|
-
self.logger.info('######## Cookiebot Dialog found, clicking through.')
|
502
504
|
if await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").is_visible():
|
503
505
|
await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").click(timeout=2000)
|
504
506
|
|
@@ -519,7 +521,7 @@ class Capture():
|
|
519
521
|
self.logger.info('Consent window found, but no button to click through.')
|
520
522
|
|
521
523
|
await page.add_locator_handler(
|
522
|
-
page.get_by_role("alertdialog"),
|
524
|
+
page.get_by_role("alertdialog").last,
|
523
525
|
handler
|
524
526
|
)
|
525
527
|
self.logger.info('alert dialog handler added')
|
@@ -545,7 +547,6 @@ class Capture():
|
|
545
547
|
|
546
548
|
async def __dialog_complianz_clickthrough(self, page: Page) -> None:
|
547
549
|
async def handler() -> None:
|
548
|
-
self.logger.info('######## Complianz found, clicking through.')
|
549
550
|
if await page.locator('.cmplz-show').locator("button.cmplz-accept").is_visible():
|
550
551
|
await page.locator('.cmplz-show').locator("button.cmplz-accept").click(timeout=2000)
|
551
552
|
|
@@ -557,7 +558,6 @@ class Capture():
|
|
557
558
|
|
558
559
|
async def __dialog_yahoo_clickthrough(self, page: Page) -> None:
|
559
560
|
async def handler() -> None:
|
560
|
-
self.logger.info('######## Yahoo found, clicking through.')
|
561
561
|
if await page.locator('.con-wizard').locator("button.accept-all").is_visible():
|
562
562
|
await page.locator('.con-wizard').locator("button.accept-all").click(timeout=2000)
|
563
563
|
|
@@ -569,7 +569,6 @@ class Capture():
|
|
569
569
|
|
570
570
|
async def __dialog_ppms_clickthrough(self, page: Page) -> None:
|
571
571
|
async def handler() -> None:
|
572
|
-
self.logger.info('######## piwik found, clicking through.')
|
573
572
|
if await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").is_visible():
|
574
573
|
await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").click(timeout=2000)
|
575
574
|
|
@@ -610,7 +609,11 @@ class Capture():
|
|
610
609
|
multiple_downloads.append((filename, file_content))
|
611
610
|
self.logger.info('Done with download.')
|
612
611
|
except Exception as e:
|
613
|
-
|
612
|
+
if download.page.is_closed():
|
613
|
+
# Page is closed, skip logging.
|
614
|
+
pass
|
615
|
+
else:
|
616
|
+
self.logger.warning(f'Unable to finish download triggered from JS: {e}')
|
614
617
|
finally:
|
615
618
|
self.wait_for_download -= 1
|
616
619
|
|
@@ -664,7 +667,6 @@ class Capture():
|
|
664
667
|
parsed_url = urlparse(url, allow_fragments=True)
|
665
668
|
|
666
669
|
try:
|
667
|
-
# NOTE 2022-12-02: allow 15s less than the general timeout to get a DOM
|
668
670
|
await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
|
669
671
|
page.on("download", handle_download)
|
670
672
|
except Error as initial_error:
|
@@ -829,24 +831,39 @@ class Capture():
|
|
829
831
|
else:
|
830
832
|
child_urls = child_urls[:max_captures]
|
831
833
|
self.logger.info(f'Capturing children, {max_captures} URLs')
|
834
|
+
consecutive_errors = 0
|
832
835
|
for index, url in enumerate(child_urls):
|
833
836
|
self.logger.info(f'Capture child {url} - Timeout: {max_capture_time}s')
|
834
837
|
start_time = time.time()
|
838
|
+
if page.is_closed():
|
839
|
+
self.logger.info('Page is closed, unable to capture children.')
|
840
|
+
break
|
835
841
|
try:
|
836
|
-
|
837
|
-
self.capture_page(
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
except (TimeoutError, asyncio.exceptions.TimeoutError):
|
842
|
+
async with timeout(max_capture_time + 1): # just adding a bit of padding so playwright has the chance to raise the exception first
|
843
|
+
child_capture = await self.capture_page(
|
844
|
+
url=url, referer=page.url,
|
845
|
+
page=page, depth=depth,
|
846
|
+
rendered_hostname_only=rendered_hostname_only,
|
847
|
+
max_depth_capture_time=max_capture_time)
|
848
|
+
to_return['children'].append(child_capture) # type: ignore[union-attr]
|
849
|
+
except (TimeoutError, asyncio.exceptions.TimeoutError, asyncio.TimeoutError):
|
844
850
|
self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
|
851
|
+
consecutive_errors += 1
|
845
852
|
except Exception as e:
|
846
853
|
self.logger.warning(f'Error while capturing child "{url}": {e}. {len(child_urls) - index - 1} more to go.')
|
854
|
+
consecutive_errors += 1
|
847
855
|
else:
|
856
|
+
consecutive_errors = 0
|
848
857
|
runtime = int(time.time() - start_time)
|
849
858
|
self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {len(child_urls) - index - 1} to go.')
|
859
|
+
|
860
|
+
if consecutive_errors >= 5:
|
861
|
+
# if we have more than 5 consecutive errors, the capture is most probably broken, breaking.
|
862
|
+
self.logger.warning('Got more than 5 consecutive errors while capturing children, breaking.')
|
863
|
+
to_return['error'] = "Got more than 5 consecutive errors while capturing children"
|
864
|
+
self.should_retry = True
|
865
|
+
break
|
866
|
+
|
850
867
|
try:
|
851
868
|
await page.go_back()
|
852
869
|
except PlaywrightTimeoutError:
|
@@ -880,12 +897,16 @@ class Capture():
|
|
880
897
|
self.should_retry = True
|
881
898
|
elif e.name in ['Download is starting',
|
882
899
|
'Connection closed',
|
900
|
+
'Connection terminated unexpectedly',
|
883
901
|
'Navigation interrupted by another one',
|
884
902
|
'Navigation failed because page was closed!',
|
903
|
+
'Target page, context or browser has been closed',
|
885
904
|
'Protocol error (Page.bringToFront): Not attached to an active page',
|
886
905
|
'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
|
887
906
|
'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
|
907
|
+
'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
|
888
908
|
'Peer sent fatal TLS alert: The server name sent was not recognized',
|
909
|
+
'Peer sent fatal TLS alert: Internal error',
|
889
910
|
'Load cannot follow more than 20 redirections',
|
890
911
|
'Page crashed',
|
891
912
|
'Error receiving data: Connection reset by peer']:
|
@@ -896,9 +917,15 @@ class Capture():
|
|
896
917
|
# The browser barfed, let's try again
|
897
918
|
self.logger.info(f'Browser barfed on {url} (retrying): {e.message}')
|
898
919
|
self.should_retry = True
|
899
|
-
elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS'
|
920
|
+
elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS',
|
921
|
+
'net::ERR_BAD_SSL_CLIENT_AUTH_CERT',
|
922
|
+
'net::ERR_UNEXPECTED_PROXY_AUTH']:
|
900
923
|
# No need to retry, the credentials are wrong/missing.
|
901
924
|
pass
|
925
|
+
elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to']]):
|
926
|
+
self.should_retry = True
|
927
|
+
elif e.name and any([msg in e.name for msg in ['Error resolving', 'Could not connect to']]):
|
928
|
+
pass
|
902
929
|
else:
|
903
930
|
# Unexpected ones
|
904
931
|
self.logger.exception(f'Something went poorly with {url}: {e.message}')
|
@@ -949,7 +976,7 @@ class Capture():
|
|
949
976
|
try:
|
950
977
|
return await page.screenshot(scale="css", animations='disabled', caret='initial', timeout=5000)
|
951
978
|
except Error as e:
|
952
|
-
self.logger.
|
979
|
+
self.logger.info(f"Unable to get any screenshot: {e}")
|
953
980
|
raise e
|
954
981
|
|
955
982
|
async def _safe_wait(self, page: Page, force_max_wait_in_sec: int | None=None) -> None:
|
@@ -1137,11 +1164,13 @@ class Capture():
|
|
1137
1164
|
'net::ERR_INVALID_RESPONSE',
|
1138
1165
|
'net::ERR_NAME_NOT_RESOLVED',
|
1139
1166
|
'net::ERR_SOCKS_CONNECTION_FAILED',
|
1167
|
+
'net::ERR_SSL_KEY_USAGE_INCOMPATIBLE',
|
1168
|
+
'net::ERR_SSL_PROTOCOL_ERROR',
|
1140
1169
|
'net::ERR_SSL_UNRECOGNIZED_NAME_ALERT',
|
1141
1170
|
'net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH',
|
1142
|
-
'net::ERR_SSL_PROTOCOL_ERROR',
|
1143
1171
|
'net::ERR_TIMED_OUT',
|
1144
1172
|
'net::ERR_TOO_MANY_REDIRECTS',
|
1173
|
+
'SSL_ERROR_UNKNOWN',
|
1145
1174
|
]:
|
1146
1175
|
return True
|
1147
1176
|
return False
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.24.5
|
3
|
+
Version: 1.24.7
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -21,11 +21,12 @@ Classifier: Topic :: Internet
|
|
21
21
|
Classifier: Topic :: Security
|
22
22
|
Provides-Extra: recaptcha
|
23
23
|
Requires-Dist: SpeechRecognition (>=3.10.3,<4.0.0) ; extra == "recaptcha"
|
24
|
+
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
24
25
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
|
25
26
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
26
27
|
Requires-Dist: playwright (>=1.43.0,<2.0.0)
|
27
28
|
Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
28
|
-
Requires-Dist: puremagic (>=1.
|
29
|
+
Requires-Dist: puremagic (>=1.22,<2.0)
|
29
30
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
30
31
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
31
32
|
Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
|
@@ -1,9 +1,9 @@
|
|
1
1
|
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
-
playwrightcapture/capture.py,sha256
|
2
|
+
playwrightcapture/capture.py,sha256=z_5IfCbJl3pmLKnm8YTcxVC0XFPXhaqL9TXLCSQ6t0s,64760
|
3
3
|
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
4
|
playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
|
5
5
|
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
playwrightcapture-1.24.5.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
-
playwrightcapture-1.24.
|
8
|
-
playwrightcapture-1.24.5.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
-
playwrightcapture-1.24.5.dist-info/RECORD,,
|
6
|
+
playwrightcapture-1.24.7.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
+
playwrightcapture-1.24.7.dist-info/METADATA,sha256=W2FGoCXAodeRAD_ydGPQd8r2KTAQZr1oPAOzOcFzsmU,3149
|
8
|
+
playwrightcapture-1.24.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
+
playwrightcapture-1.24.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|