PlaywrightCapture 1.24.5__tar.gz → 1.24.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.24.5 → playwrightcapture-1.24.6}/PKG-INFO +1 -1
- {playwrightcapture-1.24.5 → playwrightcapture-1.24.6}/playwrightcapture/capture.py +39 -21
- {playwrightcapture-1.24.5 → playwrightcapture-1.24.6}/pyproject.toml +3 -3
- {playwrightcapture-1.24.5 → playwrightcapture-1.24.6}/LICENSE +0 -0
- {playwrightcapture-1.24.5 → playwrightcapture-1.24.6}/README.md +0 -0
- {playwrightcapture-1.24.5 → playwrightcapture-1.24.6}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.24.5 → playwrightcapture-1.24.6}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.24.5 → playwrightcapture-1.24.6}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.24.5 → playwrightcapture-1.24.6}/playwrightcapture/py.typed +0 -0
@@ -43,6 +43,11 @@ else:
|
|
43
43
|
from zoneinfo import available_timezones
|
44
44
|
all_timezones_set = available_timezones()
|
45
45
|
|
46
|
+
if sys.version_info < (3, 11):
|
47
|
+
from async_timeout import timeout
|
48
|
+
else:
|
49
|
+
from asyncio import timeout
|
50
|
+
|
46
51
|
if TYPE_CHECKING:
|
47
52
|
from playwright._impl._api_structures import (SetCookieParam, Geolocation,
|
48
53
|
HttpCredentials, Headers,
|
@@ -469,36 +474,33 @@ class Capture():
|
|
469
474
|
if await page.locator("#didomi-notice-agree-button").is_visible():
|
470
475
|
await page.locator("#didomi-notice-agree-button").click(timeout=2000)
|
471
476
|
|
472
|
-
await page.add_locator_handler(page.locator(".didomi-popup-view"), handler)
|
477
|
+
await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler)
|
473
478
|
self.logger.info('Didomi handler added')
|
474
479
|
|
475
480
|
async def __dialog_onetrust_clickthrough(self, page: Page) -> None:
|
476
481
|
async def handler() -> None:
|
477
|
-
self.logger.info('######## OT Dialog found, clicking through.')
|
478
482
|
if await page.locator("#onetrust-accept-btn-handler").is_visible():
|
479
483
|
await page.locator("#onetrust-accept-btn-handler").click(timeout=2000)
|
480
484
|
|
481
485
|
await page.add_locator_handler(
|
482
|
-
page.locator('#onetrust-banner-sdk'),
|
486
|
+
page.locator('#onetrust-banner-sdk').last,
|
483
487
|
handler
|
484
488
|
)
|
485
489
|
self.logger.info('OT handler added')
|
486
490
|
|
487
491
|
async def __dialog_hubspot_clickthrough(self, page: Page) -> None:
|
488
492
|
async def handler() -> None:
|
489
|
-
self.logger.info('######## HS Dialog found, clicking through.')
|
490
493
|
if await page.locator("#hs-eu-confirmation-button").is_visible():
|
491
494
|
await page.locator("#hs-eu-confirmation-button").click(timeout=2000)
|
492
495
|
|
493
496
|
await page.add_locator_handler(
|
494
|
-
page.locator('#hs-eu-cookie-confirmation'),
|
497
|
+
page.locator('#hs-eu-cookie-confirmation').last,
|
495
498
|
handler
|
496
499
|
)
|
497
500
|
self.logger.info('HS handler added')
|
498
501
|
|
499
502
|
async def __dialog_cookiebot_clickthrough(self, page: Page) -> None:
|
500
503
|
async def handler() -> None:
|
501
|
-
self.logger.info('######## Cookiebot Dialog found, clicking through.')
|
502
504
|
if await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").is_visible():
|
503
505
|
await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").click(timeout=2000)
|
504
506
|
|
@@ -519,7 +521,7 @@ class Capture():
|
|
519
521
|
self.logger.info('Consent window found, but no button to click through.')
|
520
522
|
|
521
523
|
await page.add_locator_handler(
|
522
|
-
page.get_by_role("alertdialog"),
|
524
|
+
page.get_by_role("alertdialog").last,
|
523
525
|
handler
|
524
526
|
)
|
525
527
|
self.logger.info('alert dialog handler added')
|
@@ -545,7 +547,6 @@ class Capture():
|
|
545
547
|
|
546
548
|
async def __dialog_complianz_clickthrough(self, page: Page) -> None:
|
547
549
|
async def handler() -> None:
|
548
|
-
self.logger.info('######## Complianz found, clicking through.')
|
549
550
|
if await page.locator('.cmplz-show').locator("button.cmplz-accept").is_visible():
|
550
551
|
await page.locator('.cmplz-show').locator("button.cmplz-accept").click(timeout=2000)
|
551
552
|
|
@@ -557,7 +558,6 @@ class Capture():
|
|
557
558
|
|
558
559
|
async def __dialog_yahoo_clickthrough(self, page: Page) -> None:
|
559
560
|
async def handler() -> None:
|
560
|
-
self.logger.info('######## Yahoo found, clicking through.')
|
561
561
|
if await page.locator('.con-wizard').locator("button.accept-all").is_visible():
|
562
562
|
await page.locator('.con-wizard').locator("button.accept-all").click(timeout=2000)
|
563
563
|
|
@@ -569,7 +569,6 @@ class Capture():
|
|
569
569
|
|
570
570
|
async def __dialog_ppms_clickthrough(self, page: Page) -> None:
|
571
571
|
async def handler() -> None:
|
572
|
-
self.logger.info('######## piwik found, clicking through.')
|
573
572
|
if await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").is_visible():
|
574
573
|
await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").click(timeout=2000)
|
575
574
|
|
@@ -610,7 +609,11 @@ class Capture():
|
|
610
609
|
multiple_downloads.append((filename, file_content))
|
611
610
|
self.logger.info('Done with download.')
|
612
611
|
except Exception as e:
|
613
|
-
|
612
|
+
if download.page.is_closed():
|
613
|
+
# Page is closed, skip logging.
|
614
|
+
pass
|
615
|
+
else:
|
616
|
+
self.logger.warning(f'Unable to finish download triggered from JS: {e}')
|
614
617
|
finally:
|
615
618
|
self.wait_for_download -= 1
|
616
619
|
|
@@ -832,14 +835,17 @@ class Capture():
|
|
832
835
|
for index, url in enumerate(child_urls):
|
833
836
|
self.logger.info(f'Capture child {url} - Timeout: {max_capture_time}s')
|
834
837
|
start_time = time.time()
|
838
|
+
if page.is_closed():
|
839
|
+
self.logger.info('Page is closed, unable to capture children.')
|
840
|
+
break
|
835
841
|
try:
|
836
|
-
|
837
|
-
self.capture_page(
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
842
|
+
async with timeout(max_capture_time + 1): # just adding a bit of padding so playwright has the chance to raise the exception first
|
843
|
+
child_capture = await self.capture_page(
|
844
|
+
url=url, referer=page.url,
|
845
|
+
page=page, depth=depth,
|
846
|
+
rendered_hostname_only=rendered_hostname_only,
|
847
|
+
max_depth_capture_time=max_capture_time)
|
848
|
+
to_return['children'].append(child_capture) # type: ignore[union-attr]
|
843
849
|
except (TimeoutError, asyncio.exceptions.TimeoutError):
|
844
850
|
self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
|
845
851
|
except Exception as e:
|
@@ -880,12 +886,16 @@ class Capture():
|
|
880
886
|
self.should_retry = True
|
881
887
|
elif e.name in ['Download is starting',
|
882
888
|
'Connection closed',
|
889
|
+
'Connection terminated unexpectedly',
|
883
890
|
'Navigation interrupted by another one',
|
884
891
|
'Navigation failed because page was closed!',
|
892
|
+
'Target page, context or browser has been closed',
|
885
893
|
'Protocol error (Page.bringToFront): Not attached to an active page',
|
886
894
|
'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
|
887
895
|
'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
|
896
|
+
'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
|
888
897
|
'Peer sent fatal TLS alert: The server name sent was not recognized',
|
898
|
+
'Peer sent fatal TLS alert: Internal error',
|
889
899
|
'Load cannot follow more than 20 redirections',
|
890
900
|
'Page crashed',
|
891
901
|
'Error receiving data: Connection reset by peer']:
|
@@ -896,9 +906,15 @@ class Capture():
|
|
896
906
|
# The browser barfed, let's try again
|
897
907
|
self.logger.info(f'Browser barfed on {url} (retrying): {e.message}')
|
898
908
|
self.should_retry = True
|
899
|
-
elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS']:
|
909
|
+
elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS',
|
910
|
+
'net::ERR_BAD_SSL_CLIENT_AUTH_CERT',
|
911
|
+
'net::ERR_UNEXPECTED_PROXY_AUTH']:
|
900
912
|
# No need to retry, the credentials are wrong/missing.
|
901
913
|
pass
|
914
|
+
elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to']]):
|
915
|
+
self.should_retry = True
|
916
|
+
elif e.name and any([msg in e.name for msg in ['Error resolving', 'Could not connect to']]):
|
917
|
+
pass
|
902
918
|
else:
|
903
919
|
# Unexpected ones
|
904
920
|
self.logger.exception(f'Something went poorly with {url}: {e.message}')
|
@@ -949,7 +965,7 @@ class Capture():
|
|
949
965
|
try:
|
950
966
|
return await page.screenshot(scale="css", animations='disabled', caret='initial', timeout=5000)
|
951
967
|
except Error as e:
|
952
|
-
self.logger.
|
968
|
+
self.logger.info(f"Unable to get any screenshot: {e}")
|
953
969
|
raise e
|
954
970
|
|
955
971
|
async def _safe_wait(self, page: Page, force_max_wait_in_sec: int | None=None) -> None:
|
@@ -1137,11 +1153,13 @@ class Capture():
|
|
1137
1153
|
'net::ERR_INVALID_RESPONSE',
|
1138
1154
|
'net::ERR_NAME_NOT_RESOLVED',
|
1139
1155
|
'net::ERR_SOCKS_CONNECTION_FAILED',
|
1156
|
+
'net::ERR_SSL_KEY_USAGE_INCOMPATIBLE',
|
1157
|
+
'net::ERR_SSL_PROTOCOL_ERROR',
|
1140
1158
|
'net::ERR_SSL_UNRECOGNIZED_NAME_ALERT',
|
1141
1159
|
'net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH',
|
1142
|
-
'net::ERR_SSL_PROTOCOL_ERROR',
|
1143
1160
|
'net::ERR_TIMED_OUT',
|
1144
1161
|
'net::ERR_TOO_MANY_REDIRECTS',
|
1162
|
+
'SSL_ERROR_UNKNOWN',
|
1145
1163
|
]:
|
1146
1164
|
return True
|
1147
1165
|
return False
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.24.5"
|
3
|
+
version = "1.24.6"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -41,8 +41,8 @@ optional = true
|
|
41
41
|
[tool.poetry.group.dev.dependencies]
|
42
42
|
types-beautifulsoup4 = "^4.12.0.20240229"
|
43
43
|
pytest = "^8.1.1"
|
44
|
-
mypy = "^1.
|
45
|
-
types-dateparser = "^1.
|
44
|
+
mypy = "^1.10.0"
|
45
|
+
types-dateparser = "^1.2.0.20240420"
|
46
46
|
types-requests = "^2.31.0.20240406"
|
47
47
|
types-pytz = "^2024.1.0.20240417"
|
48
48
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|