PlaywrightCapture 1.24.4__tar.gz → 1.24.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.24.4 → playwrightcapture-1.24.6}/PKG-INFO +1 -1
- {playwrightcapture-1.24.4 → playwrightcapture-1.24.6}/playwrightcapture/capture.py +66 -29
- {playwrightcapture-1.24.4 → playwrightcapture-1.24.6}/pyproject.toml +4 -4
- {playwrightcapture-1.24.4 → playwrightcapture-1.24.6}/LICENSE +0 -0
- {playwrightcapture-1.24.4 → playwrightcapture-1.24.6}/README.md +0 -0
- {playwrightcapture-1.24.4 → playwrightcapture-1.24.6}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.24.4 → playwrightcapture-1.24.6}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.24.4 → playwrightcapture-1.24.6}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.24.4 → playwrightcapture-1.24.6}/playwrightcapture/py.typed +0 -0
@@ -14,8 +14,9 @@ import time
|
|
14
14
|
|
15
15
|
from base64 import b64decode
|
16
16
|
from io import BytesIO
|
17
|
+
from logging import LoggerAdapter, Logger
|
17
18
|
from tempfile import NamedTemporaryFile
|
18
|
-
from typing import Any, TypedDict, Literal, TYPE_CHECKING
|
19
|
+
from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping
|
19
20
|
from urllib.parse import urlparse, unquote, urljoin
|
20
21
|
from zipfile import ZipFile
|
21
22
|
|
@@ -42,6 +43,11 @@ else:
|
|
42
43
|
from zoneinfo import available_timezones
|
43
44
|
all_timezones_set = available_timezones()
|
44
45
|
|
46
|
+
if sys.version_info < (3, 11):
|
47
|
+
from async_timeout import timeout
|
48
|
+
else:
|
49
|
+
from asyncio import timeout
|
50
|
+
|
45
51
|
if TYPE_CHECKING:
|
46
52
|
from playwright._impl._api_structures import (SetCookieParam, Geolocation,
|
47
53
|
HttpCredentials, Headers,
|
@@ -79,6 +85,16 @@ class CaptureResponse(TypedDict, total=False):
|
|
79
85
|
potential_favicons: set[bytes] | None
|
80
86
|
|
81
87
|
|
88
|
+
class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
89
|
+
"""
|
90
|
+
Prepend log entry with the UUID of the capture
|
91
|
+
"""
|
92
|
+
def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> tuple[str, MutableMapping[str, Any]]:
|
93
|
+
if self.extra:
|
94
|
+
return '[{}] {}'.format(self.extra['uuid'], msg), kwargs
|
95
|
+
return msg, kwargs
|
96
|
+
|
97
|
+
|
82
98
|
class Capture():
|
83
99
|
|
84
100
|
_browsers: list[BROWSER] = ['chromium', 'firefox', 'webkit']
|
@@ -90,7 +106,8 @@ class Capture():
|
|
90
106
|
|
91
107
|
def __init__(self, browser: BROWSER | None=None, device_name: str | None=None,
|
92
108
|
proxy: str | dict[str, str] | None=None,
|
93
|
-
general_timeout_in_sec: int | None = None, loglevel: str | int='INFO'
|
109
|
+
general_timeout_in_sec: int | None = None, loglevel: str | int='INFO',
|
110
|
+
uuid: str | None=None):
|
94
111
|
"""Captures a page with Playwright.
|
95
112
|
|
96
113
|
:param browser: The browser to use for the capture.
|
@@ -98,9 +115,15 @@ class Capture():
|
|
98
115
|
:param proxy: The external proxy to use for the capture.
|
99
116
|
:param general_timeout_in_sec: The general timeout for the capture, including children.
|
100
117
|
:param loglevel: Python loglevel
|
118
|
+
:param uuid: The UUID of the capture.
|
101
119
|
"""
|
102
|
-
|
103
|
-
|
120
|
+
master_logger = logging.getLogger('playwrightcapture')
|
121
|
+
master_logger.setLevel(loglevel)
|
122
|
+
self.logger: Logger | PlaywrightCaptureLogAdapter
|
123
|
+
if uuid is not None:
|
124
|
+
self.logger = PlaywrightCaptureLogAdapter(master_logger, {'uuid': uuid})
|
125
|
+
else:
|
126
|
+
self.logger = master_logger
|
104
127
|
self.browser_name: BROWSER = browser if browser else 'chromium'
|
105
128
|
|
106
129
|
if general_timeout_in_sec is None:
|
@@ -451,36 +474,33 @@ class Capture():
|
|
451
474
|
if await page.locator("#didomi-notice-agree-button").is_visible():
|
452
475
|
await page.locator("#didomi-notice-agree-button").click(timeout=2000)
|
453
476
|
|
454
|
-
await page.add_locator_handler(page.locator(".didomi-popup-view"), handler)
|
477
|
+
await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler)
|
455
478
|
self.logger.info('Didomi handler added')
|
456
479
|
|
457
480
|
async def __dialog_onetrust_clickthrough(self, page: Page) -> None:
|
458
481
|
async def handler() -> None:
|
459
|
-
self.logger.info('######## OT Dialog found, clicking through.')
|
460
482
|
if await page.locator("#onetrust-accept-btn-handler").is_visible():
|
461
483
|
await page.locator("#onetrust-accept-btn-handler").click(timeout=2000)
|
462
484
|
|
463
485
|
await page.add_locator_handler(
|
464
|
-
page.locator('#onetrust-banner-sdk'),
|
486
|
+
page.locator('#onetrust-banner-sdk').last,
|
465
487
|
handler
|
466
488
|
)
|
467
489
|
self.logger.info('OT handler added')
|
468
490
|
|
469
491
|
async def __dialog_hubspot_clickthrough(self, page: Page) -> None:
|
470
492
|
async def handler() -> None:
|
471
|
-
self.logger.info('######## HS Dialog found, clicking through.')
|
472
493
|
if await page.locator("#hs-eu-confirmation-button").is_visible():
|
473
494
|
await page.locator("#hs-eu-confirmation-button").click(timeout=2000)
|
474
495
|
|
475
496
|
await page.add_locator_handler(
|
476
|
-
page.locator('#hs-eu-cookie-confirmation'),
|
497
|
+
page.locator('#hs-eu-cookie-confirmation').last,
|
477
498
|
handler
|
478
499
|
)
|
479
500
|
self.logger.info('HS handler added')
|
480
501
|
|
481
502
|
async def __dialog_cookiebot_clickthrough(self, page: Page) -> None:
|
482
503
|
async def handler() -> None:
|
483
|
-
self.logger.info('######## Cookiebot Dialog found, clicking through.')
|
484
504
|
if await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").is_visible():
|
485
505
|
await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").click(timeout=2000)
|
486
506
|
|
@@ -501,7 +521,7 @@ class Capture():
|
|
501
521
|
self.logger.info('Consent window found, but no button to click through.')
|
502
522
|
|
503
523
|
await page.add_locator_handler(
|
504
|
-
page.get_by_role("alertdialog"),
|
524
|
+
page.get_by_role("alertdialog").last,
|
505
525
|
handler
|
506
526
|
)
|
507
527
|
self.logger.info('alert dialog handler added')
|
@@ -527,7 +547,6 @@ class Capture():
|
|
527
547
|
|
528
548
|
async def __dialog_complianz_clickthrough(self, page: Page) -> None:
|
529
549
|
async def handler() -> None:
|
530
|
-
self.logger.info('######## Complianz found, clicking through.')
|
531
550
|
if await page.locator('.cmplz-show').locator("button.cmplz-accept").is_visible():
|
532
551
|
await page.locator('.cmplz-show').locator("button.cmplz-accept").click(timeout=2000)
|
533
552
|
|
@@ -539,7 +558,6 @@ class Capture():
|
|
539
558
|
|
540
559
|
async def __dialog_yahoo_clickthrough(self, page: Page) -> None:
|
541
560
|
async def handler() -> None:
|
542
|
-
self.logger.info('######## Yahoo found, clicking through.')
|
543
561
|
if await page.locator('.con-wizard').locator("button.accept-all").is_visible():
|
544
562
|
await page.locator('.con-wizard').locator("button.accept-all").click(timeout=2000)
|
545
563
|
|
@@ -551,7 +569,6 @@ class Capture():
|
|
551
569
|
|
552
570
|
async def __dialog_ppms_clickthrough(self, page: Page) -> None:
|
553
571
|
async def handler() -> None:
|
554
|
-
self.logger.info('######## piwik found, clicking through.')
|
555
572
|
if await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").is_visible():
|
556
573
|
await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").click(timeout=2000)
|
557
574
|
|
@@ -592,7 +609,11 @@ class Capture():
|
|
592
609
|
multiple_downloads.append((filename, file_content))
|
593
610
|
self.logger.info('Done with download.')
|
594
611
|
except Exception as e:
|
595
|
-
|
612
|
+
if download.page.is_closed():
|
613
|
+
# Page is closed, skip logging.
|
614
|
+
pass
|
615
|
+
else:
|
616
|
+
self.logger.warning(f'Unable to finish download triggered from JS: {e}')
|
596
617
|
finally:
|
597
618
|
self.wait_for_download -= 1
|
598
619
|
|
@@ -682,9 +703,6 @@ class Capture():
|
|
682
703
|
except Exception:
|
683
704
|
raise e
|
684
705
|
else:
|
685
|
-
if not self._exception_is_network_error(initial_error):
|
686
|
-
# TODO: Do something?
|
687
|
-
self.logger.warning(f'Unexpected error: {initial_error}')
|
688
706
|
raise initial_error
|
689
707
|
else:
|
690
708
|
await page.bring_to_front()
|
@@ -817,14 +835,17 @@ class Capture():
|
|
817
835
|
for index, url in enumerate(child_urls):
|
818
836
|
self.logger.info(f'Capture child {url} - Timeout: {max_capture_time}s')
|
819
837
|
start_time = time.time()
|
838
|
+
if page.is_closed():
|
839
|
+
self.logger.info('Page is closed, unable to capture children.')
|
840
|
+
break
|
820
841
|
try:
|
821
|
-
|
822
|
-
self.capture_page(
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
842
|
+
async with timeout(max_capture_time + 1): # just adding a bit of padding so playwright has the chance to raise the exception first
|
843
|
+
child_capture = await self.capture_page(
|
844
|
+
url=url, referer=page.url,
|
845
|
+
page=page, depth=depth,
|
846
|
+
rendered_hostname_only=rendered_hostname_only,
|
847
|
+
max_depth_capture_time=max_capture_time)
|
848
|
+
to_return['children'].append(child_capture) # type: ignore[union-attr]
|
828
849
|
except (TimeoutError, asyncio.exceptions.TimeoutError):
|
829
850
|
self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
|
830
851
|
except Exception as e:
|
@@ -865,11 +886,19 @@ class Capture():
|
|
865
886
|
self.should_retry = True
|
866
887
|
elif e.name in ['Download is starting',
|
867
888
|
'Connection closed',
|
889
|
+
'Connection terminated unexpectedly',
|
868
890
|
'Navigation interrupted by another one',
|
869
891
|
'Navigation failed because page was closed!',
|
892
|
+
'Target page, context or browser has been closed',
|
870
893
|
'Protocol error (Page.bringToFront): Not attached to an active page',
|
871
894
|
'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
|
872
|
-
'
|
895
|
+
'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
|
896
|
+
'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
|
897
|
+
'Peer sent fatal TLS alert: The server name sent was not recognized',
|
898
|
+
'Peer sent fatal TLS alert: Internal error',
|
899
|
+
'Load cannot follow more than 20 redirections',
|
900
|
+
'Page crashed',
|
901
|
+
'Error receiving data: Connection reset by peer']:
|
873
902
|
# Other errors, let's give it another shot
|
874
903
|
self.logger.info(f'Issue with {url} (retrying): {e.message}')
|
875
904
|
self.should_retry = True
|
@@ -877,9 +906,15 @@ class Capture():
|
|
877
906
|
# The browser barfed, let's try again
|
878
907
|
self.logger.info(f'Browser barfed on {url} (retrying): {e.message}')
|
879
908
|
self.should_retry = True
|
880
|
-
elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS'
|
909
|
+
elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS',
|
910
|
+
'net::ERR_BAD_SSL_CLIENT_AUTH_CERT',
|
911
|
+
'net::ERR_UNEXPECTED_PROXY_AUTH']:
|
881
912
|
# No need to retry, the credentials are wrong/missing.
|
882
913
|
pass
|
914
|
+
elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to']]):
|
915
|
+
self.should_retry = True
|
916
|
+
elif e.name and any([msg in e.name for msg in ['Error resolving', 'Could not connect to']]):
|
917
|
+
pass
|
883
918
|
else:
|
884
919
|
# Unexpected ones
|
885
920
|
self.logger.exception(f'Something went poorly with {url}: {e.message}')
|
@@ -930,7 +965,7 @@ class Capture():
|
|
930
965
|
try:
|
931
966
|
return await page.screenshot(scale="css", animations='disabled', caret='initial', timeout=5000)
|
932
967
|
except Error as e:
|
933
|
-
self.logger.
|
968
|
+
self.logger.info(f"Unable to get any screenshot: {e}")
|
934
969
|
raise e
|
935
970
|
|
936
971
|
async def _safe_wait(self, page: Page, force_max_wait_in_sec: int | None=None) -> None:
|
@@ -1118,11 +1153,13 @@ class Capture():
|
|
1118
1153
|
'net::ERR_INVALID_RESPONSE',
|
1119
1154
|
'net::ERR_NAME_NOT_RESOLVED',
|
1120
1155
|
'net::ERR_SOCKS_CONNECTION_FAILED',
|
1156
|
+
'net::ERR_SSL_KEY_USAGE_INCOMPATIBLE',
|
1157
|
+
'net::ERR_SSL_PROTOCOL_ERROR',
|
1121
1158
|
'net::ERR_SSL_UNRECOGNIZED_NAME_ALERT',
|
1122
1159
|
'net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH',
|
1123
|
-
'net::ERR_SSL_PROTOCOL_ERROR',
|
1124
1160
|
'net::ERR_TIMED_OUT',
|
1125
1161
|
'net::ERR_TOO_MANY_REDIRECTS',
|
1162
|
+
'SSL_ERROR_UNKNOWN',
|
1126
1163
|
]:
|
1127
1164
|
return True
|
1128
1165
|
return False
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.24.4"
|
3
|
+
version = "1.24.6"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -41,10 +41,10 @@ optional = true
|
|
41
41
|
[tool.poetry.group.dev.dependencies]
|
42
42
|
types-beautifulsoup4 = "^4.12.0.20240229"
|
43
43
|
pytest = "^8.1.1"
|
44
|
-
mypy = "^1.
|
45
|
-
types-dateparser = "^1.
|
44
|
+
mypy = "^1.10.0"
|
45
|
+
types-dateparser = "^1.2.0.20240420"
|
46
46
|
types-requests = "^2.31.0.20240406"
|
47
|
-
types-pytz = "^2024.1.0.
|
47
|
+
types-pytz = "^2024.1.0.20240417"
|
48
48
|
|
49
49
|
|
50
50
|
[build-system]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|