PlaywrightCapture 1.24.4__tar.gz → 1.24.6__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.24.4
3
+ Version: 1.24.6
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -14,8 +14,9 @@ import time
14
14
 
15
15
  from base64 import b64decode
16
16
  from io import BytesIO
17
+ from logging import LoggerAdapter, Logger
17
18
  from tempfile import NamedTemporaryFile
18
- from typing import Any, TypedDict, Literal, TYPE_CHECKING
19
+ from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping
19
20
  from urllib.parse import urlparse, unquote, urljoin
20
21
  from zipfile import ZipFile
21
22
 
@@ -42,6 +43,11 @@ else:
42
43
  from zoneinfo import available_timezones
43
44
  all_timezones_set = available_timezones()
44
45
 
46
+ if sys.version_info < (3, 11):
47
+ from async_timeout import timeout
48
+ else:
49
+ from asyncio import timeout
50
+
45
51
  if TYPE_CHECKING:
46
52
  from playwright._impl._api_structures import (SetCookieParam, Geolocation,
47
53
  HttpCredentials, Headers,
@@ -79,6 +85,16 @@ class CaptureResponse(TypedDict, total=False):
79
85
  potential_favicons: set[bytes] | None
80
86
 
81
87
 
88
+ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
89
+ """
90
+ Prepend log entry with the UUID of the capture
91
+ """
92
+ def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> tuple[str, MutableMapping[str, Any]]:
93
+ if self.extra:
94
+ return '[{}] {}'.format(self.extra['uuid'], msg), kwargs
95
+ return msg, kwargs
96
+
97
+
82
98
  class Capture():
83
99
 
84
100
  _browsers: list[BROWSER] = ['chromium', 'firefox', 'webkit']
@@ -90,7 +106,8 @@ class Capture():
90
106
 
91
107
  def __init__(self, browser: BROWSER | None=None, device_name: str | None=None,
92
108
  proxy: str | dict[str, str] | None=None,
93
- general_timeout_in_sec: int | None = None, loglevel: str | int='INFO'):
109
+ general_timeout_in_sec: int | None = None, loglevel: str | int='INFO',
110
+ uuid: str | None=None):
94
111
  """Captures a page with Playwright.
95
112
 
96
113
  :param browser: The browser to use for the capture.
@@ -98,9 +115,15 @@ class Capture():
98
115
  :param proxy: The external proxy to use for the capture.
99
116
  :param general_timeout_in_sec: The general timeout for the capture, including children.
100
117
  :param loglevel: Python loglevel
118
+ :param uuid: The UUID of the capture.
101
119
  """
102
- self.logger = logging.getLogger('playwrightcapture')
103
- self.logger.setLevel(loglevel)
120
+ master_logger = logging.getLogger('playwrightcapture')
121
+ master_logger.setLevel(loglevel)
122
+ self.logger: Logger | PlaywrightCaptureLogAdapter
123
+ if uuid is not None:
124
+ self.logger = PlaywrightCaptureLogAdapter(master_logger, {'uuid': uuid})
125
+ else:
126
+ self.logger = master_logger
104
127
  self.browser_name: BROWSER = browser if browser else 'chromium'
105
128
 
106
129
  if general_timeout_in_sec is None:
@@ -451,36 +474,33 @@ class Capture():
451
474
  if await page.locator("#didomi-notice-agree-button").is_visible():
452
475
  await page.locator("#didomi-notice-agree-button").click(timeout=2000)
453
476
 
454
- await page.add_locator_handler(page.locator(".didomi-popup-view"), handler)
477
+ await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler)
455
478
  self.logger.info('Didomi handler added')
456
479
 
457
480
  async def __dialog_onetrust_clickthrough(self, page: Page) -> None:
458
481
  async def handler() -> None:
459
- self.logger.info('######## OT Dialog found, clicking through.')
460
482
  if await page.locator("#onetrust-accept-btn-handler").is_visible():
461
483
  await page.locator("#onetrust-accept-btn-handler").click(timeout=2000)
462
484
 
463
485
  await page.add_locator_handler(
464
- page.locator('#onetrust-banner-sdk'),
486
+ page.locator('#onetrust-banner-sdk').last,
465
487
  handler
466
488
  )
467
489
  self.logger.info('OT handler added')
468
490
 
469
491
  async def __dialog_hubspot_clickthrough(self, page: Page) -> None:
470
492
  async def handler() -> None:
471
- self.logger.info('######## HS Dialog found, clicking through.')
472
493
  if await page.locator("#hs-eu-confirmation-button").is_visible():
473
494
  await page.locator("#hs-eu-confirmation-button").click(timeout=2000)
474
495
 
475
496
  await page.add_locator_handler(
476
- page.locator('#hs-eu-cookie-confirmation'),
497
+ page.locator('#hs-eu-cookie-confirmation').last,
477
498
  handler
478
499
  )
479
500
  self.logger.info('HS handler added')
480
501
 
481
502
  async def __dialog_cookiebot_clickthrough(self, page: Page) -> None:
482
503
  async def handler() -> None:
483
- self.logger.info('######## Cookiebot Dialog found, clicking through.')
484
504
  if await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").is_visible():
485
505
  await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").click(timeout=2000)
486
506
 
@@ -501,7 +521,7 @@ class Capture():
501
521
  self.logger.info('Consent window found, but no button to click through.')
502
522
 
503
523
  await page.add_locator_handler(
504
- page.get_by_role("alertdialog"),
524
+ page.get_by_role("alertdialog").last,
505
525
  handler
506
526
  )
507
527
  self.logger.info('alert dialog handler added')
@@ -527,7 +547,6 @@ class Capture():
527
547
 
528
548
  async def __dialog_complianz_clickthrough(self, page: Page) -> None:
529
549
  async def handler() -> None:
530
- self.logger.info('######## Complianz found, clicking through.')
531
550
  if await page.locator('.cmplz-show').locator("button.cmplz-accept").is_visible():
532
551
  await page.locator('.cmplz-show').locator("button.cmplz-accept").click(timeout=2000)
533
552
 
@@ -539,7 +558,6 @@ class Capture():
539
558
 
540
559
  async def __dialog_yahoo_clickthrough(self, page: Page) -> None:
541
560
  async def handler() -> None:
542
- self.logger.info('######## Yahoo found, clicking through.')
543
561
  if await page.locator('.con-wizard').locator("button.accept-all").is_visible():
544
562
  await page.locator('.con-wizard').locator("button.accept-all").click(timeout=2000)
545
563
 
@@ -551,7 +569,6 @@ class Capture():
551
569
 
552
570
  async def __dialog_ppms_clickthrough(self, page: Page) -> None:
553
571
  async def handler() -> None:
554
- self.logger.info('######## piwik found, clicking through.')
555
572
  if await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").is_visible():
556
573
  await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").click(timeout=2000)
557
574
 
@@ -592,7 +609,11 @@ class Capture():
592
609
  multiple_downloads.append((filename, file_content))
593
610
  self.logger.info('Done with download.')
594
611
  except Exception as e:
595
- self.logger.warning(f'Unable to finish download triggered from JS: {e}')
612
+ if download.page.is_closed():
613
+ # Page is closed, skip logging.
614
+ pass
615
+ else:
616
+ self.logger.warning(f'Unable to finish download triggered from JS: {e}')
596
617
  finally:
597
618
  self.wait_for_download -= 1
598
619
 
@@ -682,9 +703,6 @@ class Capture():
682
703
  except Exception:
683
704
  raise e
684
705
  else:
685
- if not self._exception_is_network_error(initial_error):
686
- # TODO: Do something?
687
- self.logger.warning(f'Unexpected error: {initial_error}')
688
706
  raise initial_error
689
707
  else:
690
708
  await page.bring_to_front()
@@ -817,14 +835,17 @@ class Capture():
817
835
  for index, url in enumerate(child_urls):
818
836
  self.logger.info(f'Capture child {url} - Timeout: {max_capture_time}s')
819
837
  start_time = time.time()
838
+ if page.is_closed():
839
+ self.logger.info('Page is closed, unable to capture children.')
840
+ break
820
841
  try:
821
- child_capture = await asyncio.wait_for(
822
- self.capture_page(url=url, referer=page.url,
823
- page=page, depth=depth,
824
- rendered_hostname_only=rendered_hostname_only,
825
- max_depth_capture_time=max_capture_time),
826
- timeout=max_capture_time + 1) # just adding a bit of padding so playwright has the chance to raise the exception first
827
- to_return['children'].append(child_capture) # type: ignore[union-attr]
842
+ async with timeout(max_capture_time + 1): # just adding a bit of padding so playwright has the chance to raise the exception first
843
+ child_capture = await self.capture_page(
844
+ url=url, referer=page.url,
845
+ page=page, depth=depth,
846
+ rendered_hostname_only=rendered_hostname_only,
847
+ max_depth_capture_time=max_capture_time)
848
+ to_return['children'].append(child_capture) # type: ignore[union-attr]
828
849
  except (TimeoutError, asyncio.exceptions.TimeoutError):
829
850
  self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
830
851
  except Exception as e:
@@ -865,11 +886,19 @@ class Capture():
865
886
  self.should_retry = True
866
887
  elif e.name in ['Download is starting',
867
888
  'Connection closed',
889
+ 'Connection terminated unexpectedly',
868
890
  'Navigation interrupted by another one',
869
891
  'Navigation failed because page was closed!',
892
+ 'Target page, context or browser has been closed',
870
893
  'Protocol error (Page.bringToFront): Not attached to an active page',
871
894
  'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
872
- 'Load cannot follow more than 20 redirections']:
895
+ 'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
896
+ 'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
897
+ 'Peer sent fatal TLS alert: The server name sent was not recognized',
898
+ 'Peer sent fatal TLS alert: Internal error',
899
+ 'Load cannot follow more than 20 redirections',
900
+ 'Page crashed',
901
+ 'Error receiving data: Connection reset by peer']:
873
902
  # Other errors, let's give it another shot
874
903
  self.logger.info(f'Issue with {url} (retrying): {e.message}')
875
904
  self.should_retry = True
@@ -877,9 +906,15 @@ class Capture():
877
906
  # The browser barfed, let's try again
878
907
  self.logger.info(f'Browser barfed on {url} (retrying): {e.message}')
879
908
  self.should_retry = True
880
- elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS']:
909
+ elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS',
910
+ 'net::ERR_BAD_SSL_CLIENT_AUTH_CERT',
911
+ 'net::ERR_UNEXPECTED_PROXY_AUTH']:
881
912
  # No need to retry, the credentials are wrong/missing.
882
913
  pass
914
+ elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to']]):
915
+ self.should_retry = True
916
+ elif e.name and any([msg in e.name for msg in ['Error resolving', 'Could not connect to']]):
917
+ pass
883
918
  else:
884
919
  # Unexpected ones
885
920
  self.logger.exception(f'Something went poorly with {url}: {e.message}')
@@ -930,7 +965,7 @@ class Capture():
930
965
  try:
931
966
  return await page.screenshot(scale="css", animations='disabled', caret='initial', timeout=5000)
932
967
  except Error as e:
933
- self.logger.warning(f"Unable to get any screenshot: {e}")
968
+ self.logger.info(f"Unable to get any screenshot: {e}")
934
969
  raise e
935
970
 
936
971
  async def _safe_wait(self, page: Page, force_max_wait_in_sec: int | None=None) -> None:
@@ -1118,11 +1153,13 @@ class Capture():
1118
1153
  'net::ERR_INVALID_RESPONSE',
1119
1154
  'net::ERR_NAME_NOT_RESOLVED',
1120
1155
  'net::ERR_SOCKS_CONNECTION_FAILED',
1156
+ 'net::ERR_SSL_KEY_USAGE_INCOMPATIBLE',
1157
+ 'net::ERR_SSL_PROTOCOL_ERROR',
1121
1158
  'net::ERR_SSL_UNRECOGNIZED_NAME_ALERT',
1122
1159
  'net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH',
1123
- 'net::ERR_SSL_PROTOCOL_ERROR',
1124
1160
  'net::ERR_TIMED_OUT',
1125
1161
  'net::ERR_TOO_MANY_REDIRECTS',
1162
+ 'SSL_ERROR_UNKNOWN',
1126
1163
  ]:
1127
1164
  return True
1128
1165
  return False
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.24.4"
3
+ version = "1.24.6"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -41,10 +41,10 @@ optional = true
41
41
  [tool.poetry.group.dev.dependencies]
42
42
  types-beautifulsoup4 = "^4.12.0.20240229"
43
43
  pytest = "^8.1.1"
44
- mypy = "^1.9.0"
45
- types-dateparser = "^1.1.4.20240331"
44
+ mypy = "^1.10.0"
45
+ types-dateparser = "^1.2.0.20240420"
46
46
  types-requests = "^2.31.0.20240406"
47
- types-pytz = "^2024.1.0.20240203"
47
+ types-pytz = "^2024.1.0.20240417"
48
48
 
49
49
 
50
50
  [build-system]