PlaywrightCapture 1.24.5__tar.gz → 1.24.6__tar.gz

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.24.5
3
+ Version: 1.24.6
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -43,6 +43,11 @@ else:
43
43
  from zoneinfo import available_timezones
44
44
  all_timezones_set = available_timezones()
45
45
 
46
+ if sys.version_info < (3, 11):
47
+ from async_timeout import timeout
48
+ else:
49
+ from asyncio import timeout
50
+
46
51
  if TYPE_CHECKING:
47
52
  from playwright._impl._api_structures import (SetCookieParam, Geolocation,
48
53
  HttpCredentials, Headers,
@@ -469,36 +474,33 @@ class Capture():
469
474
  if await page.locator("#didomi-notice-agree-button").is_visible():
470
475
  await page.locator("#didomi-notice-agree-button").click(timeout=2000)
471
476
 
472
- await page.add_locator_handler(page.locator(".didomi-popup-view"), handler)
477
+ await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler)
473
478
  self.logger.info('Didomi handler added')
474
479
 
475
480
  async def __dialog_onetrust_clickthrough(self, page: Page) -> None:
476
481
  async def handler() -> None:
477
- self.logger.info('######## OT Dialog found, clicking through.')
478
482
  if await page.locator("#onetrust-accept-btn-handler").is_visible():
479
483
  await page.locator("#onetrust-accept-btn-handler").click(timeout=2000)
480
484
 
481
485
  await page.add_locator_handler(
482
- page.locator('#onetrust-banner-sdk'),
486
+ page.locator('#onetrust-banner-sdk').last,
483
487
  handler
484
488
  )
485
489
  self.logger.info('OT handler added')
486
490
 
487
491
  async def __dialog_hubspot_clickthrough(self, page: Page) -> None:
488
492
  async def handler() -> None:
489
- self.logger.info('######## HS Dialog found, clicking through.')
490
493
  if await page.locator("#hs-eu-confirmation-button").is_visible():
491
494
  await page.locator("#hs-eu-confirmation-button").click(timeout=2000)
492
495
 
493
496
  await page.add_locator_handler(
494
- page.locator('#hs-eu-cookie-confirmation'),
497
+ page.locator('#hs-eu-cookie-confirmation').last,
495
498
  handler
496
499
  )
497
500
  self.logger.info('HS handler added')
498
501
 
499
502
  async def __dialog_cookiebot_clickthrough(self, page: Page) -> None:
500
503
  async def handler() -> None:
501
- self.logger.info('######## Cookiebot Dialog found, clicking through.')
502
504
  if await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").is_visible():
503
505
  await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").click(timeout=2000)
504
506
 
@@ -519,7 +521,7 @@ class Capture():
519
521
  self.logger.info('Consent window found, but no button to click through.')
520
522
 
521
523
  await page.add_locator_handler(
522
- page.get_by_role("alertdialog"),
524
+ page.get_by_role("alertdialog").last,
523
525
  handler
524
526
  )
525
527
  self.logger.info('alert dialog handler added')
@@ -545,7 +547,6 @@ class Capture():
545
547
 
546
548
  async def __dialog_complianz_clickthrough(self, page: Page) -> None:
547
549
  async def handler() -> None:
548
- self.logger.info('######## Complianz found, clicking through.')
549
550
  if await page.locator('.cmplz-show').locator("button.cmplz-accept").is_visible():
550
551
  await page.locator('.cmplz-show').locator("button.cmplz-accept").click(timeout=2000)
551
552
 
@@ -557,7 +558,6 @@ class Capture():
557
558
 
558
559
  async def __dialog_yahoo_clickthrough(self, page: Page) -> None:
559
560
  async def handler() -> None:
560
- self.logger.info('######## Yahoo found, clicking through.')
561
561
  if await page.locator('.con-wizard').locator("button.accept-all").is_visible():
562
562
  await page.locator('.con-wizard').locator("button.accept-all").click(timeout=2000)
563
563
 
@@ -569,7 +569,6 @@ class Capture():
569
569
 
570
570
  async def __dialog_ppms_clickthrough(self, page: Page) -> None:
571
571
  async def handler() -> None:
572
- self.logger.info('######## piwik found, clicking through.')
573
572
  if await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").is_visible():
574
573
  await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").click(timeout=2000)
575
574
 
@@ -610,7 +609,11 @@ class Capture():
610
609
  multiple_downloads.append((filename, file_content))
611
610
  self.logger.info('Done with download.')
612
611
  except Exception as e:
613
- self.logger.warning(f'Unable to finish download triggered from JS: {e}')
612
+ if download.page.is_closed():
613
+ # Page is closed, skip logging.
614
+ pass
615
+ else:
616
+ self.logger.warning(f'Unable to finish download triggered from JS: {e}')
614
617
  finally:
615
618
  self.wait_for_download -= 1
616
619
 
@@ -832,14 +835,17 @@ class Capture():
832
835
  for index, url in enumerate(child_urls):
833
836
  self.logger.info(f'Capture child {url} - Timeout: {max_capture_time}s')
834
837
  start_time = time.time()
838
+ if page.is_closed():
839
+ self.logger.info('Page is closed, unable to capture children.')
840
+ break
835
841
  try:
836
- child_capture = await asyncio.wait_for(
837
- self.capture_page(url=url, referer=page.url,
838
- page=page, depth=depth,
839
- rendered_hostname_only=rendered_hostname_only,
840
- max_depth_capture_time=max_capture_time),
841
- timeout=max_capture_time + 1) # just adding a bit of padding so playwright has the chance to raise the exception first
842
- to_return['children'].append(child_capture) # type: ignore[union-attr]
842
+ async with timeout(max_capture_time + 1): # just adding a bit of padding so playwright has the chance to raise the exception first
843
+ child_capture = await self.capture_page(
844
+ url=url, referer=page.url,
845
+ page=page, depth=depth,
846
+ rendered_hostname_only=rendered_hostname_only,
847
+ max_depth_capture_time=max_capture_time)
848
+ to_return['children'].append(child_capture) # type: ignore[union-attr]
843
849
  except (TimeoutError, asyncio.exceptions.TimeoutError):
844
850
  self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
845
851
  except Exception as e:
@@ -880,12 +886,16 @@ class Capture():
880
886
  self.should_retry = True
881
887
  elif e.name in ['Download is starting',
882
888
  'Connection closed',
889
+ 'Connection terminated unexpectedly',
883
890
  'Navigation interrupted by another one',
884
891
  'Navigation failed because page was closed!',
892
+ 'Target page, context or browser has been closed',
885
893
  'Protocol error (Page.bringToFront): Not attached to an active page',
886
894
  'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
887
895
  'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
896
+ 'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
888
897
  'Peer sent fatal TLS alert: The server name sent was not recognized',
898
+ 'Peer sent fatal TLS alert: Internal error',
889
899
  'Load cannot follow more than 20 redirections',
890
900
  'Page crashed',
891
901
  'Error receiving data: Connection reset by peer']:
@@ -896,9 +906,15 @@ class Capture():
896
906
  # The browser barfed, let's try again
897
907
  self.logger.info(f'Browser barfed on {url} (retrying): {e.message}')
898
908
  self.should_retry = True
899
- elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS']:
909
+ elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS',
910
+ 'net::ERR_BAD_SSL_CLIENT_AUTH_CERT',
911
+ 'net::ERR_UNEXPECTED_PROXY_AUTH']:
900
912
  # No need to retry, the credentials are wrong/missing.
901
913
  pass
914
+ elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to']]):
915
+ self.should_retry = True
916
+ elif e.name and any([msg in e.name for msg in ['Error resolving', 'Could not connect to']]):
917
+ pass
902
918
  else:
903
919
  # Unexpected ones
904
920
  self.logger.exception(f'Something went poorly with {url}: {e.message}')
@@ -949,7 +965,7 @@ class Capture():
949
965
  try:
950
966
  return await page.screenshot(scale="css", animations='disabled', caret='initial', timeout=5000)
951
967
  except Error as e:
952
- self.logger.warning(f"Unable to get any screenshot: {e}")
968
+ self.logger.info(f"Unable to get any screenshot: {e}")
953
969
  raise e
954
970
 
955
971
  async def _safe_wait(self, page: Page, force_max_wait_in_sec: int | None=None) -> None:
@@ -1137,11 +1153,13 @@ class Capture():
1137
1153
  'net::ERR_INVALID_RESPONSE',
1138
1154
  'net::ERR_NAME_NOT_RESOLVED',
1139
1155
  'net::ERR_SOCKS_CONNECTION_FAILED',
1156
+ 'net::ERR_SSL_KEY_USAGE_INCOMPATIBLE',
1157
+ 'net::ERR_SSL_PROTOCOL_ERROR',
1140
1158
  'net::ERR_SSL_UNRECOGNIZED_NAME_ALERT',
1141
1159
  'net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH',
1142
- 'net::ERR_SSL_PROTOCOL_ERROR',
1143
1160
  'net::ERR_TIMED_OUT',
1144
1161
  'net::ERR_TOO_MANY_REDIRECTS',
1162
+ 'SSL_ERROR_UNKNOWN',
1145
1163
  ]:
1146
1164
  return True
1147
1165
  return False
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.24.5"
3
+ version = "1.24.6"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -41,8 +41,8 @@ optional = true
41
41
  [tool.poetry.group.dev.dependencies]
42
42
  types-beautifulsoup4 = "^4.12.0.20240229"
43
43
  pytest = "^8.1.1"
44
- mypy = "^1.9.0"
45
- types-dateparser = "^1.1.4.20240331"
44
+ mypy = "^1.10.0"
45
+ types-dateparser = "^1.2.0.20240420"
46
46
  types-requests = "^2.31.0.20240406"
47
47
  types-pytz = "^2024.1.0.20240417"
48
48