PlaywrightCapture 1.24.5__tar.gz → 1.24.7__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.24.5
3
+ Version: 1.24.7
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -21,11 +21,12 @@ Classifier: Topic :: Internet
21
21
  Classifier: Topic :: Security
22
22
  Provides-Extra: recaptcha
23
23
  Requires-Dist: SpeechRecognition (>=3.10.3,<4.0.0) ; extra == "recaptcha"
24
+ Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
24
25
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
25
26
  Requires-Dist: dateparser (>=1.2.0,<2.0.0)
26
27
  Requires-Dist: playwright (>=1.43.0,<2.0.0)
27
28
  Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
28
- Requires-Dist: puremagic (>=1.21,<2.0)
29
+ Requires-Dist: puremagic (>=1.22,<2.0)
29
30
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
30
31
  Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
31
32
  Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
@@ -43,6 +43,11 @@ else:
43
43
  from zoneinfo import available_timezones
44
44
  all_timezones_set = available_timezones()
45
45
 
46
+ if sys.version_info < (3, 11):
47
+ from async_timeout import timeout
48
+ else:
49
+ from asyncio import timeout
50
+
46
51
  if TYPE_CHECKING:
47
52
  from playwright._impl._api_structures import (SetCookieParam, Geolocation,
48
53
  HttpCredentials, Headers,
@@ -469,36 +474,33 @@ class Capture():
469
474
  if await page.locator("#didomi-notice-agree-button").is_visible():
470
475
  await page.locator("#didomi-notice-agree-button").click(timeout=2000)
471
476
 
472
- await page.add_locator_handler(page.locator(".didomi-popup-view"), handler)
477
+ await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler)
473
478
  self.logger.info('Didomi handler added')
474
479
 
475
480
  async def __dialog_onetrust_clickthrough(self, page: Page) -> None:
476
481
  async def handler() -> None:
477
- self.logger.info('######## OT Dialog found, clicking through.')
478
482
  if await page.locator("#onetrust-accept-btn-handler").is_visible():
479
483
  await page.locator("#onetrust-accept-btn-handler").click(timeout=2000)
480
484
 
481
485
  await page.add_locator_handler(
482
- page.locator('#onetrust-banner-sdk'),
486
+ page.locator('#onetrust-banner-sdk').last,
483
487
  handler
484
488
  )
485
489
  self.logger.info('OT handler added')
486
490
 
487
491
  async def __dialog_hubspot_clickthrough(self, page: Page) -> None:
488
492
  async def handler() -> None:
489
- self.logger.info('######## HS Dialog found, clicking through.')
490
493
  if await page.locator("#hs-eu-confirmation-button").is_visible():
491
494
  await page.locator("#hs-eu-confirmation-button").click(timeout=2000)
492
495
 
493
496
  await page.add_locator_handler(
494
- page.locator('#hs-eu-cookie-confirmation'),
497
+ page.locator('#hs-eu-cookie-confirmation').last,
495
498
  handler
496
499
  )
497
500
  self.logger.info('HS handler added')
498
501
 
499
502
  async def __dialog_cookiebot_clickthrough(self, page: Page) -> None:
500
503
  async def handler() -> None:
501
- self.logger.info('######## Cookiebot Dialog found, clicking through.')
502
504
  if await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").is_visible():
503
505
  await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").click(timeout=2000)
504
506
 
@@ -519,7 +521,7 @@ class Capture():
519
521
  self.logger.info('Consent window found, but no button to click through.')
520
522
 
521
523
  await page.add_locator_handler(
522
- page.get_by_role("alertdialog"),
524
+ page.get_by_role("alertdialog").last,
523
525
  handler
524
526
  )
525
527
  self.logger.info('alert dialog handler added')
@@ -545,7 +547,6 @@ class Capture():
545
547
 
546
548
  async def __dialog_complianz_clickthrough(self, page: Page) -> None:
547
549
  async def handler() -> None:
548
- self.logger.info('######## Complianz found, clicking through.')
549
550
  if await page.locator('.cmplz-show').locator("button.cmplz-accept").is_visible():
550
551
  await page.locator('.cmplz-show').locator("button.cmplz-accept").click(timeout=2000)
551
552
 
@@ -557,7 +558,6 @@ class Capture():
557
558
 
558
559
  async def __dialog_yahoo_clickthrough(self, page: Page) -> None:
559
560
  async def handler() -> None:
560
- self.logger.info('######## Yahoo found, clicking through.')
561
561
  if await page.locator('.con-wizard').locator("button.accept-all").is_visible():
562
562
  await page.locator('.con-wizard').locator("button.accept-all").click(timeout=2000)
563
563
 
@@ -569,7 +569,6 @@ class Capture():
569
569
 
570
570
  async def __dialog_ppms_clickthrough(self, page: Page) -> None:
571
571
  async def handler() -> None:
572
- self.logger.info('######## piwik found, clicking through.')
573
572
  if await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").is_visible():
574
573
  await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").click(timeout=2000)
575
574
 
@@ -610,7 +609,11 @@ class Capture():
610
609
  multiple_downloads.append((filename, file_content))
611
610
  self.logger.info('Done with download.')
612
611
  except Exception as e:
613
- self.logger.warning(f'Unable to finish download triggered from JS: {e}')
612
+ if download.page.is_closed():
613
+ # Page is closed, skip logging.
614
+ pass
615
+ else:
616
+ self.logger.warning(f'Unable to finish download triggered from JS: {e}')
614
617
  finally:
615
618
  self.wait_for_download -= 1
616
619
 
@@ -664,7 +667,6 @@ class Capture():
664
667
  parsed_url = urlparse(url, allow_fragments=True)
665
668
 
666
669
  try:
667
- # NOTE 2022-12-02: allow 15s less than the general timeout to get a DOM
668
670
  await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
669
671
  page.on("download", handle_download)
670
672
  except Error as initial_error:
@@ -829,24 +831,39 @@ class Capture():
829
831
  else:
830
832
  child_urls = child_urls[:max_captures]
831
833
  self.logger.info(f'Capturing children, {max_captures} URLs')
834
+ consecutive_errors = 0
832
835
  for index, url in enumerate(child_urls):
833
836
  self.logger.info(f'Capture child {url} - Timeout: {max_capture_time}s')
834
837
  start_time = time.time()
838
+ if page.is_closed():
839
+ self.logger.info('Page is closed, unable to capture children.')
840
+ break
835
841
  try:
836
- child_capture = await asyncio.wait_for(
837
- self.capture_page(url=url, referer=page.url,
838
- page=page, depth=depth,
839
- rendered_hostname_only=rendered_hostname_only,
840
- max_depth_capture_time=max_capture_time),
841
- timeout=max_capture_time + 1) # just adding a bit of padding so playwright has the chance to raise the exception first
842
- to_return['children'].append(child_capture) # type: ignore[union-attr]
843
- except (TimeoutError, asyncio.exceptions.TimeoutError):
842
+ async with timeout(max_capture_time + 1): # just adding a bit of padding so playwright has the chance to raise the exception first
843
+ child_capture = await self.capture_page(
844
+ url=url, referer=page.url,
845
+ page=page, depth=depth,
846
+ rendered_hostname_only=rendered_hostname_only,
847
+ max_depth_capture_time=max_capture_time)
848
+ to_return['children'].append(child_capture) # type: ignore[union-attr]
849
+ except (TimeoutError, asyncio.exceptions.TimeoutError, asyncio.TimeoutError):
844
850
  self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
851
+ consecutive_errors += 1
845
852
  except Exception as e:
846
853
  self.logger.warning(f'Error while capturing child "{url}": {e}. {len(child_urls) - index - 1} more to go.')
854
+ consecutive_errors += 1
847
855
  else:
856
+ consecutive_errors = 0
848
857
  runtime = int(time.time() - start_time)
849
858
  self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {len(child_urls) - index - 1} to go.')
859
+
860
+ if consecutive_errors >= 5:
861
+ # if we have more than 5 consecutive errors, the capture is most probably broken, breaking.
862
+ self.logger.warning('Got more than 5 consecutive errors while capturing children, breaking.')
863
+ to_return['error'] = "Got more than 5 consecutive errors while capturing children"
864
+ self.should_retry = True
865
+ break
866
+
850
867
  try:
851
868
  await page.go_back()
852
869
  except PlaywrightTimeoutError:
@@ -880,12 +897,16 @@ class Capture():
880
897
  self.should_retry = True
881
898
  elif e.name in ['Download is starting',
882
899
  'Connection closed',
900
+ 'Connection terminated unexpectedly',
883
901
  'Navigation interrupted by another one',
884
902
  'Navigation failed because page was closed!',
903
+ 'Target page, context or browser has been closed',
885
904
  'Protocol error (Page.bringToFront): Not attached to an active page',
886
905
  'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
887
906
  'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
907
+ 'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
888
908
  'Peer sent fatal TLS alert: The server name sent was not recognized',
909
+ 'Peer sent fatal TLS alert: Internal error',
889
910
  'Load cannot follow more than 20 redirections',
890
911
  'Page crashed',
891
912
  'Error receiving data: Connection reset by peer']:
@@ -896,9 +917,15 @@ class Capture():
896
917
  # The browser barfed, let's try again
897
918
  self.logger.info(f'Browser barfed on {url} (retrying): {e.message}')
898
919
  self.should_retry = True
899
- elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS']:
920
+ elif e.name in ['net::ERR_INVALID_AUTH_CREDENTIALS',
921
+ 'net::ERR_BAD_SSL_CLIENT_AUTH_CERT',
922
+ 'net::ERR_UNEXPECTED_PROXY_AUTH']:
900
923
  # No need to retry, the credentials are wrong/missing.
901
924
  pass
925
+ elif e.name and any([msg in e.name for msg in ['is interrupted by another navigation to']]):
926
+ self.should_retry = True
927
+ elif e.name and any([msg in e.name for msg in ['Error resolving', 'Could not connect to']]):
928
+ pass
902
929
  else:
903
930
  # Unexpected ones
904
931
  self.logger.exception(f'Something went poorly with {url}: {e.message}')
@@ -949,7 +976,7 @@ class Capture():
949
976
  try:
950
977
  return await page.screenshot(scale="css", animations='disabled', caret='initial', timeout=5000)
951
978
  except Error as e:
952
- self.logger.warning(f"Unable to get any screenshot: {e}")
979
+ self.logger.info(f"Unable to get any screenshot: {e}")
953
980
  raise e
954
981
 
955
982
  async def _safe_wait(self, page: Page, force_max_wait_in_sec: int | None=None) -> None:
@@ -1137,11 +1164,13 @@ class Capture():
1137
1164
  'net::ERR_INVALID_RESPONSE',
1138
1165
  'net::ERR_NAME_NOT_RESOLVED',
1139
1166
  'net::ERR_SOCKS_CONNECTION_FAILED',
1167
+ 'net::ERR_SSL_KEY_USAGE_INCOMPATIBLE',
1168
+ 'net::ERR_SSL_PROTOCOL_ERROR',
1140
1169
  'net::ERR_SSL_UNRECOGNIZED_NAME_ALERT',
1141
1170
  'net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH',
1142
- 'net::ERR_SSL_PROTOCOL_ERROR',
1143
1171
  'net::ERR_TIMED_OUT',
1144
1172
  'net::ERR_TOO_MANY_REDIRECTS',
1173
+ 'SSL_ERROR_UNKNOWN',
1145
1174
  ]:
1146
1175
  return True
1147
1176
  return False
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.24.5"
3
+ version = "1.24.7"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -30,7 +30,8 @@ pytz = {"version" = "^2024.1", python = "<3.9"}
30
30
  tzdata = "^2024.1"
31
31
  playwright-stealth = "^1.0.6"
32
32
  setuptools = "^69.5.1"
33
- puremagic = "^1.21"
33
+ puremagic = "^1.22"
34
+ async-timeout = {version = "^4.0.3", python = "<3.11"}
34
35
 
35
36
  [tool.poetry.extras]
36
37
  recaptcha = ["requests", "pydub", "SpeechRecognition"]
@@ -40,9 +41,9 @@ optional = true
40
41
 
41
42
  [tool.poetry.group.dev.dependencies]
42
43
  types-beautifulsoup4 = "^4.12.0.20240229"
43
- pytest = "^8.1.1"
44
- mypy = "^1.9.0"
45
- types-dateparser = "^1.1.4.20240331"
44
+ pytest = "^8.2.0"
45
+ mypy = "^1.10.0"
46
+ types-dateparser = "^1.2.0.20240420"
46
47
  types-requests = "^2.31.0.20240406"
47
48
  types-pytz = "^2024.1.0.20240417"
48
49