PlaywrightCapture 1.21.9__tar.gz → 1.21.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.21.9
3
+ Version: 1.21.10
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -20,7 +20,7 @@ import requests
20
20
 
21
21
  from bs4 import BeautifulSoup
22
22
  from charset_normalizer import from_bytes
23
- from playwright.async_api import async_playwright, Frame, Error, Page
23
+ from playwright.async_api import async_playwright, Frame, Error, Page, Download
24
24
  from playwright.async_api import TimeoutError as PlaywrightTimeoutError
25
25
  from w3lib.html import strip_html5_whitespace
26
26
  from w3lib.url import canonicalize_url, safe_url_string
@@ -429,7 +429,28 @@ class Capture():
429
429
  rendered_hostname_only: bool=True,
430
430
  with_favicon: bool=False
431
431
  ) -> CaptureResponse:
432
+
432
433
  to_return: CaptureResponse = {}
434
+
435
+ self.wait_for_download = False
436
+
437
+ async def handle_download(download: Download) -> None:
438
+ # This method is called when a download event is triggered from JS in a page that also renders
439
+ try:
440
+ self.wait_for_download = True
441
+ self.logger.info('Got a download triggered from JS.')
442
+ tmp_f = NamedTemporaryFile(delete=False)
443
+ await download.save_as(tmp_f.name)
444
+ to_return["downloaded_filename"] = download.suggested_filename
445
+ with open(tmp_f.name, "rb") as f:
446
+ to_return["downloaded_file"] = f.read()
447
+ os.unlink(tmp_f.name)
448
+ self.logger.info('Done with download.')
449
+ except Exception as e:
450
+ self.logger.warning(f'Unable to finish download triggered from JS: {e}')
451
+ finally:
452
+ self.wait_for_download = False
453
+
433
454
  if page is not None:
434
455
  capturing_sub = True
435
456
  else:
@@ -443,6 +464,7 @@ class Capture():
443
464
  try:
444
465
  # NOTE 2022-12-02: allow 15s less than the general timeout to get a DOM
445
466
  await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
467
+ page.on("download", handle_download)
446
468
  except Error as initial_error:
447
469
  self._update_exceptions(initial_error)
448
470
  # So this one is really annoying: chromium raises a net::ERR_ABORTED when it hits a download
@@ -546,6 +568,10 @@ class Capture():
546
568
  if 'html' in to_return and to_return['html'] is not None and with_favicon:
547
569
  to_return['potential_favicons'] = self.get_favicons(page.url, to_return['html'])
548
570
 
571
+ if self.wait_for_download:
572
+ self.logger.info('Waiting for download to finish...')
573
+ await self._safe_wait(page)
574
+
549
575
  if depth > 0 and to_return.get('html') and to_return['html']:
550
576
  if child_urls := self._get_links_from_rendered_page(page.url, to_return['html'], rendered_hostname_only):
551
577
  to_return['children'] = []
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.21.9"
3
+ version = "1.21.10"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -42,8 +42,8 @@ types-beautifulsoup4 = "^4.12.0.6"
42
42
  pytest = "^7.4.2"
43
43
  mypy = "^1.5.1"
44
44
  types-dateparser = "^1.1.4.10"
45
- types-requests = "^2.31.0.2"
46
- types-pytz = "^2023.3.1.0"
45
+ types-requests = "^2.31.0.6"
46
+ types-pytz = "^2023.3.1.1"
47
47
 
48
48
 
49
49
  [build-system]