PlaywrightCapture 1.21.8__tar.gz → 1.21.10__tar.gz

This diff shows the changes between the two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: PlaywrightCapture
- Version: 1.21.8
+ Version: 1.21.10
  Summary: A simple library to capture websites using playwright
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
  License: BSD-3-Clause
@@ -22,9 +22,9 @@ Provides-Extra: recaptcha
  Requires-Dist: SpeechRecognition (>=3.10.0,<4.0.0) ; extra == "recaptcha"
  Requires-Dist: beautifulsoup4[lxml] (>=4.12.2,<5.0.0)
  Requires-Dist: dateparser (>=1.1.8,<2.0.0)
- Requires-Dist: playwright (>=1.37.0,<2.0.0)
+ Requires-Dist: playwright (>=1.38.0,<2.0.0)
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
- Requires-Dist: pytz (>=2023.3,<2024.0) ; python_version < "3.9"
+ Requires-Dist: pytz (>=2023.3.post1,<2024.0) ; python_version < "3.9"
  Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
  Requires-Dist: tzdata (>=2023.3,<2024.0)
  Requires-Dist: w3lib (>=2.1.2,<3.0.0)
@@ -20,7 +20,7 @@ import requests

  from bs4 import BeautifulSoup
  from charset_normalizer import from_bytes
- from playwright.async_api import async_playwright, Frame, Error, Page
+ from playwright.async_api import async_playwright, Frame, Error, Page, Download
  from playwright.async_api import TimeoutError as PlaywrightTimeoutError
  from w3lib.html import strip_html5_whitespace
  from w3lib.url import canonicalize_url, safe_url_string
@@ -429,21 +429,42 @@ class Capture():
                             rendered_hostname_only: bool=True,
                             with_favicon: bool=False
                             ) -> CaptureResponse:
+
          to_return: CaptureResponse = {}
-         try:
-             if page:
-                 capturing_sub = True
-             else:
-                 capturing_sub = False
-                 page = await self.context.new_page()
-                 page.set_default_timeout(self._capture_timeout * 1000)

+         self.wait_for_download = False
+
+         async def handle_download(download: Download) -> None:
+             # This method is called when a download event is triggered from JS in a page that also renders
+             try:
+                 self.wait_for_download = True
+                 self.logger.info('Got a download triggered from JS.')
+                 tmp_f = NamedTemporaryFile(delete=False)
+                 await download.save_as(tmp_f.name)
+                 to_return["downloaded_filename"] = download.suggested_filename
+                 with open(tmp_f.name, "rb") as f:
+                     to_return["downloaded_file"] = f.read()
+                 os.unlink(tmp_f.name)
+                 self.logger.info('Done with download.')
+             except Exception as e:
+                 self.logger.warning(f'Unable to finish download triggered from JS: {e}')
+             finally:
+                 self.wait_for_download = False
+
+         if page is not None:
+             capturing_sub = True
+         else:
+             capturing_sub = False
+             page = await self.context.new_page()
+             page.set_default_timeout(self._capture_timeout * 1000)
+         try:
              # Parse the URL. If there is a fragment, we need to scroll to it manually
              parsed_url = urlparse(url, allow_fragments=True)

              try:
                  # NOTE 2022-12-02: allow 15s less than the general timeout to get a DOM
                  await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
+                 page.on("download", handle_download)
              except Error as initial_error:
                  self._update_exceptions(initial_error)
                  # So this one is really annoying: chromium raises a net::ERR_ABORTED when it hits a download
@@ -547,6 +568,10 @@ class Capture():
              if 'html' in to_return and to_return['html'] is not None and with_favicon:
                  to_return['potential_favicons'] = self.get_favicons(page.url, to_return['html'])

+             if self.wait_for_download:
+                 self.logger.info('Waiting for download to finish...')
+                 await self._safe_wait(page)
+
              if depth > 0 and to_return.get('html') and to_return['html']:
                  if child_urls := self._get_links_from_rendered_page(page.url, to_return['html'], rendered_hostname_only):
                      to_return['children'] = []
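
The two hunks above add JS-triggered download support: a `handle_download` callback is registered on Playwright's `download` page event right after the initial `page.goto()`, it saves the payload with `Download.save_as()` and records `Download.suggested_filename`, and a `wait_for_download` flag lets the capture wait (via `_safe_wait`) until the handler is done before returning. A minimal standalone sketch of the same event/save pattern, outside this library (the URL and the fixed timeout are placeholders, not taken from PlaywrightCapture):

    import asyncio
    from tempfile import NamedTemporaryFile

    from playwright.async_api import Download, async_playwright


    async def main() -> None:
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()

            async def on_download(download: Download) -> None:
                # Save the payload to a temporary file and keep the name the site suggested.
                tmp_f = NamedTemporaryFile(delete=False)
                tmp_f.close()
                await download.save_as(tmp_f.name)
                print(f'Saved "{download.suggested_filename}" to {tmp_f.name}')

            # Register the handler so downloads triggered later by JS are caught.
            page.on("download", on_download)
            await page.goto("https://example.com")  # placeholder URL
            await page.wait_for_timeout(5000)  # crude stand-in for the library's wait_for_download flag
            await browser.close()


    asyncio.run(main())
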
@@ -816,7 +841,8 @@ class Capture():
                  name, _ = name.split(' at ', maxsplit=1)
              elif '; ' in name:
                  name, _ = name.split('; ', maxsplit=1)
-             exception.name = name.strip()
+             # This is kinda dirty.
+             exception._name = name.strip()

      def _exception_is_network_error(self, exception: Error) -> bool:
          if exception.name in [
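
The hunk above swaps `exception.name = name.strip()` for `exception._name = name.strip()` (the new comment admits it is "kinda dirty"). This is consistent with newer Playwright releases exposing `Error.name` as a read-only property, so the cleaned-up name has to be written to the private backing attribute instead. A simplified stand-in class, not Playwright's actual implementation, showing why the direct assignment stops working:

    from typing import Optional


    class FakeError(Exception):
        # Stand-in for playwright.async_api.Error: `name` is a read-only
        # property backed by a private attribute.
        def __init__(self, message: str) -> None:
            super().__init__(message)
            self._name: Optional[str] = None

        @property
        def name(self) -> Optional[str]:
            return self._name


    err = FakeError("net::ERR_ABORTED at https://example.com")

    try:
        err.name = "net::ERR_ABORTED"  # read-only property: raises AttributeError
    except AttributeError as e:
        print(e)

    err._name = "net::ERR_ABORTED"  # writing the backing attribute works, but is brittle
    print(err.name)
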
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "PlaywrightCapture"
- version = "1.21.8"
+ version = "1.21.10"
  description = "A simple library to capture websites using playwright"
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
  license = "BSD-3-Clause"
@@ -21,14 +21,14 @@ include = ['README.md']

  [tool.poetry.dependencies]
  python = "^3.8"
- playwright = "^1.37.0"
+ playwright = "^1.38.0"
  dateparser = "^1.1.8"
  beautifulsoup4 = {version= "^4.12.2", extras = ["lxml"]}
  w3lib = "^2.1.2"
  requests = {extras = ["socks"], version = "^2.31.0"}
  pydub = {version = "^0.25.1", optional = true}
  SpeechRecognition = {version = "^3.10.0", optional = true}
- pytz = {"version" = "^2023.3", python = "<3.9"}
+ pytz = {"version" = "^2023.3.post1", python = "<3.9"}
  tzdata = "^2023.3"

  [tool.poetry.extras]
@@ -39,11 +39,11 @@ optional = true

  [tool.poetry.group.dev.dependencies]
  types-beautifulsoup4 = "^4.12.0.6"
- pytest = "^7.4.0"
+ pytest = "^7.4.2"
  mypy = "^1.5.1"
  types-dateparser = "^1.1.4.10"
- types-requests = "^2.31.0.2"
- types-pytz = "^2023.3.0.1"
+ types-requests = "^2.31.0.6"
+ types-pytz = "^2023.3.1.1"


  [build-system]