PlaywrightCapture 1.21.9__tar.gz → 1.21.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.21.9 → playwrightcapture-1.21.10}/PKG-INFO +1 -1
- {playwrightcapture-1.21.9 → playwrightcapture-1.21.10}/playwrightcapture/capture.py +27 -1
- {playwrightcapture-1.21.9 → playwrightcapture-1.21.10}/pyproject.toml +3 -3
- {playwrightcapture-1.21.9 → playwrightcapture-1.21.10}/LICENSE +0 -0
- {playwrightcapture-1.21.9 → playwrightcapture-1.21.10}/README.md +0 -0
- {playwrightcapture-1.21.9 → playwrightcapture-1.21.10}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.21.9 → playwrightcapture-1.21.10}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.21.9 → playwrightcapture-1.21.10}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.21.9 → playwrightcapture-1.21.10}/playwrightcapture/py.typed +0 -0
@@ -20,7 +20,7 @@ import requests
|
|
20
20
|
|
21
21
|
from bs4 import BeautifulSoup
|
22
22
|
from charset_normalizer import from_bytes
|
23
|
-
from playwright.async_api import async_playwright, Frame, Error, Page
|
23
|
+
from playwright.async_api import async_playwright, Frame, Error, Page, Download
|
24
24
|
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
25
25
|
from w3lib.html import strip_html5_whitespace
|
26
26
|
from w3lib.url import canonicalize_url, safe_url_string
|
@@ -429,7 +429,28 @@ class Capture():
|
|
429
429
|
rendered_hostname_only: bool=True,
|
430
430
|
with_favicon: bool=False
|
431
431
|
) -> CaptureResponse:
|
432
|
+
|
432
433
|
to_return: CaptureResponse = {}
|
434
|
+
|
435
|
+
self.wait_for_download = False
|
436
|
+
|
437
|
+
async def handle_download(download: Download) -> None:
|
438
|
+
# This method is called when a download event is triggered from JS in a page that also renders
|
439
|
+
try:
|
440
|
+
self.wait_for_download = True
|
441
|
+
self.logger.info('Got a download triggered from JS.')
|
442
|
+
tmp_f = NamedTemporaryFile(delete=False)
|
443
|
+
await download.save_as(tmp_f.name)
|
444
|
+
to_return["downloaded_filename"] = download.suggested_filename
|
445
|
+
with open(tmp_f.name, "rb") as f:
|
446
|
+
to_return["downloaded_file"] = f.read()
|
447
|
+
os.unlink(tmp_f.name)
|
448
|
+
self.logger.info('Done with download.')
|
449
|
+
except Exception as e:
|
450
|
+
self.logger.warning(f'Unable to finish download triggered from JS: {e}')
|
451
|
+
finally:
|
452
|
+
self.wait_for_download = False
|
453
|
+
|
433
454
|
if page is not None:
|
434
455
|
capturing_sub = True
|
435
456
|
else:
|
@@ -443,6 +464,7 @@ class Capture():
|
|
443
464
|
try:
|
444
465
|
# NOTE 2022-12-02: allow 15s less than the general timeout to get a DOM
|
445
466
|
await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
|
467
|
+
page.on("download", handle_download)
|
446
468
|
except Error as initial_error:
|
447
469
|
self._update_exceptions(initial_error)
|
448
470
|
# So this one is really annoying: chromium raises a net::ERR_ABORTED when it hits a download
|
@@ -546,6 +568,10 @@ class Capture():
|
|
546
568
|
if 'html' in to_return and to_return['html'] is not None and with_favicon:
|
547
569
|
to_return['potential_favicons'] = self.get_favicons(page.url, to_return['html'])
|
548
570
|
|
571
|
+
if self.wait_for_download:
|
572
|
+
self.logger.info('Waiting for download to finish...')
|
573
|
+
await self._safe_wait(page)
|
574
|
+
|
549
575
|
if depth > 0 and to_return.get('html') and to_return['html']:
|
550
576
|
if child_urls := self._get_links_from_rendered_page(page.url, to_return['html'], rendered_hostname_only):
|
551
577
|
to_return['children'] = []
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.21.9"
|
3
|
+
version = "1.21.10"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -42,8 +42,8 @@ types-beautifulsoup4 = "^4.12.0.6"
|
|
42
42
|
pytest = "^7.4.2"
|
43
43
|
mypy = "^1.5.1"
|
44
44
|
types-dateparser = "^1.1.4.10"
|
45
|
-
types-requests = "^2.31.0.
|
46
|
-
types-pytz = "^2023.3.1.0"
|
45
|
+
types-requests = "^2.31.0.6"
|
46
|
+
types-pytz = "^2023.3.1.1"
|
47
47
|
|
48
48
|
|
49
49
|
[build-system]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|