PlaywrightCapture 1.21.8__tar.gz → 1.21.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.21.8 → playwrightcapture-1.21.10}/PKG-INFO +3 -3
- {playwrightcapture-1.21.8 → playwrightcapture-1.21.10}/playwrightcapture/capture.py +35 -9
- {playwrightcapture-1.21.8 → playwrightcapture-1.21.10}/pyproject.toml +6 -6
- {playwrightcapture-1.21.8 → playwrightcapture-1.21.10}/LICENSE +0 -0
- {playwrightcapture-1.21.8 → playwrightcapture-1.21.10}/README.md +0 -0
- {playwrightcapture-1.21.8 → playwrightcapture-1.21.10}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.21.8 → playwrightcapture-1.21.10}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.21.8 → playwrightcapture-1.21.10}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.21.8 → playwrightcapture-1.21.10}/playwrightcapture/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.21.8
|
3
|
+
Version: 1.21.10
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -22,9 +22,9 @@ Provides-Extra: recaptcha
|
|
22
22
|
Requires-Dist: SpeechRecognition (>=3.10.0,<4.0.0) ; extra == "recaptcha"
|
23
23
|
Requires-Dist: beautifulsoup4[lxml] (>=4.12.2,<5.0.0)
|
24
24
|
Requires-Dist: dateparser (>=1.1.8,<2.0.0)
|
25
|
-
Requires-Dist: playwright (>=1.
|
25
|
+
Requires-Dist: playwright (>=1.38.0,<2.0.0)
|
26
26
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
27
|
-
Requires-Dist: pytz (>=2023.3,<2024.0) ; python_version < "3.9"
|
27
|
+
Requires-Dist: pytz (>=2023.3.post1,<2024.0) ; python_version < "3.9"
|
28
28
|
Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
|
29
29
|
Requires-Dist: tzdata (>=2023.3,<2024.0)
|
30
30
|
Requires-Dist: w3lib (>=2.1.2,<3.0.0)
|
@@ -20,7 +20,7 @@ import requests
|
|
20
20
|
|
21
21
|
from bs4 import BeautifulSoup
|
22
22
|
from charset_normalizer import from_bytes
|
23
|
-
from playwright.async_api import async_playwright, Frame, Error, Page
|
23
|
+
from playwright.async_api import async_playwright, Frame, Error, Page, Download
|
24
24
|
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
25
25
|
from w3lib.html import strip_html5_whitespace
|
26
26
|
from w3lib.url import canonicalize_url, safe_url_string
|
@@ -429,21 +429,42 @@ class Capture():
|
|
429
429
|
rendered_hostname_only: bool=True,
|
430
430
|
with_favicon: bool=False
|
431
431
|
) -> CaptureResponse:
|
432
|
+
|
432
433
|
to_return: CaptureResponse = {}
|
433
|
-
try:
|
434
|
-
if page:
|
435
|
-
capturing_sub = True
|
436
|
-
else:
|
437
|
-
capturing_sub = False
|
438
|
-
page = await self.context.new_page()
|
439
|
-
page.set_default_timeout(self._capture_timeout * 1000)
|
440
434
|
|
435
|
+
self.wait_for_download = False
|
436
|
+
|
437
|
+
async def handle_download(download: Download) -> None:
|
438
|
+
# This method is called when a download event is triggered from JS in a page that also renders
|
439
|
+
try:
|
440
|
+
self.wait_for_download = True
|
441
|
+
self.logger.info('Got a download triggered from JS.')
|
442
|
+
tmp_f = NamedTemporaryFile(delete=False)
|
443
|
+
await download.save_as(tmp_f.name)
|
444
|
+
to_return["downloaded_filename"] = download.suggested_filename
|
445
|
+
with open(tmp_f.name, "rb") as f:
|
446
|
+
to_return["downloaded_file"] = f.read()
|
447
|
+
os.unlink(tmp_f.name)
|
448
|
+
self.logger.info('Done with download.')
|
449
|
+
except Exception as e:
|
450
|
+
self.logger.warning(f'Unable to finish download triggered from JS: {e}')
|
451
|
+
finally:
|
452
|
+
self.wait_for_download = False
|
453
|
+
|
454
|
+
if page is not None:
|
455
|
+
capturing_sub = True
|
456
|
+
else:
|
457
|
+
capturing_sub = False
|
458
|
+
page = await self.context.new_page()
|
459
|
+
page.set_default_timeout(self._capture_timeout * 1000)
|
460
|
+
try:
|
441
461
|
# Parse the URL. If there is a fragment, we need to scroll to it manually
|
442
462
|
parsed_url = urlparse(url, allow_fragments=True)
|
443
463
|
|
444
464
|
try:
|
445
465
|
# NOTE 2022-12-02: allow 15s less than the general timeout to get a DOM
|
446
466
|
await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
|
467
|
+
page.on("download", handle_download)
|
447
468
|
except Error as initial_error:
|
448
469
|
self._update_exceptions(initial_error)
|
449
470
|
# So this one is really annoying: chromium raises a net::ERR_ABORTED when it hits a download
|
@@ -547,6 +568,10 @@ class Capture():
|
|
547
568
|
if 'html' in to_return and to_return['html'] is not None and with_favicon:
|
548
569
|
to_return['potential_favicons'] = self.get_favicons(page.url, to_return['html'])
|
549
570
|
|
571
|
+
if self.wait_for_download:
|
572
|
+
self.logger.info('Waiting for download to finish...')
|
573
|
+
await self._safe_wait(page)
|
574
|
+
|
550
575
|
if depth > 0 and to_return.get('html') and to_return['html']:
|
551
576
|
if child_urls := self._get_links_from_rendered_page(page.url, to_return['html'], rendered_hostname_only):
|
552
577
|
to_return['children'] = []
|
@@ -816,7 +841,8 @@ class Capture():
|
|
816
841
|
name, _ = name.split(' at ', maxsplit=1)
|
817
842
|
elif '; ' in name:
|
818
843
|
name, _ = name.split('; ', maxsplit=1)
|
819
|
-
|
844
|
+
# This is kinda dirty.
|
845
|
+
exception._name = name.strip()
|
820
846
|
|
821
847
|
def _exception_is_network_error(self, exception: Error) -> bool:
|
822
848
|
if exception.name in [
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.21.8"
|
3
|
+
version = "1.21.10"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -21,14 +21,14 @@ include = ['README.md']
|
|
21
21
|
|
22
22
|
[tool.poetry.dependencies]
|
23
23
|
python = "^3.8"
|
24
|
-
playwright = "^1.
|
24
|
+
playwright = "^1.38.0"
|
25
25
|
dateparser = "^1.1.8"
|
26
26
|
beautifulsoup4 = {version= "^4.12.2", extras = ["lxml"]}
|
27
27
|
w3lib = "^2.1.2"
|
28
28
|
requests = {extras = ["socks"], version = "^2.31.0"}
|
29
29
|
pydub = {version = "^0.25.1", optional = true}
|
30
30
|
SpeechRecognition = {version = "^3.10.0", optional = true}
|
31
|
-
pytz = {"version" = "^2023.3", python = "<3.9"}
|
31
|
+
pytz = {"version" = "^2023.3.post1", python = "<3.9"}
|
32
32
|
tzdata = "^2023.3"
|
33
33
|
|
34
34
|
[tool.poetry.extras]
|
@@ -39,11 +39,11 @@ optional = true
|
|
39
39
|
|
40
40
|
[tool.poetry.group.dev.dependencies]
|
41
41
|
types-beautifulsoup4 = "^4.12.0.6"
|
42
|
-
pytest = "^7.4.
|
42
|
+
pytest = "^7.4.2"
|
43
43
|
mypy = "^1.5.1"
|
44
44
|
types-dateparser = "^1.1.4.10"
|
45
|
-
types-requests = "^2.31.0.
|
46
|
-
types-pytz = "^2023.3.
|
45
|
+
types-requests = "^2.31.0.6"
|
46
|
+
types-pytz = "^2023.3.1.1"
|
47
47
|
|
48
48
|
|
49
49
|
[build-system]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|