PlaywrightCapture 1.22.3__tar.gz → 1.22.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.22.3 → playwrightcapture-1.22.5}/PKG-INFO +3 -3
- {playwrightcapture-1.22.3 → playwrightcapture-1.22.5}/playwrightcapture/capture.py +38 -16
- {playwrightcapture-1.22.3 → playwrightcapture-1.22.5}/pyproject.toml +4 -4
- {playwrightcapture-1.22.3 → playwrightcapture-1.22.5}/LICENSE +0 -0
- {playwrightcapture-1.22.3 → playwrightcapture-1.22.5}/README.md +0 -0
- {playwrightcapture-1.22.3 → playwrightcapture-1.22.5}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.22.3 → playwrightcapture-1.22.5}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.22.3 → playwrightcapture-1.22.5}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.22.3 → playwrightcapture-1.22.5}/playwrightcapture/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.22.3
|
3
|
+
Version: 1.22.5
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3.12
|
|
20
20
|
Classifier: Topic :: Internet
|
21
21
|
Classifier: Topic :: Security
|
22
22
|
Provides-Extra: recaptcha
|
23
|
-
Requires-Dist: SpeechRecognition (>=3.10.
|
23
|
+
Requires-Dist: SpeechRecognition (>=3.10.1,<4.0.0) ; extra == "recaptcha"
|
24
24
|
Requires-Dist: beautifulsoup4[lxml] (>=4.12.2,<5.0.0)
|
25
25
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
26
26
|
Requires-Dist: playwright (>=1.40.0,<2.0.0)
|
@@ -28,7 +28,7 @@ Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
|
28
28
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
29
29
|
Requires-Dist: pytz (>=2023.3.post1,<2024.0) ; python_version < "3.9"
|
30
30
|
Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
|
31
|
-
Requires-Dist: setuptools (>=69.0.
|
31
|
+
Requires-Dist: setuptools (>=69.0.3,<70.0.0)
|
32
32
|
Requires-Dist: tzdata (>=2023.3,<2024.0)
|
33
33
|
Requires-Dist: w3lib (>=2.1.2,<3.0.0)
|
34
34
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
@@ -11,9 +11,11 @@ import sys
|
|
11
11
|
import time
|
12
12
|
|
13
13
|
from base64 import b64decode
|
14
|
+
from io import BytesIO
|
14
15
|
from tempfile import NamedTemporaryFile
|
15
16
|
from typing import Optional, Dict, List, Union, Any, TypedDict, Literal, TYPE_CHECKING, Set, Tuple
|
16
17
|
from urllib.parse import urlparse, unquote, urljoin
|
18
|
+
from zipfile import ZipFile
|
17
19
|
|
18
20
|
import dateparser
|
19
21
|
import requests
|
@@ -375,7 +377,7 @@ class Capture():
|
|
375
377
|
default_context_settings.pop('is_mobile')
|
376
378
|
|
377
379
|
# FIXME: video for debug
|
378
|
-
|
380
|
+
default_context_settings['record_video_dir'] = './videos/'
|
379
381
|
|
380
382
|
self.context = await self.browser.new_context(**default_context_settings) # type: ignore
|
381
383
|
self.context.set_default_timeout(self._capture_timeout * 1000)
|
@@ -459,23 +461,29 @@ class Capture():
|
|
459
461
|
|
460
462
|
to_return: CaptureResponse = {}
|
461
463
|
|
462
|
-
|
464
|
+
# We don't need to be super strict on the lock, as it simply triggers a wait for network idle before stoping the capture
|
465
|
+
# but we still need it to be an integer in case we have more than one download triggered and one finished when the others haven't
|
466
|
+
self.wait_for_download = 0
|
467
|
+
|
468
|
+
# We may have multiple download triggered via JS
|
469
|
+
multiple_downloads: List[Tuple[str, bytes]] = []
|
463
470
|
|
464
471
|
async def handle_download(download: Download) -> None:
|
465
472
|
# This method is called when a download event is triggered from JS in a page that also renders
|
466
473
|
try:
|
467
|
-
self.wait_for_download
|
474
|
+
self.wait_for_download += 1
|
468
475
|
with NamedTemporaryFile() as tmp_f:
|
469
476
|
self.logger.info('Got a download triggered from JS.')
|
470
477
|
await download.save_as(tmp_f.name)
|
471
|
-
|
478
|
+
filename = download.suggested_filename
|
472
479
|
with open(tmp_f.name, "rb") as f:
|
473
|
-
|
480
|
+
file_content = f.read()
|
481
|
+
multiple_downloads.append((filename, file_content))
|
474
482
|
self.logger.info('Done with download.')
|
475
483
|
except Exception as e:
|
476
484
|
self.logger.warning(f'Unable to finish download triggered from JS: {e}')
|
477
485
|
finally:
|
478
|
-
self.wait_for_download
|
486
|
+
self.wait_for_download -= 1
|
479
487
|
|
480
488
|
if page is not None:
|
481
489
|
capturing_sub = True
|
@@ -504,13 +512,13 @@ class Capture():
|
|
504
512
|
await page.goto(url, referer=referer if referer else '')
|
505
513
|
except Exception:
|
506
514
|
pass
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
515
|
+
with NamedTemporaryFile() as tmp_f:
|
516
|
+
download = await download_info.value
|
517
|
+
await download.save_as(tmp_f.name)
|
518
|
+
filename = download.suggested_filename
|
519
|
+
with open(tmp_f.name, "rb") as f:
|
520
|
+
file_content = f.read()
|
521
|
+
multiple_downloads.append((filename, file_content))
|
514
522
|
except PlaywrightTimeoutError:
|
515
523
|
self.logger.debug('No download has been triggered.')
|
516
524
|
raise initial_error
|
@@ -597,10 +605,24 @@ class Capture():
|
|
597
605
|
if 'html' in to_return and to_return['html'] is not None and with_favicon:
|
598
606
|
to_return['potential_favicons'] = self.get_favicons(page.url, to_return['html'])
|
599
607
|
|
600
|
-
if self.wait_for_download:
|
608
|
+
if self.wait_for_download > 0:
|
601
609
|
self.logger.info('Waiting for download to finish...')
|
602
610
|
await self._safe_wait(page)
|
603
611
|
|
612
|
+
if multiple_downloads:
|
613
|
+
if len(multiple_downloads) == 1:
|
614
|
+
to_return["downloaded_filename"] = multiple_downloads[0][0]
|
615
|
+
to_return["downloaded_file"] = multiple_downloads[0][1]
|
616
|
+
else:
|
617
|
+
# we have multiple downloads, making it a zip
|
618
|
+
mem_zip = BytesIO()
|
619
|
+
to_return["downloaded_filename"] = 'multiple_downloads.zip'
|
620
|
+
with ZipFile(mem_zip, 'w') as z:
|
621
|
+
for i, f_details in enumerate(multiple_downloads):
|
622
|
+
filename, file_content = f_details
|
623
|
+
z.writestr(f'{i}_{filename}', file_content)
|
624
|
+
to_return["downloaded_file"] = mem_zip.getvalue()
|
625
|
+
|
604
626
|
if depth > 0 and to_return.get('html') and to_return['html']:
|
605
627
|
if child_urls := self._get_links_from_rendered_page(page.url, to_return['html'], rendered_hostname_only):
|
606
628
|
to_return['children'] = []
|
@@ -630,10 +652,10 @@ class Capture():
|
|
630
652
|
except (TimeoutError, asyncio.exceptions.TimeoutError):
|
631
653
|
self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
|
632
654
|
except Exception as e:
|
633
|
-
self.logger.warning(f'Error while capturing child "{url}": {e}. {
|
655
|
+
self.logger.warning(f'Error while capturing child "{url}": {e}. {len(child_urls) - index - 1} more to go.')
|
634
656
|
else:
|
635
657
|
runtime = int(time.time() - start_time)
|
636
|
-
self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {
|
658
|
+
self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {len(child_urls) - index - 1} to go.')
|
637
659
|
try:
|
638
660
|
await page.go_back()
|
639
661
|
except PlaywrightTimeoutError:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.22.3"
|
3
|
+
version = "1.22.5"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -25,11 +25,11 @@ beautifulsoup4 = {version= "^4.12.2", extras = ["lxml"]}
|
|
25
25
|
w3lib = "^2.1.2"
|
26
26
|
requests = {extras = ["socks"], version = "^2.31.0"}
|
27
27
|
pydub = {version = "^0.25.1", optional = true}
|
28
|
-
SpeechRecognition = {version = "^3.10.
|
28
|
+
SpeechRecognition = {version = "^3.10.1", optional = true}
|
29
29
|
pytz = {"version" = "^2023.3.post1", python = "<3.9"}
|
30
30
|
tzdata = "^2023.3"
|
31
31
|
playwright-stealth = "^1.0.6"
|
32
|
-
setuptools = "^69.0.
|
32
|
+
setuptools = "^69.0.3"
|
33
33
|
|
34
34
|
[tool.poetry.extras]
|
35
35
|
recaptcha = ["requests", "pydub", "SpeechRecognition"]
|
@@ -40,7 +40,7 @@ optional = true
|
|
40
40
|
[tool.poetry.group.dev.dependencies]
|
41
41
|
types-beautifulsoup4 = "^4.12.0.7"
|
42
42
|
pytest = "^7.4.3"
|
43
|
-
mypy = "^1.
|
43
|
+
mypy = "^1.8.0"
|
44
44
|
types-dateparser = "^1.1.4.10"
|
45
45
|
types-requests = "^2.31.0.10"
|
46
46
|
types-pytz = "^2023.3.1.1"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|