PlaywrightCapture 1.22.4__tar.gz → 1.22.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: PlaywrightCapture
- Version: 1.22.4
+ Version: 1.22.5
  Summary: A simple library to capture websites using playwright
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
  License: BSD-3-Clause
@@ -28,7 +28,7 @@ Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
  Requires-Dist: pytz (>=2023.3.post1,<2024.0) ; python_version < "3.9"
  Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
- Requires-Dist: setuptools (>=69.0.2,<70.0.0)
+ Requires-Dist: setuptools (>=69.0.3,<70.0.0)
  Requires-Dist: tzdata (>=2023.3,<2024.0)
  Requires-Dist: w3lib (>=2.1.2,<3.0.0)
  Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
@@ -11,9 +11,11 @@ import sys
  import time

  from base64 import b64decode
+ from io import BytesIO
  from tempfile import NamedTemporaryFile
  from typing import Optional, Dict, List, Union, Any, TypedDict, Literal, TYPE_CHECKING, Set, Tuple
  from urllib.parse import urlparse, unquote, urljoin
+ from zipfile import ZipFile

  import dateparser
  import requests
@@ -459,23 +461,29 @@ class Capture():

  to_return: CaptureResponse = {}

- self.wait_for_download = False
+ # We don't need to be super strict on the lock, as it simply triggers a wait for network idle before stopping the capture,
+ # but we still need it to be an integer in case we have more than one download triggered and one finished when the others haven't
+ self.wait_for_download = 0
+
+ # We may have multiple downloads triggered via JS
+ multiple_downloads: List[Tuple[str, bytes]] = []

  async def handle_download(download: Download) -> None:
  # This method is called when a download event is triggered from JS in a page that also renders
  try:
- self.wait_for_download = True
+ self.wait_for_download += 1
  with NamedTemporaryFile() as tmp_f:
  self.logger.info('Got a download triggered from JS.')
  await download.save_as(tmp_f.name)
- to_return["downloaded_filename"] = download.suggested_filename
+ filename = download.suggested_filename
  with open(tmp_f.name, "rb") as f:
- to_return["downloaded_file"] = f.read()
+ file_content = f.read()
+ multiple_downloads.append((filename, file_content))
  self.logger.info('Done with download.')
  except Exception as e:
  self.logger.warning(f'Unable to finish download triggered from JS: {e}')
  finally:
- self.wait_for_download = False
+ self.wait_for_download -= 1

  if page is not None:
  capturing_sub = True
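
The hunk above replaces the boolean wait_for_download flag with an integer counter and appends every finished download to the shared multiple_downloads list, so two overlapping JS-triggered downloads no longer clear each other's "still downloading" state or overwrite each other's result. Below is a minimal, self-contained sketch of that pattern; fake_download, downloads_in_flight and collected are illustrative names and are not part of PlaywrightCapture or Playwright.

import asyncio
from typing import List, Tuple

downloads_in_flight = 0                   # integer counter instead of a True/False flag
collected: List[Tuple[str, bytes]] = []   # every finished download lands here

async def fake_download(name: str, payload: bytes) -> None:
    """Stand-in for the real download handler: count the in-flight download, store the result."""
    global downloads_in_flight
    downloads_in_flight += 1              # one more download in flight
    try:
        await asyncio.sleep(0.1)          # pretend to save the file to disk
        collected.append((name, payload))
    finally:
        downloads_in_flight -= 1          # decrement even on failure, so the counter never sticks

async def main() -> None:
    # Two overlapping downloads: with a boolean flag, the first one to finish
    # would clear the flag while the second is still running.
    await asyncio.gather(fake_download('a.bin', b'AAA'), fake_download('b.bin', b'BBB'))
    assert downloads_in_flight == 0
    print(collected)

asyncio.run(main())
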
@@ -504,13 +512,13 @@
  await page.goto(url, referer=referer if referer else '')
  except Exception:
  pass
- tmp_f = NamedTemporaryFile(delete=False)
- download = await download_info.value
- await download.save_as(tmp_f.name)
- to_return["downloaded_filename"] = download.suggested_filename
- with open(tmp_f.name, "rb") as f:
- to_return["downloaded_file"] = f.read()
- os.unlink(tmp_f.name)
+ with NamedTemporaryFile() as tmp_f:
+ download = await download_info.value
+ await download.save_as(tmp_f.name)
+ filename = download.suggested_filename
+ with open(tmp_f.name, "rb") as f:
+ file_content = f.read()
+ multiple_downloads.append((filename, file_content))
  except PlaywrightTimeoutError:
  self.logger.debug('No download has been triggered.')
  raise initial_error
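
This hunk also swaps the manually managed NamedTemporaryFile(delete=False) plus os.unlink() for a context manager, so the temporary file is removed even if saving or reading raises, and the result is appended to the same multiple_downloads list. A short sketch of that cleanup pattern outside of Playwright; save_download is a hypothetical stand-in for download.save_as(), and re-opening the temporary file by name while it is still open assumes a POSIX platform (as the code above does).

from tempfile import NamedTemporaryFile

def save_download(path: str) -> None:
    # hypothetical stand-in for Playwright's download.save_as()
    with open(path, 'wb') as f:
        f.write(b'downloaded payload')

with NamedTemporaryFile() as tmp_f:
    save_download(tmp_f.name)             # write into the temp file by name
    with open(tmp_f.name, 'rb') as f:
        file_content = f.read()           # read it back as bytes
# the temp file is deleted here automatically; no os.unlink() needed,
# and no leftover file if an exception was raised in between
print(len(file_content))
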
@@ -597,10 +605,24 @@
  if 'html' in to_return and to_return['html'] is not None and with_favicon:
  to_return['potential_favicons'] = self.get_favicons(page.url, to_return['html'])

- if self.wait_for_download:
+ if self.wait_for_download > 0:
  self.logger.info('Waiting for download to finish...')
  await self._safe_wait(page)

+ if multiple_downloads:
+ if len(multiple_downloads) == 1:
+ to_return["downloaded_filename"] = multiple_downloads[0][0]
+ to_return["downloaded_file"] = multiple_downloads[0][1]
+ else:
+ # we have multiple downloads, making it a zip
+ mem_zip = BytesIO()
+ to_return["downloaded_filename"] = 'multiple_downloads.zip'
+ with ZipFile(mem_zip, 'w') as z:
+ for i, f_details in enumerate(multiple_downloads):
+ filename, file_content = f_details
+ z.writestr(f'{i}_{filename}', file_content)
+ to_return["downloaded_file"] = mem_zip.getvalue()
+
  if depth > 0 and to_return.get('html') and to_return['html']:
  if child_urls := self._get_links_from_rendered_page(page.url, to_return['html'], rendered_hostname_only):
  to_return['children'] = []
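
When more than one download was collected, the new code above bundles them into a single in-memory zip via BytesIO and ZipFile (the two new imports), so the caller still receives one downloaded_filename/downloaded_file pair. A runnable sketch of that packaging step, with made-up sample data:

from io import BytesIO
from zipfile import ZipFile

# Sample (filename, bytes) pairs standing in for collected downloads.
multiple_downloads = [('report.pdf', b'%PDF-1.7 ...'), ('data.csv', b'a,b,c\n1,2,3\n')]

mem_zip = BytesIO()
with ZipFile(mem_zip, 'w') as z:
    for i, (filename, file_content) in enumerate(multiple_downloads):
        # the index prefix keeps archive entries unique even if two downloads share a name
        z.writestr(f'{i}_{filename}', file_content)

zipped = mem_zip.getvalue()   # bytes of the whole archive, built without touching disk
print(len(zipped))
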
@@ -630,10 +652,10 @@
  except (TimeoutError, asyncio.exceptions.TimeoutError):
  self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
  except Exception as e:
- self.logger.warning(f'Error while capturing child "{url}": {e}. {total_urls - index - 1} more to go.')
+ self.logger.warning(f'Error while capturing child "{url}": {e}. {len(child_urls) - index - 1} more to go.')
  else:
  runtime = int(time.time() - start_time)
- self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {total_urls - index - 1} to go.')
+ self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {len(child_urls) - index - 1} to go.')
  try:
  await page.go_back()
  except PlaywrightTimeoutError:
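
The last code change only touches the progress messages: the remaining-children count is now derived directly from child_urls, so with enumerate() the number still to capture after the current one is len(child_urls) - index - 1. A tiny illustration with placeholder URLs:

child_urls = ['https://example.com/a', 'https://example.com/b', 'https://example.com/c']
for index, url in enumerate(child_urls):
    remaining = len(child_urls) - index - 1   # children left after this one
    print(f'Captured child URL: {url}. {remaining} to go.')
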
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "PlaywrightCapture"
- version = "1.22.4"
+ version = "1.22.5"
  description = "A simple library to capture websites using playwright"
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
  license = "BSD-3-Clause"
@@ -29,7 +29,7 @@ SpeechRecognition = {version = "^3.10.1", optional = true}
  pytz = {"version" = "^2023.3.post1", python = "<3.9"}
  tzdata = "^2023.3"
  playwright-stealth = "^1.0.6"
- setuptools = "^69.0.2"
+ setuptools = "^69.0.3"

  [tool.poetry.extras]
  recaptcha = ["requests", "pydub", "SpeechRecognition"]
@@ -40,7 +40,7 @@ optional = true
  [tool.poetry.group.dev.dependencies]
  types-beautifulsoup4 = "^4.12.0.7"
  pytest = "^7.4.3"
- mypy = "^1.7.1"
+ mypy = "^1.8.0"
  types-dateparser = "^1.1.4.10"
  types-requests = "^2.31.0.10"
  types-pytz = "^2023.3.1.1"