PlaywrightCapture 1.24.6__tar.gz → 1.24.7__tar.gz

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.24.6
3
+ Version: 1.24.7
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -21,11 +21,12 @@ Classifier: Topic :: Internet
21
21
  Classifier: Topic :: Security
22
22
  Provides-Extra: recaptcha
23
23
  Requires-Dist: SpeechRecognition (>=3.10.3,<4.0.0) ; extra == "recaptcha"
24
+ Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
24
25
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
25
26
  Requires-Dist: dateparser (>=1.2.0,<2.0.0)
26
27
  Requires-Dist: playwright (>=1.43.0,<2.0.0)
27
28
  Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
28
- Requires-Dist: puremagic (>=1.21,<2.0)
29
+ Requires-Dist: puremagic (>=1.22,<2.0)
29
30
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
30
31
  Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
31
32
  Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
@@ -667,7 +667,6 @@ class Capture():
667
667
  parsed_url = urlparse(url, allow_fragments=True)
668
668
 
669
669
  try:
670
- # NOTE 2022-12-02: allow 15s less than the general timeout to get a DOM
671
670
  await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
672
671
  page.on("download", handle_download)
673
672
  except Error as initial_error:
@@ -832,6 +831,7 @@ class Capture():
832
831
  else:
833
832
  child_urls = child_urls[:max_captures]
834
833
  self.logger.info(f'Capturing children, {max_captures} URLs')
834
+ consecutive_errors = 0
835
835
  for index, url in enumerate(child_urls):
836
836
  self.logger.info(f'Capture child {url} - Timeout: {max_capture_time}s')
837
837
  start_time = time.time()
@@ -846,13 +846,24 @@ class Capture():
846
846
  rendered_hostname_only=rendered_hostname_only,
847
847
  max_depth_capture_time=max_capture_time)
848
848
  to_return['children'].append(child_capture) # type: ignore[union-attr]
849
- except (TimeoutError, asyncio.exceptions.TimeoutError):
849
+ except (TimeoutError, asyncio.exceptions.TimeoutError, asyncio.TimeoutError):
850
850
  self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
851
+ consecutive_errors += 1
851
852
  except Exception as e:
852
853
  self.logger.warning(f'Error while capturing child "{url}": {e}. {len(child_urls) - index - 1} more to go.')
854
+ consecutive_errors += 1
853
855
  else:
856
+ consecutive_errors = 0
854
857
  runtime = int(time.time() - start_time)
855
858
  self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {len(child_urls) - index - 1} to go.')
859
+
860
+ if consecutive_errors >= 5:
861
+ # if we have 5 or more consecutive errors, the capture is most probably broken, breaking.
862
+ self.logger.warning('Got more than 5 consecutive errors while capturing children, breaking.')
863
+ to_return['error'] = "Got more than 5 consecutive errors while capturing children"
864
+ self.should_retry = True
865
+ break
866
+
856
867
  try:
857
868
  await page.go_back()
858
869
  except PlaywrightTimeoutError:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.24.6"
3
+ version = "1.24.7"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -30,7 +30,8 @@ pytz = {"version" = "^2024.1", python = "<3.9"}
30
30
  tzdata = "^2024.1"
31
31
  playwright-stealth = "^1.0.6"
32
32
  setuptools = "^69.5.1"
33
- puremagic = "^1.21"
33
+ puremagic = "^1.22"
34
+ async-timeout = {version = "^4.0.3", python = "<3.11"}
34
35
 
35
36
  [tool.poetry.extras]
36
37
  recaptcha = ["requests", "pydub", "SpeechRecognition"]
@@ -40,7 +41,7 @@ optional = true
40
41
 
41
42
  [tool.poetry.group.dev.dependencies]
42
43
  types-beautifulsoup4 = "^4.12.0.20240229"
43
- pytest = "^8.1.1"
44
+ pytest = "^8.2.0"
44
45
  mypy = "^1.10.0"
45
46
  types-dateparser = "^1.2.0.20240420"
46
47
  types-requests = "^2.31.0.20240406"