PlaywrightCapture 1.24.6__py3-none-any.whl → 1.24.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- playwrightcapture/capture.py +13 -2
- {playwrightcapture-1.24.6.dist-info → playwrightcapture-1.24.7.dist-info}/METADATA +3 -2
- {playwrightcapture-1.24.6.dist-info → playwrightcapture-1.24.7.dist-info}/RECORD +5 -5
- {playwrightcapture-1.24.6.dist-info → playwrightcapture-1.24.7.dist-info}/LICENSE +0 -0
- {playwrightcapture-1.24.6.dist-info → playwrightcapture-1.24.7.dist-info}/WHEEL +0 -0
playwrightcapture/capture.py
CHANGED
@@ -667,7 +667,6 @@ class Capture():
|
|
667
667
|
parsed_url = urlparse(url, allow_fragments=True)
|
668
668
|
|
669
669
|
try:
|
670
|
-
# NOTE 2022-12-02: allow 15s less than the general timeout to get a DOM
|
671
670
|
await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
|
672
671
|
page.on("download", handle_download)
|
673
672
|
except Error as initial_error:
|
@@ -832,6 +831,7 @@ class Capture():
|
|
832
831
|
else:
|
833
832
|
child_urls = child_urls[:max_captures]
|
834
833
|
self.logger.info(f'Capturing children, {max_captures} URLs')
|
834
|
+
consecutive_errors = 0
|
835
835
|
for index, url in enumerate(child_urls):
|
836
836
|
self.logger.info(f'Capture child {url} - Timeout: {max_capture_time}s')
|
837
837
|
start_time = time.time()
|
@@ -846,13 +846,24 @@ class Capture():
|
|
846
846
|
rendered_hostname_only=rendered_hostname_only,
|
847
847
|
max_depth_capture_time=max_capture_time)
|
848
848
|
to_return['children'].append(child_capture) # type: ignore[union-attr]
|
849
|
-
except (TimeoutError, asyncio.exceptions.TimeoutError):
|
849
|
+
except (TimeoutError, asyncio.exceptions.TimeoutError, asyncio.TimeoutError):
|
850
850
|
self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
|
851
|
+
consecutive_errors += 1
|
851
852
|
except Exception as e:
|
852
853
|
self.logger.warning(f'Error while capturing child "{url}": {e}. {len(child_urls) - index - 1} more to go.')
|
854
|
+
consecutive_errors += 1
|
853
855
|
else:
|
856
|
+
consecutive_errors = 0
|
854
857
|
runtime = int(time.time() - start_time)
|
855
858
|
self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {len(child_urls) - index - 1} to go.')
|
859
|
+
|
860
|
+
if consecutive_errors >= 5:
|
861
|
+
# if we have more than 5 consecutive errors, the capture is most probably broken, breaking.
|
862
|
+
self.logger.warning('Got more than 5 consecutive errors while capturing children, breaking.')
|
863
|
+
to_return['error'] = "Got more than 5 consecutive errors while capturing children"
|
864
|
+
self.should_retry = True
|
865
|
+
break
|
866
|
+
|
856
867
|
try:
|
857
868
|
await page.go_back()
|
858
869
|
except PlaywrightTimeoutError:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.24.6
|
3
|
+
Version: 1.24.7
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -21,11 +21,12 @@ Classifier: Topic :: Internet
|
|
21
21
|
Classifier: Topic :: Security
|
22
22
|
Provides-Extra: recaptcha
|
23
23
|
Requires-Dist: SpeechRecognition (>=3.10.3,<4.0.0) ; extra == "recaptcha"
|
24
|
+
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
24
25
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
|
25
26
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
26
27
|
Requires-Dist: playwright (>=1.43.0,<2.0.0)
|
27
28
|
Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
28
|
-
Requires-Dist: puremagic (>=1.
|
29
|
+
Requires-Dist: puremagic (>=1.22,<2.0)
|
29
30
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
30
31
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
31
32
|
Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
|
@@ -1,9 +1,9 @@
|
|
1
1
|
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
-
playwrightcapture/capture.py,sha256=
|
2
|
+
playwrightcapture/capture.py,sha256=z_5IfCbJl3pmLKnm8YTcxVC0XFPXhaqL9TXLCSQ6t0s,64760
|
3
3
|
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
4
|
playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
|
5
5
|
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
playwrightcapture-1.24.6.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
-
playwrightcapture-1.24.
|
8
|
-
playwrightcapture-1.24.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
-
playwrightcapture-1.24.6.dist-info/RECORD,,
|
6
|
+
playwrightcapture-1.24.7.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
+
playwrightcapture-1.24.7.dist-info/METADATA,sha256=W2FGoCXAodeRAD_ydGPQd8r2KTAQZr1oPAOzOcFzsmU,3149
|
8
|
+
playwrightcapture-1.24.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
+
playwrightcapture-1.24.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|