PlaywrightCapture 1.24.6__tar.gz → 1.24.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.24.6 → playwrightcapture-1.24.8}/PKG-INFO +3 -2
- {playwrightcapture-1.24.6 → playwrightcapture-1.24.8}/playwrightcapture/capture.py +22 -4
- {playwrightcapture-1.24.6 → playwrightcapture-1.24.8}/pyproject.toml +4 -3
- {playwrightcapture-1.24.6 → playwrightcapture-1.24.8}/LICENSE +0 -0
- {playwrightcapture-1.24.6 → playwrightcapture-1.24.8}/README.md +0 -0
- {playwrightcapture-1.24.6 → playwrightcapture-1.24.8}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.24.6 → playwrightcapture-1.24.8}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.24.6 → playwrightcapture-1.24.8}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.24.6 → playwrightcapture-1.24.8}/playwrightcapture/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.24.6
|
3
|
+
Version: 1.24.8
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -21,11 +21,12 @@ Classifier: Topic :: Internet
|
|
21
21
|
Classifier: Topic :: Security
|
22
22
|
Provides-Extra: recaptcha
|
23
23
|
Requires-Dist: SpeechRecognition (>=3.10.3,<4.0.0) ; extra == "recaptcha"
|
24
|
+
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
24
25
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
|
25
26
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
26
27
|
Requires-Dist: playwright (>=1.43.0,<2.0.0)
|
27
28
|
Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
28
|
-
Requires-Dist: puremagic (>=1.
|
29
|
+
Requires-Dist: puremagic (>=1.22,<2.0)
|
29
30
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
30
31
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
31
32
|
Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
|
@@ -667,7 +667,6 @@ class Capture():
|
|
667
667
|
parsed_url = urlparse(url, allow_fragments=True)
|
668
668
|
|
669
669
|
try:
|
670
|
-
# NOTE 2022-12-02: allow 15s less than the general timeout to get a DOM
|
671
670
|
await page.goto(url, wait_until='domcontentloaded', referer=referer if referer else '')
|
672
671
|
page.on("download", handle_download)
|
673
672
|
except Error as initial_error:
|
@@ -832,6 +831,7 @@ class Capture():
|
|
832
831
|
else:
|
833
832
|
child_urls = child_urls[:max_captures]
|
834
833
|
self.logger.info(f'Capturing children, {max_captures} URLs')
|
834
|
+
consecutive_errors = 0
|
835
835
|
for index, url in enumerate(child_urls):
|
836
836
|
self.logger.info(f'Capture child {url} - Timeout: {max_capture_time}s')
|
837
837
|
start_time = time.time()
|
@@ -846,13 +846,24 @@ class Capture():
|
|
846
846
|
rendered_hostname_only=rendered_hostname_only,
|
847
847
|
max_depth_capture_time=max_capture_time)
|
848
848
|
to_return['children'].append(child_capture) # type: ignore[union-attr]
|
849
|
-
except (TimeoutError, asyncio.exceptions.TimeoutError):
|
849
|
+
except (TimeoutError, asyncio.exceptions.TimeoutError, asyncio.TimeoutError):
|
850
850
|
self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
|
851
|
+
consecutive_errors += 1
|
851
852
|
except Exception as e:
|
852
853
|
self.logger.warning(f'Error while capturing child "{url}": {e}. {len(child_urls) - index - 1} more to go.')
|
854
|
+
consecutive_errors += 1
|
853
855
|
else:
|
856
|
+
consecutive_errors = 0
|
854
857
|
runtime = int(time.time() - start_time)
|
855
858
|
self.logger.info(f'Successfully captured child URL: {url} in {runtime}s. {len(child_urls) - index - 1} to go.')
|
859
|
+
|
860
|
+
if consecutive_errors >= 5:
|
861
|
+
# if we have 5 or more consecutive errors, the capture is most probably broken, breaking.
|
862
|
+
self.logger.warning('Got more than 5 consecutive errors while capturing children, breaking.')
|
863
|
+
to_return['error'] = "Got more than 5 consecutive errors while capturing children"
|
864
|
+
self.should_retry = True
|
865
|
+
break
|
866
|
+
|
856
867
|
try:
|
857
868
|
await page.go_back()
|
858
869
|
except PlaywrightTimeoutError:
|
@@ -1278,8 +1289,15 @@ class Capture():
|
|
1278
1289
|
session.verify = False
|
1279
1290
|
session.headers['user-agent'] = self.user_agent
|
1280
1291
|
if self.proxy and self.proxy.get('server'):
|
1281
|
-
|
1282
|
-
|
1292
|
+
proxy_server = self.proxy['server']
|
1293
|
+
# Make sure the DNS resolution is done remotely
|
1294
|
+
# https://urllib3.readthedocs.io/en/stable/advanced-usage.html#socks-proxies
|
1295
|
+
if proxy_server.startswith('socks5://'):
|
1296
|
+
proxy_server = proxy_server.replace('socks5://', 'socks5h://')
|
1297
|
+
if proxy_server.startswith('socks4://'):
|
1298
|
+
proxy_server = proxy_server.replace('socks4://', 'socks4a://')
|
1299
|
+
|
1300
|
+
proxies = {'http': proxy_server, 'https': proxy_server}
|
1283
1301
|
session.proxies.update(proxies)
|
1284
1302
|
for u in to_fetch:
|
1285
1303
|
try:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.24.6"
|
3
|
+
version = "1.24.8"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -30,7 +30,8 @@ pytz = {"version" = "^2024.1", python = "<3.9"}
|
|
30
30
|
tzdata = "^2024.1"
|
31
31
|
playwright-stealth = "^1.0.6"
|
32
32
|
setuptools = "^69.5.1"
|
33
|
-
puremagic = "^1.
|
33
|
+
puremagic = "^1.22"
|
34
|
+
async-timeout = {version = "^4.0.3", python = "<3.11"}
|
34
35
|
|
35
36
|
[tool.poetry.extras]
|
36
37
|
recaptcha = ["requests", "pydub", "SpeechRecognition"]
|
@@ -40,7 +41,7 @@ optional = true
|
|
40
41
|
|
41
42
|
[tool.poetry.group.dev.dependencies]
|
42
43
|
types-beautifulsoup4 = "^4.12.0.20240229"
|
43
|
-
pytest = "^8.
|
44
|
+
pytest = "^8.2.0"
|
44
45
|
mypy = "^1.10.0"
|
45
46
|
types-dateparser = "^1.2.0.20240420"
|
46
47
|
types-requests = "^2.31.0.20240406"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|