PlaywrightCapture 1.25.4__tar.gz → 1.25.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.25.4 → playwrightcapture-1.25.6}/PKG-INFO +2 -2
- {playwrightcapture-1.25.4 → playwrightcapture-1.25.6}/playwrightcapture/capture.py +22 -19
- {playwrightcapture-1.25.4 → playwrightcapture-1.25.6}/pyproject.toml +2 -2
- {playwrightcapture-1.25.4 → playwrightcapture-1.25.6}/LICENSE +0 -0
- {playwrightcapture-1.25.4 → playwrightcapture-1.25.6}/README.md +0 -0
- {playwrightcapture-1.25.4 → playwrightcapture-1.25.6}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.25.4 → playwrightcapture-1.25.6}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.25.4 → playwrightcapture-1.25.6}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.25.4 → playwrightcapture-1.25.6}/playwrightcapture/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.25.4
|
3
|
+
Version: 1.25.6
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -31,7 +31,7 @@ Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
|
31
31
|
Requires-Dist: puremagic (>=1.26,<2.0)
|
32
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
33
33
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
34
|
-
Requires-Dist: setuptools (>=
|
34
|
+
Requires-Dist: setuptools (>=71.0.3,<72.0.0)
|
35
35
|
Requires-Dist: tzdata (>=2024.1,<2025.0)
|
36
36
|
Requires-Dist: w3lib (>=2.2.1,<3.0.0)
|
37
37
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
@@ -165,10 +165,15 @@ class Capture():
|
|
165
165
|
if proxy:
|
166
166
|
if isinstance(proxy, str):
|
167
167
|
self.proxy = {'server': proxy}
|
168
|
-
|
168
|
+
elif isinstance(proxy, dict):
|
169
169
|
self.proxy = {'server': proxy['server'], 'bypass': proxy.get('bypass', ''),
|
170
170
|
'username': proxy.get('username', ''),
|
171
171
|
'password': proxy.get('password', '')}
|
172
|
+
elif isinstance(proxy, int):
|
173
|
+
# This is clearly a mistake, just ignoring it
|
174
|
+
self.logger.warning('Proxy is an integer, this is a mistake, ignoring it.')
|
175
|
+
else:
|
176
|
+
raise InvalidPlaywrightParameter(f'Invalid proxy parameter: "{proxy}" ({type(proxy)})')
|
172
177
|
|
173
178
|
self.should_retry: bool = False
|
174
179
|
self.__network_not_idle: int = 2 # makes sure we do not wait for network idle the max amount of time the capture is allowed to take
|
@@ -334,19 +339,10 @@ class Capture():
|
|
334
339
|
return self._headers
|
335
340
|
|
336
341
|
@headers.setter
|
337
|
-
def headers(self, headers:
|
342
|
+
def headers(self, headers: dict[str, str] | None) -> None:
|
338
343
|
if not headers:
|
339
344
|
return
|
340
|
-
if isinstance(headers, str):
|
341
|
-
new_headers: dict[str, str] = {}
|
342
|
-
for header_line in headers.splitlines():
|
343
|
-
if header_line and ':' in header_line:
|
344
|
-
splitted = header_line.split(':', 1)
|
345
|
-
if splitted and len(splitted) == 2:
|
346
|
-
header, h_value = splitted
|
347
|
-
if header.strip() and h_value.strip():
|
348
|
-
new_headers[header.strip()] = h_value.strip()
|
349
|
-
elif isinstance(headers, dict):
|
345
|
+
if isinstance(headers, dict):
|
350
346
|
# Check if they are valid
|
351
347
|
new_headers = {name.strip(): value.strip() for name, value in headers.items() if isinstance(name, str) and isinstance(value, str) and name.strip() and value.strip()}
|
352
348
|
if new_headers != headers:
|
@@ -812,7 +808,7 @@ class Capture():
|
|
812
808
|
try:
|
813
809
|
await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
|
814
810
|
await self._wait_for_random_timeout(page, 2)
|
815
|
-
async with timeout(
|
811
|
+
async with timeout(5):
|
816
812
|
await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
|
817
813
|
self.logger.debug('Jumped to fragment.')
|
818
814
|
except PlaywrightTimeoutError as e:
|
@@ -821,20 +817,24 @@ class Capture():
|
|
821
817
|
self.logger.warning(f'Target closed, unable to go to fragment "{fragment}": {e}')
|
822
818
|
except Error as e:
|
823
819
|
self.logger.exception(f'Unable to go to fragment "{fragment}": {e}')
|
824
|
-
except TimeoutError:
|
820
|
+
except (asyncio.TimeoutError, TimeoutError):
|
825
821
|
self.logger.debug('Unable to scroll due to timeout')
|
822
|
+
except (asyncio.CancelledError):
|
823
|
+
self.logger.debug('Unable to scroll due to timeout, call canceled')
|
826
824
|
else:
|
827
825
|
# scroll more
|
828
826
|
try:
|
829
827
|
# NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes.
|
830
828
|
# 2024-07-08: Also, it sometimes get stuck.
|
831
|
-
async with timeout(
|
829
|
+
async with timeout(5):
|
832
830
|
await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
|
833
831
|
self.logger.debug('Scrolled down.')
|
834
832
|
except Error as e:
|
835
833
|
self.logger.debug(f'Unable to scroll: {e}')
|
836
|
-
except TimeoutError:
|
834
|
+
except (TimeoutError, asyncio.TimeoutError):
|
837
835
|
self.logger.debug('Unable to scroll due to timeout')
|
836
|
+
except (asyncio.CancelledError):
|
837
|
+
self.logger.debug('Unable to scroll due to timeout, call canceled')
|
838
838
|
|
839
839
|
await self._wait_for_random_timeout(page, 3)
|
840
840
|
self.logger.debug('Keep going after moving on page.')
|
@@ -870,7 +870,7 @@ class Capture():
|
|
870
870
|
# async with timeout(3):
|
871
871
|
# await page.clock.run_for("47")
|
872
872
|
# self.logger.debug('Moved time forward.')
|
873
|
-
# except TimeoutError:
|
873
|
+
# except (TimeoutError, asyncio.TimeoutError):
|
874
874
|
# self.logger.warning('Unable to move time forward.')
|
875
875
|
|
876
876
|
self.logger.debug('Done with instrumentation, waiting for network idle.')
|
@@ -919,7 +919,7 @@ class Capture():
|
|
919
919
|
rendered_hostname_only=rendered_hostname_only,
|
920
920
|
max_depth_capture_time=max_capture_time)
|
921
921
|
to_return['children'].append(child_capture) # type: ignore[union-attr]
|
922
|
-
except (TimeoutError, asyncio.
|
922
|
+
except (TimeoutError, asyncio.TimeoutError):
|
923
923
|
self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
|
924
924
|
consecutive_errors += 1
|
925
925
|
except Exception as e:
|
@@ -947,6 +947,9 @@ class Capture():
|
|
947
947
|
except PlaywrightTimeoutError as e:
|
948
948
|
to_return['error'] = f"The capture took too long - {e.message}"
|
949
949
|
self.should_retry = True
|
950
|
+
except (asyncio.TimeoutError, TimeoutError):
|
951
|
+
to_return['error'] = "Something in the capture took too long"
|
952
|
+
self.should_retry = True
|
950
953
|
except TargetClosedError as e:
|
951
954
|
to_return['error'] = f"The target was closed - {e}"
|
952
955
|
self.should_retry = True
|
@@ -1084,7 +1087,7 @@ class Capture():
|
|
1084
1087
|
try:
|
1085
1088
|
async with timeout(30):
|
1086
1089
|
return await page.content()
|
1087
|
-
except (Error, TimeoutError):
|
1090
|
+
except (Error, TimeoutError, asyncio.TimeoutError):
|
1088
1091
|
self.logger.debug('Unable to get page content, trying again.')
|
1089
1092
|
tries -= 1
|
1090
1093
|
await self._wait_for_random_timeout(page, 1)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.25.4"
|
3
|
+
version = "1.25.6"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -28,7 +28,7 @@ SpeechRecognition = {version = "^3.10.4", optional = true}
|
|
28
28
|
pytz = {"version" = "^2024.1", python = "<3.9"}
|
29
29
|
tzdata = "^2024.1"
|
30
30
|
playwright-stealth = "^1.0.6"
|
31
|
-
setuptools = "^
|
31
|
+
setuptools = "^71.0.3"
|
32
32
|
puremagic = "^1.26"
|
33
33
|
async-timeout = {version = "^4.0.3", python = "<3.11"}
|
34
34
|
aiohttp = {extras = ["speedups"], version = "^3.9.5"}
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|