PlaywrightCapture 1.25.4__tar.gz → 1.25.6__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.25.4
3
+ Version: 1.25.6
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -31,7 +31,7 @@ Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
31
31
  Requires-Dist: puremagic (>=1.26,<2.0)
32
32
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
33
33
  Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
34
- Requires-Dist: setuptools (>=70.3.0,<71.0.0)
34
+ Requires-Dist: setuptools (>=71.0.3,<72.0.0)
35
35
  Requires-Dist: tzdata (>=2024.1,<2025.0)
36
36
  Requires-Dist: w3lib (>=2.2.1,<3.0.0)
37
37
  Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
@@ -165,10 +165,15 @@ class Capture():
165
165
  if proxy:
166
166
  if isinstance(proxy, str):
167
167
  self.proxy = {'server': proxy}
168
- else:
168
+ elif isinstance(proxy, dict):
169
169
  self.proxy = {'server': proxy['server'], 'bypass': proxy.get('bypass', ''),
170
170
  'username': proxy.get('username', ''),
171
171
  'password': proxy.get('password', '')}
172
+ elif isinstance(proxy, int):
173
+ # This is clearly a mistake, just ignoring it
174
+ self.logger.warning('Proxy is an integer, this is a mistake, ignoring it.')
175
+ else:
176
+ raise InvalidPlaywrightParameter(f'Invalid proxy parameter: "{proxy}" ({type(proxy)})')
172
177
 
173
178
  self.should_retry: bool = False
174
179
  self.__network_not_idle: int = 2 # makes sure we do not wait for network idle the max amount of time the capture is allowed to take
@@ -334,19 +339,10 @@ class Capture():
334
339
  return self._headers
335
340
 
336
341
  @headers.setter
337
- def headers(self, headers: str | dict[str, str] | None) -> None:
342
+ def headers(self, headers: dict[str, str] | None) -> None:
338
343
  if not headers:
339
344
  return
340
- if isinstance(headers, str):
341
- new_headers: dict[str, str] = {}
342
- for header_line in headers.splitlines():
343
- if header_line and ':' in header_line:
344
- splitted = header_line.split(':', 1)
345
- if splitted and len(splitted) == 2:
346
- header, h_value = splitted
347
- if header.strip() and h_value.strip():
348
- new_headers[header.strip()] = h_value.strip()
349
- elif isinstance(headers, dict):
345
+ if isinstance(headers, dict):
350
346
  # Check if they are valid
351
347
  new_headers = {name.strip(): value.strip() for name, value in headers.items() if isinstance(name, str) and isinstance(value, str) and name.strip() and value.strip()}
352
348
  if new_headers != headers:
@@ -812,7 +808,7 @@ class Capture():
812
808
  try:
813
809
  await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
814
810
  await self._wait_for_random_timeout(page, 2)
815
- async with timeout(3):
811
+ async with timeout(5):
816
812
  await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
817
813
  self.logger.debug('Jumped to fragment.')
818
814
  except PlaywrightTimeoutError as e:
@@ -821,20 +817,24 @@ class Capture():
821
817
  self.logger.warning(f'Target closed, unable to go to fragment "{fragment}": {e}')
822
818
  except Error as e:
823
819
  self.logger.exception(f'Unable to go to fragment "{fragment}": {e}')
824
- except TimeoutError:
820
+ except (asyncio.TimeoutError, TimeoutError):
825
821
  self.logger.debug('Unable to scroll due to timeout')
822
+ except (asyncio.CancelledError):
823
+ self.logger.debug('Unable to scroll due to timeout, call canceled')
826
824
  else:
827
825
  # scroll more
828
826
  try:
829
827
  # NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes.
830
828
  # 2024-07-08: Also, it sometimes get stuck.
831
- async with timeout(3):
829
+ async with timeout(5):
832
830
  await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
833
831
  self.logger.debug('Scrolled down.')
834
832
  except Error as e:
835
833
  self.logger.debug(f'Unable to scroll: {e}')
836
- except TimeoutError:
834
+ except (TimeoutError, asyncio.TimeoutError):
837
835
  self.logger.debug('Unable to scroll due to timeout')
836
+ except (asyncio.CancelledError):
837
+ self.logger.debug('Unable to scroll due to timeout, call canceled')
838
838
 
839
839
  await self._wait_for_random_timeout(page, 3)
840
840
  self.logger.debug('Keep going after moving on page.')
@@ -870,7 +870,7 @@ class Capture():
870
870
  # async with timeout(3):
871
871
  # await page.clock.run_for("47")
872
872
  # self.logger.debug('Moved time forward.')
873
- # except TimeoutError:
873
+ # except (TimeoutError, asyncio.TimeoutError):
874
874
  # self.logger.warning('Unable to move time forward.')
875
875
 
876
876
  self.logger.debug('Done with instrumentation, waiting for network idle.')
@@ -919,7 +919,7 @@ class Capture():
919
919
  rendered_hostname_only=rendered_hostname_only,
920
920
  max_depth_capture_time=max_capture_time)
921
921
  to_return['children'].append(child_capture) # type: ignore[union-attr]
922
- except (TimeoutError, asyncio.exceptions.TimeoutError, asyncio.TimeoutError):
922
+ except (TimeoutError, asyncio.TimeoutError):
923
923
  self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
924
924
  consecutive_errors += 1
925
925
  except Exception as e:
@@ -947,6 +947,9 @@ class Capture():
947
947
  except PlaywrightTimeoutError as e:
948
948
  to_return['error'] = f"The capture took too long - {e.message}"
949
949
  self.should_retry = True
950
+ except (asyncio.TimeoutError, TimeoutError):
951
+ to_return['error'] = "Something in the capture took too long"
952
+ self.should_retry = True
950
953
  except TargetClosedError as e:
951
954
  to_return['error'] = f"The target was closed - {e}"
952
955
  self.should_retry = True
@@ -1084,7 +1087,7 @@ class Capture():
1084
1087
  try:
1085
1088
  async with timeout(30):
1086
1089
  return await page.content()
1087
- except (Error, TimeoutError):
1090
+ except (Error, TimeoutError, asyncio.TimeoutError):
1088
1091
  self.logger.debug('Unable to get page content, trying again.')
1089
1092
  tries -= 1
1090
1093
  await self._wait_for_random_timeout(page, 1)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.25.4"
3
+ version = "1.25.6"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -28,7 +28,7 @@ SpeechRecognition = {version = "^3.10.4", optional = true}
28
28
  pytz = {"version" = "^2024.1", python = "<3.9"}
29
29
  tzdata = "^2024.1"
30
30
  playwright-stealth = "^1.0.6"
31
- setuptools = "^70.3.0"
31
+ setuptools = "^71.0.3"
32
32
  puremagic = "^1.26"
33
33
  async-timeout = {version = "^4.0.3", python = "<3.11"}
34
34
  aiohttp = {extras = ["speedups"], version = "^3.9.5"}