PyPI - PlaywrightCapture - Versions diffs - 1.22.5__tar.gz → 1.22.7__tar.gz - Mend

PlaywrightCapture 1.22.5__tar.gz → 1.22.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

{playwrightcapture-1.22.5 → playwrightcapture-1.22.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: PlaywrightCapture
-Version: 1.22.5
+Version: 1.22.7
 Summary: A simple library to capture websites using playwright
 Home-page: https://github.com/Lookyloo/PlaywrightCapture
 License: BSD-3-Clause
@@ -21,15 +21,15 @@ Classifier: Topic :: Internet
 Classifier: Topic :: Security
 Provides-Extra: recaptcha
 Requires-Dist: SpeechRecognition (>=3.10.1,<4.0.0) ; extra == "recaptcha"
-Requires-Dist: beautifulsoup4[lxml] (>=4.12.2,<5.0.0)
+Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
 Requires-Dist: dateparser (>=1.2.0,<2.0.0)
-Requires-Dist: playwright (>=1.40.0,<2.0.0)
+Requires-Dist: playwright (>=1.41.0,<2.0.0)
 Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
 Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
 Requires-Dist: pytz (>=2023.3.post1,<2024.0) ; python_version < "3.9"
 Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
 Requires-Dist: setuptools (>=69.0.3,<70.0.0)
-Requires-Dist: tzdata (>=2023.3,<2024.0)
+Requires-Dist: tzdata (>=2023.4,<2024.0)
 Requires-Dist: w3lib (>=2.1.2,<3.0.0)
 Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
 Description-Content-Type: text/markdown

{playwrightcapture-1.22.5 → playwrightcapture-1.22.7}/playwrightcapture/__init__.py RENAMED Viewed

@@ -3,3 +3,13 @@ from .helpers import get_devices  # noqa
 from .exceptions import (PlaywrightCaptureException, UnknownPlaywrightDeviceType,  # noqa
                          UnknownPlaywrightBrowser, UnknownPlaywrightDevice,
                          InvalidPlaywrightParameter)
+__all__ = [
+    'Capture',
+    'get_devices',
+    'PlaywrightCaptureException',
+    'UnknownPlaywrightDeviceType',
+    'UnknownPlaywrightBrowser',
+    'UnknownPlaywrightDevice',
+    'InvalidPlaywrightParameter'
+]

{playwrightcapture-1.22.5 → playwrightcapture-1.22.7}/playwrightcapture/capture.py RENAMED Viewed

@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
+from __future__ import annotations
 import asyncio
 import binascii
 import json
@@ -13,7 +15,7 @@ import time
 from base64 import b64decode
 from io import BytesIO
 from tempfile import NamedTemporaryFile
-from typing import Optional, Dict, List, Union, Any, TypedDict, Literal, TYPE_CHECKING, Set, Tuple
+from typing import Any, TypedDict, Literal, TYPE_CHECKING
 from urllib.parse import urlparse, unquote, urljoin
 from zipfile import ZipFile
@@ -54,32 +56,32 @@ except ImportError:
 class CaptureResponse(TypedDict, total=False):
     last_redirected_url: str
-    har: Optional[Dict[str, Any]]
-    cookies: Optional[List['Cookie']]
-    error: Optional[str]
-    error_name: Optional[str]
-    html: Optional[str]
-    png: Optional[bytes]
-    downloaded_filename: Optional[str]
-    downloaded_file: Optional[bytes]
-    children: Optional[List['CaptureResponse']]
+    har: dict[str, Any] | None
+    cookies: list[Cookie] | None
+    error: str | None
+    error_name: str | None
+    html: str | None
+    png: bytes | None
+    downloaded_filename: str | None
+    downloaded_file: bytes | None
+    children: list[CaptureResponse] | None
     # One day, playwright will support getting the favicon from the capture itself
     # favicon: Optional[bytes]
     # in the meantime, we need a workaround: https://github.com/Lookyloo/PlaywrightCapture/issues/45
-    potential_favicons: Optional[Set[bytes]]
+    potential_favicons: set[bytes] | None
 class Capture():
-    _browsers: List['BROWSER'] = ['chromium', 'firefox', 'webkit']
-    _default_viewport: 'ViewportSize' = {'width': 1920, 'height': 1080}
+    _browsers: list[BROWSER] = ['chromium', 'firefox', 'webkit']
+    _default_viewport: ViewportSize = {'width': 1920, 'height': 1080}
     _default_timeout: int = 90  # set to 90s by default
     _minimal_timeout: int = 15  # set to 15s - It makes little sense to attempt a capture below that limit.
-    def __init__(self, browser: Optional['BROWSER']=None, device_name: Optional[str]=None,
-                 proxy: Optional[Union[str, Dict[str, str]]]=None,
-                 general_timeout_in_sec: Optional[int] = None, loglevel: str='INFO'):
+    def __init__(self, browser: BROWSER | None=None, device_name: str | None=None,
+                 proxy: str | dict[str, str] | None=None,
+                 general_timeout_in_sec: int | None = None, loglevel: str='INFO'):
         """Captures a page with Playwright.
         :param browser: The browser to use for the capture.
@@ -100,8 +102,8 @@ class Capture():
                 self.logger.warning(f'Timeout given: {general_timeout_in_sec}s. Ignoring that as it makes little sense to attempt to capture a page in less than {self._minimal_timeout}s.')
                 self._capture_timeout = self._minimal_timeout
-        self.device_name: Optional[str] = device_name
-        self.proxy: 'ProxySettings' = {}
+        self.device_name: str | None = device_name
+        self.proxy: ProxySettings = {}
         if proxy:
             if isinstance(proxy, str):
                 self.proxy = {'server': proxy}
@@ -112,11 +114,11 @@ class Capture():
         self.should_retry: bool = False
         self.__network_not_idle: int = 1
-        self._cookies: List['SetCookieParam'] = []
-        self._http_credentials: 'HttpCredentials' = {}
-        self._geolocation: 'Geolocation' = {}
-        self._headers: 'Headers' = {}
-        self._viewport: Optional['ViewportSize'] = None
+        self._cookies: list[SetCookieParam] = []
+        self._http_credentials: HttpCredentials = {}
+        self._geolocation: Geolocation = {}
+        self._headers: Headers = {}
+        self._viewport: ViewportSize | None = None
         self._user_agent: str = ''
         self._timezone_id: str = ''
         self._locale: str = ''
@@ -162,7 +164,7 @@ class Capture():
         return self._locale
     @locale.setter
-    def locale(self, locale: Optional[str]) -> None:
+    def locale(self, locale: str | None) -> None:
         if locale:
             self._locale = locale
@@ -171,7 +173,7 @@ class Capture():
         return self._timezone_id
     @timezone_id.setter
-    def timezone_id(self, timezone_id: Optional[str]) -> None:
+    def timezone_id(self, timezone_id: str | None) -> None:
         if not timezone_id:
             return
         if timezone_id in all_timezones_set:
@@ -180,11 +182,11 @@ class Capture():
             raise InvalidPlaywrightParameter(f'The Timezone ID provided ({timezone_id}) is invalid.')
     @property
-    def http_credentials(self) -> 'HttpCredentials':
+    def http_credentials(self) -> HttpCredentials:
         return self._http_credentials
     @http_credentials.setter
-    def http_credentials(self, credentials: Optional[Dict[str, str]]) -> None:
+    def http_credentials(self, credentials: dict[str, str] | None) -> None:
         if not credentials:
             return
         if 'username' in credentials and 'password' in credentials:
@@ -195,15 +197,15 @@ class Capture():
         else:
             raise InvalidPlaywrightParameter(f'At least a username and a password are required in the credentials: {credentials}')
-    def set_http_credentials(self, username: str, password: str, origin: Optional[str]=None) -> None:
+    def set_http_credentials(self, username: str, password: str, origin: str | None=None) -> None:
         self._http_credentials = {'username': username, 'password': password, 'origin': origin}
     @property
-    def geolocation(self) -> 'Geolocation':
+    def geolocation(self) -> Geolocation:
         return self._geolocation
     @geolocation.setter
-    def geolocation(self, geolocation: Optional[Dict[str, Union[str, int, float]]]) -> None:
+    def geolocation(self, geolocation: dict[str, str | int | float] | None) -> None:
         if not geolocation:
             return
         if 'latitude' in geolocation and 'longitude' in geolocation:
@@ -215,18 +217,18 @@ class Capture():
             raise InvalidPlaywrightParameter(f'At least a latitude and a longitude are required in the geolocation: {geolocation}')
     @property
-    def cookies(self) -> List['SetCookieParam']:
+    def cookies(self) -> list[SetCookieParam]:
         return self._cookies
     @cookies.setter
-    def cookies(self, cookies: Optional[List[Dict[str, Any]]]) -> None:
+    def cookies(self, cookies: list[dict[str, Any]] | None) -> None:
         '''Cookies to send along to the initial request.
         :param cookies: The cookies, in this format: https://playwright.dev/python/docs/api/class-browsercontext#browser-context-add-cookies
         '''
         if not cookies:
             return
         for cookie in cookies:
-            c: 'SetCookieParam' = {
+            c: SetCookieParam = {
                 'name': cookie['name'],
                 'value': cookie['value'],
             }
@@ -266,15 +268,15 @@ class Capture():
                 self.logger.warning(f'The cookie must have a URL ({url}) or a domain ({domain}) and a path ({path})')
     @property
-    def headers(self) -> 'Headers':
+    def headers(self) -> Headers:
         return self._headers
     @headers.setter
-    def headers(self, headers: Optional[Union[str, Dict[str, str]]]) -> None:
+    def headers(self, headers: str | dict[str, str] | None) -> None:
         if not headers:
             return
         if isinstance(headers, str):
-            new_headers: Dict[str, str] = {}
+            new_headers: dict[str, str] = {}
             for header_line in headers.splitlines():
                 if header_line and ':' in header_line:
                     splitted = header_line.split(':', 1)
@@ -290,7 +292,7 @@ class Capture():
         else:
             # This shouldn't happen, but we also cannot ensure the calls leading to this are following the specs,
             # and playwright dislikes invalid HTTP headers so we rather drop them.
-            self.logger.info(f'Wrong type of headers ({type(headers)}): {headers}')  # type: ignore[unreachable]
+            self.logger.info(f'Wrong type of headers ({type(headers)}): {headers}')
             return
         # Validate the new headers, only a subset of characters are accepted
@@ -305,11 +307,11 @@ class Capture():
             self._headers[name] = value
     @property
-    def viewport(self) -> Optional['ViewportSize']:
+    def viewport(self) -> ViewportSize | None:
         return self._viewport
     @viewport.setter
-    def viewport(self, viewport: Optional[Dict[str, Union[str, int]]]) -> None:
+    def viewport(self, viewport: dict[str, str | int] | None) -> None:
         if not viewport:
             return
         if 'width' in viewport and 'height' in viewport:
@@ -322,7 +324,7 @@ class Capture():
         return self._user_agent
     @user_agent.setter
-    def user_agent(self, user_agent: Optional[str]) -> None:
+    def user_agent(self, user_agent: str | None) -> None:
         if user_agent is not None:
             self._user_agent = user_agent
@@ -331,7 +333,7 @@ class Capture():
         return self._color_scheme
     @color_scheme.setter
-    def color_scheme(self, color_scheme: Optional[str]) -> None:
+    def color_scheme(self, color_scheme: str | None) -> None:
         if not color_scheme:
             return
         schemes = ['light', 'dark', 'no-preference']
@@ -377,7 +379,7 @@ class Capture():
             default_context_settings.pop('is_mobile')
         # FIXME: video for debug
-        default_context_settings['record_video_dir'] = './videos/'
+        # default_context_settings['record_video_dir'] = './videos/'
         self.context = await self.browser.new_context(**default_context_settings)  # type: ignore
         self.context.set_default_timeout(self._capture_timeout * 1000)
@@ -453,8 +455,8 @@ class Capture():
             self.logger.info(f'Unable to find Cloudflare locator: {e}')
     async def capture_page(self, url: str, *, max_depth_capture_time: int,
-                           referer: Optional[str]=None,
-                           page: Optional[Page]=None, depth: int=0,
+                           referer: str | None=None,
+                           page: Page | None=None, depth: int=0,
                            rendered_hostname_only: bool=True,
                            with_favicon: bool=False
                            ) -> CaptureResponse:
@@ -466,7 +468,7 @@ class Capture():
         self.wait_for_download = 0
         # We may have multiple download triggered via JS
-        multiple_downloads: List[Tuple[str, bytes]] = []
+        multiple_downloads: list[tuple[str, bytes]] = []
         async def handle_download(download: Download) -> None:
             # This method is called when a download event is triggered from JS in a page that also renders
@@ -752,7 +754,7 @@ class Capture():
             # Network never idle, keep going
             self.__network_not_idle += 1
-    async def _failsafe_get_content(self, page: Page) -> Optional[str]:
+    async def _failsafe_get_content(self, page: Page) -> str | None:
         ''' The page might be changing for all kind of reason (generally a JS timeout).
         In that case, we try a few times to get the HTML.'''
         tries = 3
@@ -770,8 +772,8 @@ class Capture():
         self.logger.warning('Unable to get page content.')
         return None
-    def _get_links_from_rendered_page(self, rendered_url: str, rendered_html: str, rendered_hostname_only: bool) -> List[str]:
-        def _sanitize(maybe_url: str) -> Optional[str]:
+    def _get_links_from_rendered_page(self, rendered_url: str, rendered_html: str, rendered_hostname_only: bool) -> list[str]:
+        def _sanitize(maybe_url: str) -> str | None:
             href = strip_html5_whitespace(maybe_url)
             href = safe_url_string(href)
@@ -783,7 +785,7 @@ class Capture():
                 return None
             return href
-        urls: Set[str] = set()
+        urls: set[str] = set()
         soup = BeautifulSoup(rendered_html, "lxml")
         rendered_hostname = urlparse(rendered_url).hostname
@@ -924,9 +926,9 @@ class Capture():
             return True
         return False
-    def make_frame_tree(self, frame: Frame) -> Dict[str, List[Dict[str, Any]]]:
+    def make_frame_tree(self, frame: Frame) -> dict[str, list[dict[str, Any]]]:
         # TODO: not used at this time, need to figure out how do use that.
-        to_return: Dict[str, List[Dict[str, Any]]] = {frame._impl_obj._guid: []}
+        to_return: dict[str, list[dict[str, Any]]] = {frame._impl_obj._guid: []}
         for child in frame.child_frames:
             to_return[frame._impl_obj._guid].append(self.make_frame_tree(child))
         return to_return
@@ -934,7 +936,7 @@ class Capture():
     # #### Manual favicon extractor, will be removed if/when Playwright supports getting the favicon.
     # Method copied from HAR2Tree
-    def __parse_data_uri(self, uri: str) -> Optional[Tuple[str, str, bytes]]:
+    def __parse_data_uri(self, uri: str) -> tuple[str, str, bytes] | None:
         if not uri.startswith('data:'):
             return None
         uri = uri[5:]
@@ -973,7 +975,7 @@ class Capture():
             mimeparams = ''
         return mime, mimeparams, data
-    def __extract_favicons(self, rendered_content: Union[str, bytes]) -> Optional[Tuple[Set[str], Set[bytes]]]:
+    def __extract_favicons(self, rendered_content: str | bytes) -> tuple[set[str], set[bytes]] | None:
         if isinstance(rendered_content, bytes):
             rendered_content = str(from_bytes(rendered_content).best())
             if not rendered_content:
@@ -1018,7 +1020,7 @@ class Capture():
         # print(favicons_urls)
         return favicons_urls, favicons
-    def get_favicons(self, rendered_url: str, rendered_content: str) -> Set[bytes]:
+    def get_favicons(self, rendered_url: str, rendered_content: str) -> set[bytes]:
         """This method will be deprecated as soon as Playwright will be able to fetch favicons (https://github.com/microsoft/playwright/issues/7493).
         In the meantime, we try to get all the potential ones in this method.
         Method inspired by https://github.com/ail-project/ail-framework/blob/master/bin/lib/crawlers.py

{playwrightcapture-1.22.5 → playwrightcapture-1.22.7}/playwrightcapture/helpers.py RENAMED Viewed

@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
+from __future__ import annotations
 from collections import defaultdict
-from typing import TypedDict, Dict
+from typing import TypedDict
 from playwright.sync_api import sync_playwright
@@ -11,17 +13,17 @@ from .exceptions import UnknownPlaywrightDeviceType
 class PlaywrightDevice(TypedDict):
     user_agent: str
-    viewport: Dict[str, int]
+    viewport: dict[str, int]
     device_scale_factor: int
     is_mobile: bool
     has_touch: bool
     default_browser_type: str
-def get_devices(in_testsuite: bool=False) -> Dict[str, Dict[str, Dict[str, PlaywrightDevice]]]:
-    to_return: Dict[str, Dict[str, Dict[str, PlaywrightDevice]]] = {'desktop': defaultdict(dict), 'mobile': defaultdict(dict)}
+def get_devices(in_testsuite: bool=False) -> dict[str, dict[str, dict[str, PlaywrightDevice]]]:
+    to_return: dict[str, dict[str, dict[str, PlaywrightDevice]]] = {'desktop': defaultdict(dict), 'mobile': defaultdict(dict)}
     playwright = sync_playwright().start()
-    devices: Dict[str, PlaywrightDevice] = playwright.devices
+    devices: dict[str, PlaywrightDevice] = playwright.devices
     playwright.stop()
     for device_name, settings in devices.items():
         splitted_name = device_name.split(' ')

{playwrightcapture-1.22.5 → playwrightcapture-1.22.7}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "PlaywrightCapture"
-version = "1.22.5"
+version = "1.22.7"
 description = "A simple library to capture websites using playwright"
 authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
 license = "BSD-3-Clause"
@@ -19,15 +19,15 @@ classifiers=[
 [tool.poetry.dependencies]
 python = "^3.8"
-playwright = "^1.40.0"
+playwright = "^1.41.0"
 dateparser = "^1.2.0"
-beautifulsoup4 = {version= "^4.12.2", extras = ["lxml"]}
+beautifulsoup4 = {version= "^4.12.3", extras = ["lxml", "charset_normalizer"]}
 w3lib = "^2.1.2"
 requests = {extras = ["socks"], version = "^2.31.0"}
 pydub = {version = "^0.25.1", optional = true}
 SpeechRecognition = {version = "^3.10.1", optional = true}
 pytz = {"version" = "^2023.3.post1", python = "<3.9"}
-tzdata = "^2023.3"
+tzdata = "^2023.4"
 playwright-stealth = "^1.0.6"
 setuptools = "^69.0.3"
@@ -38,34 +38,14 @@ recaptcha = ["requests", "pydub", "SpeechRecognition"]
 optional = true
 [tool.poetry.group.dev.dependencies]
-types-beautifulsoup4 = "^4.12.0.7"
-pytest = "^7.4.3"
+types-beautifulsoup4 = "^4.12.0.20240106"
+pytest = "^7.4.4"
 mypy = "^1.8.0"
-types-dateparser = "^1.1.4.10"
-types-requests = "^2.31.0.10"
+types-dateparser = "^1.1.4.20240106"
+types-requests = "^2.31.0.20240106"
 types-pytz = "^2023.3.1.1"
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
-[tool.mypy]
-disallow_untyped_calls = true
-disallow_untyped_defs = true
-disallow_incomplete_defs = true
-check_untyped_defs = true
-disallow_any_generics = true
-python_version = 3.8
-ignore_errors = false
-ignore_missing_imports = false
-strict_optional = true
-no_implicit_optional = true
-warn_return_any = true
-warn_unused_ignores = true
-warn_redundant_casts = true
-warn_unused_configs = true
-warn_unreachable = true
-show_error_context = true
-pretty = true

{playwrightcapture-1.22.5 → playwrightcapture-1.22.7}/LICENSE RENAMED Viewed

File without changes

{playwrightcapture-1.22.5 → playwrightcapture-1.22.7}/README.md RENAMED Viewed

File without changes

{playwrightcapture-1.22.5 → playwrightcapture-1.22.7}/playwrightcapture/exceptions.py RENAMED Viewed

File without changes

{playwrightcapture-1.22.5 → playwrightcapture-1.22.7}/playwrightcapture/py.typed RENAMED Viewed

File without changes

PlaywrightCapture 1.22.5__tar.gz → 1.22.7__tar.gz

PlaywrightCapture 1.22.5__tar.gz → 1.22.7__tar.gz