PlaywrightCapture 1.22.5__tar.gz → 1.22.7__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.22.5
3
+ Version: 1.22.7
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -21,15 +21,15 @@ Classifier: Topic :: Internet
21
21
  Classifier: Topic :: Security
22
22
  Provides-Extra: recaptcha
23
23
  Requires-Dist: SpeechRecognition (>=3.10.1,<4.0.0) ; extra == "recaptcha"
24
- Requires-Dist: beautifulsoup4[lxml] (>=4.12.2,<5.0.0)
24
+ Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
25
25
  Requires-Dist: dateparser (>=1.2.0,<2.0.0)
26
- Requires-Dist: playwright (>=1.40.0,<2.0.0)
26
+ Requires-Dist: playwright (>=1.41.0,<2.0.0)
27
27
  Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
28
28
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
29
29
  Requires-Dist: pytz (>=2023.3.post1,<2024.0) ; python_version < "3.9"
30
30
  Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
31
31
  Requires-Dist: setuptools (>=69.0.3,<70.0.0)
32
- Requires-Dist: tzdata (>=2023.3,<2024.0)
32
+ Requires-Dist: tzdata (>=2023.4,<2024.0)
33
33
  Requires-Dist: w3lib (>=2.1.2,<3.0.0)
34
34
  Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
35
35
  Description-Content-Type: text/markdown
@@ -3,3 +3,13 @@ from .helpers import get_devices # noqa
3
3
  from .exceptions import (PlaywrightCaptureException, UnknownPlaywrightDeviceType, # noqa
4
4
  UnknownPlaywrightBrowser, UnknownPlaywrightDevice,
5
5
  InvalidPlaywrightParameter)
6
+
7
+ __all__ = [
8
+ 'Capture',
9
+ 'get_devices',
10
+ 'PlaywrightCaptureException',
11
+ 'UnknownPlaywrightDeviceType',
12
+ 'UnknownPlaywrightBrowser',
13
+ 'UnknownPlaywrightDevice',
14
+ 'InvalidPlaywrightParameter'
15
+ ]
@@ -1,5 +1,7 @@
1
1
  #!/usr/bin/env python3
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import asyncio
4
6
  import binascii
5
7
  import json
@@ -13,7 +15,7 @@ import time
13
15
  from base64 import b64decode
14
16
  from io import BytesIO
15
17
  from tempfile import NamedTemporaryFile
16
- from typing import Optional, Dict, List, Union, Any, TypedDict, Literal, TYPE_CHECKING, Set, Tuple
18
+ from typing import Any, TypedDict, Literal, TYPE_CHECKING
17
19
  from urllib.parse import urlparse, unquote, urljoin
18
20
  from zipfile import ZipFile
19
21
 
@@ -54,32 +56,32 @@ except ImportError:
54
56
  class CaptureResponse(TypedDict, total=False):
55
57
 
56
58
  last_redirected_url: str
57
- har: Optional[Dict[str, Any]]
58
- cookies: Optional[List['Cookie']]
59
- error: Optional[str]
60
- error_name: Optional[str]
61
- html: Optional[str]
62
- png: Optional[bytes]
63
- downloaded_filename: Optional[str]
64
- downloaded_file: Optional[bytes]
65
- children: Optional[List['CaptureResponse']]
59
+ har: dict[str, Any] | None
60
+ cookies: list[Cookie] | None
61
+ error: str | None
62
+ error_name: str | None
63
+ html: str | None
64
+ png: bytes | None
65
+ downloaded_filename: str | None
66
+ downloaded_file: bytes | None
67
+ children: list[CaptureResponse] | None
66
68
 
67
69
  # One day, playwright will support getting the favicon from the capture itself
68
70
  # favicon: Optional[bytes]
69
71
  # in the meantime, we need a workaround: https://github.com/Lookyloo/PlaywrightCapture/issues/45
70
- potential_favicons: Optional[Set[bytes]]
72
+ potential_favicons: set[bytes] | None
71
73
 
72
74
 
73
75
  class Capture():
74
76
 
75
- _browsers: List['BROWSER'] = ['chromium', 'firefox', 'webkit']
76
- _default_viewport: 'ViewportSize' = {'width': 1920, 'height': 1080}
77
+ _browsers: list[BROWSER] = ['chromium', 'firefox', 'webkit']
78
+ _default_viewport: ViewportSize = {'width': 1920, 'height': 1080}
77
79
  _default_timeout: int = 90 # set to 90s by default
78
80
  _minimal_timeout: int = 15 # set to 15s - It makes little sense to attempt a capture below that limit.
79
81
 
80
- def __init__(self, browser: Optional['BROWSER']=None, device_name: Optional[str]=None,
81
- proxy: Optional[Union[str, Dict[str, str]]]=None,
82
- general_timeout_in_sec: Optional[int] = None, loglevel: str='INFO'):
82
+ def __init__(self, browser: BROWSER | None=None, device_name: str | None=None,
83
+ proxy: str | dict[str, str] | None=None,
84
+ general_timeout_in_sec: int | None = None, loglevel: str='INFO'):
83
85
  """Captures a page with Playwright.
84
86
 
85
87
  :param browser: The browser to use for the capture.
@@ -100,8 +102,8 @@ class Capture():
100
102
  self.logger.warning(f'Timeout given: {general_timeout_in_sec}s. Ignoring that as it makes little sense to attempt to capture a page in less than {self._minimal_timeout}s.')
101
103
  self._capture_timeout = self._minimal_timeout
102
104
 
103
- self.device_name: Optional[str] = device_name
104
- self.proxy: 'ProxySettings' = {}
105
+ self.device_name: str | None = device_name
106
+ self.proxy: ProxySettings = {}
105
107
  if proxy:
106
108
  if isinstance(proxy, str):
107
109
  self.proxy = {'server': proxy}
@@ -112,11 +114,11 @@ class Capture():
112
114
 
113
115
  self.should_retry: bool = False
114
116
  self.__network_not_idle: int = 1
115
- self._cookies: List['SetCookieParam'] = []
116
- self._http_credentials: 'HttpCredentials' = {}
117
- self._geolocation: 'Geolocation' = {}
118
- self._headers: 'Headers' = {}
119
- self._viewport: Optional['ViewportSize'] = None
117
+ self._cookies: list[SetCookieParam] = []
118
+ self._http_credentials: HttpCredentials = {}
119
+ self._geolocation: Geolocation = {}
120
+ self._headers: Headers = {}
121
+ self._viewport: ViewportSize | None = None
120
122
  self._user_agent: str = ''
121
123
  self._timezone_id: str = ''
122
124
  self._locale: str = ''
@@ -162,7 +164,7 @@ class Capture():
162
164
  return self._locale
163
165
 
164
166
  @locale.setter
165
- def locale(self, locale: Optional[str]) -> None:
167
+ def locale(self, locale: str | None) -> None:
166
168
  if locale:
167
169
  self._locale = locale
168
170
 
@@ -171,7 +173,7 @@ class Capture():
171
173
  return self._timezone_id
172
174
 
173
175
  @timezone_id.setter
174
- def timezone_id(self, timezone_id: Optional[str]) -> None:
176
+ def timezone_id(self, timezone_id: str | None) -> None:
175
177
  if not timezone_id:
176
178
  return
177
179
  if timezone_id in all_timezones_set:
@@ -180,11 +182,11 @@ class Capture():
180
182
  raise InvalidPlaywrightParameter(f'The Timezone ID provided ({timezone_id}) is invalid.')
181
183
 
182
184
  @property
183
- def http_credentials(self) -> 'HttpCredentials':
185
+ def http_credentials(self) -> HttpCredentials:
184
186
  return self._http_credentials
185
187
 
186
188
  @http_credentials.setter
187
- def http_credentials(self, credentials: Optional[Dict[str, str]]) -> None:
189
+ def http_credentials(self, credentials: dict[str, str] | None) -> None:
188
190
  if not credentials:
189
191
  return
190
192
  if 'username' in credentials and 'password' in credentials:
@@ -195,15 +197,15 @@ class Capture():
195
197
  else:
196
198
  raise InvalidPlaywrightParameter(f'At least a username and a password are required in the credentials: {credentials}')
197
199
 
198
- def set_http_credentials(self, username: str, password: str, origin: Optional[str]=None) -> None:
200
+ def set_http_credentials(self, username: str, password: str, origin: str | None=None) -> None:
199
201
  self._http_credentials = {'username': username, 'password': password, 'origin': origin}
200
202
 
201
203
  @property
202
- def geolocation(self) -> 'Geolocation':
204
+ def geolocation(self) -> Geolocation:
203
205
  return self._geolocation
204
206
 
205
207
  @geolocation.setter
206
- def geolocation(self, geolocation: Optional[Dict[str, Union[str, int, float]]]) -> None:
208
+ def geolocation(self, geolocation: dict[str, str | int | float] | None) -> None:
207
209
  if not geolocation:
208
210
  return
209
211
  if 'latitude' in geolocation and 'longitude' in geolocation:
@@ -215,18 +217,18 @@ class Capture():
215
217
  raise InvalidPlaywrightParameter(f'At least a latitude and a longitude are required in the geolocation: {geolocation}')
216
218
 
217
219
  @property
218
- def cookies(self) -> List['SetCookieParam']:
220
+ def cookies(self) -> list[SetCookieParam]:
219
221
  return self._cookies
220
222
 
221
223
  @cookies.setter
222
- def cookies(self, cookies: Optional[List[Dict[str, Any]]]) -> None:
224
+ def cookies(self, cookies: list[dict[str, Any]] | None) -> None:
223
225
  '''Cookies to send along to the initial request.
224
226
  :param cookies: The cookies, in this format: https://playwright.dev/python/docs/api/class-browsercontext#browser-context-add-cookies
225
227
  '''
226
228
  if not cookies:
227
229
  return
228
230
  for cookie in cookies:
229
- c: 'SetCookieParam' = {
231
+ c: SetCookieParam = {
230
232
  'name': cookie['name'],
231
233
  'value': cookie['value'],
232
234
  }
@@ -266,15 +268,15 @@ class Capture():
266
268
  self.logger.warning(f'The cookie must have a URL ({url}) or a domain ({domain}) and a path ({path})')
267
269
 
268
270
  @property
269
- def headers(self) -> 'Headers':
271
+ def headers(self) -> Headers:
270
272
  return self._headers
271
273
 
272
274
  @headers.setter
273
- def headers(self, headers: Optional[Union[str, Dict[str, str]]]) -> None:
275
+ def headers(self, headers: str | dict[str, str] | None) -> None:
274
276
  if not headers:
275
277
  return
276
278
  if isinstance(headers, str):
277
- new_headers: Dict[str, str] = {}
279
+ new_headers: dict[str, str] = {}
278
280
  for header_line in headers.splitlines():
279
281
  if header_line and ':' in header_line:
280
282
  splitted = header_line.split(':', 1)
@@ -290,7 +292,7 @@ class Capture():
290
292
  else:
291
293
  # This shouldn't happen, but we also cannot ensure the calls leading to this are following the specs,
292
294
  # and playwright dislikes invalid HTTP headers so we rather drop them.
293
- self.logger.info(f'Wrong type of headers ({type(headers)}): {headers}') # type: ignore[unreachable]
295
+ self.logger.info(f'Wrong type of headers ({type(headers)}): {headers}')
294
296
  return
295
297
 
296
298
  # Validate the new headers, only a subset of characters are accepted
@@ -305,11 +307,11 @@ class Capture():
305
307
  self._headers[name] = value
306
308
 
307
309
  @property
308
- def viewport(self) -> Optional['ViewportSize']:
310
+ def viewport(self) -> ViewportSize | None:
309
311
  return self._viewport
310
312
 
311
313
  @viewport.setter
312
- def viewport(self, viewport: Optional[Dict[str, Union[str, int]]]) -> None:
314
+ def viewport(self, viewport: dict[str, str | int] | None) -> None:
313
315
  if not viewport:
314
316
  return
315
317
  if 'width' in viewport and 'height' in viewport:
@@ -322,7 +324,7 @@ class Capture():
322
324
  return self._user_agent
323
325
 
324
326
  @user_agent.setter
325
- def user_agent(self, user_agent: Optional[str]) -> None:
327
+ def user_agent(self, user_agent: str | None) -> None:
326
328
  if user_agent is not None:
327
329
  self._user_agent = user_agent
328
330
 
@@ -331,7 +333,7 @@ class Capture():
331
333
  return self._color_scheme
332
334
 
333
335
  @color_scheme.setter
334
- def color_scheme(self, color_scheme: Optional[str]) -> None:
336
+ def color_scheme(self, color_scheme: str | None) -> None:
335
337
  if not color_scheme:
336
338
  return
337
339
  schemes = ['light', 'dark', 'no-preference']
@@ -377,7 +379,7 @@ class Capture():
377
379
  default_context_settings.pop('is_mobile')
378
380
 
379
381
  # FIXME: video for debug
380
- default_context_settings['record_video_dir'] = './videos/'
382
+ # default_context_settings['record_video_dir'] = './videos/'
381
383
 
382
384
  self.context = await self.browser.new_context(**default_context_settings) # type: ignore
383
385
  self.context.set_default_timeout(self._capture_timeout * 1000)
@@ -453,8 +455,8 @@ class Capture():
453
455
  self.logger.info(f'Unable to find Cloudflare locator: {e}')
454
456
 
455
457
  async def capture_page(self, url: str, *, max_depth_capture_time: int,
456
- referer: Optional[str]=None,
457
- page: Optional[Page]=None, depth: int=0,
458
+ referer: str | None=None,
459
+ page: Page | None=None, depth: int=0,
458
460
  rendered_hostname_only: bool=True,
459
461
  with_favicon: bool=False
460
462
  ) -> CaptureResponse:
@@ -466,7 +468,7 @@ class Capture():
466
468
  self.wait_for_download = 0
467
469
 
468
470
  # We may have multiple download triggered via JS
469
- multiple_downloads: List[Tuple[str, bytes]] = []
471
+ multiple_downloads: list[tuple[str, bytes]] = []
470
472
 
471
473
  async def handle_download(download: Download) -> None:
472
474
  # This method is called when a download event is triggered from JS in a page that also renders
@@ -752,7 +754,7 @@ class Capture():
752
754
  # Network never idle, keep going
753
755
  self.__network_not_idle += 1
754
756
 
755
- async def _failsafe_get_content(self, page: Page) -> Optional[str]:
757
+ async def _failsafe_get_content(self, page: Page) -> str | None:
756
758
  ''' The page might be changing for all kind of reason (generally a JS timeout).
757
759
  In that case, we try a few times to get the HTML.'''
758
760
  tries = 3
@@ -770,8 +772,8 @@ class Capture():
770
772
  self.logger.warning('Unable to get page content.')
771
773
  return None
772
774
 
773
- def _get_links_from_rendered_page(self, rendered_url: str, rendered_html: str, rendered_hostname_only: bool) -> List[str]:
774
- def _sanitize(maybe_url: str) -> Optional[str]:
775
+ def _get_links_from_rendered_page(self, rendered_url: str, rendered_html: str, rendered_hostname_only: bool) -> list[str]:
776
+ def _sanitize(maybe_url: str) -> str | None:
775
777
  href = strip_html5_whitespace(maybe_url)
776
778
  href = safe_url_string(href)
777
779
 
@@ -783,7 +785,7 @@ class Capture():
783
785
  return None
784
786
  return href
785
787
 
786
- urls: Set[str] = set()
788
+ urls: set[str] = set()
787
789
  soup = BeautifulSoup(rendered_html, "lxml")
788
790
 
789
791
  rendered_hostname = urlparse(rendered_url).hostname
@@ -924,9 +926,9 @@ class Capture():
924
926
  return True
925
927
  return False
926
928
 
927
- def make_frame_tree(self, frame: Frame) -> Dict[str, List[Dict[str, Any]]]:
929
+ def make_frame_tree(self, frame: Frame) -> dict[str, list[dict[str, Any]]]:
928
930
  # TODO: not used at this time, need to figure out how do use that.
929
- to_return: Dict[str, List[Dict[str, Any]]] = {frame._impl_obj._guid: []}
931
+ to_return: dict[str, list[dict[str, Any]]] = {frame._impl_obj._guid: []}
930
932
  for child in frame.child_frames:
931
933
  to_return[frame._impl_obj._guid].append(self.make_frame_tree(child))
932
934
  return to_return
@@ -934,7 +936,7 @@ class Capture():
934
936
  # #### Manual favicon extractor, will be removed if/when Playwright supports getting the favicon.
935
937
 
936
938
  # Method copied from HAR2Tree
937
- def __parse_data_uri(self, uri: str) -> Optional[Tuple[str, str, bytes]]:
939
+ def __parse_data_uri(self, uri: str) -> tuple[str, str, bytes] | None:
938
940
  if not uri.startswith('data:'):
939
941
  return None
940
942
  uri = uri[5:]
@@ -973,7 +975,7 @@ class Capture():
973
975
  mimeparams = ''
974
976
  return mime, mimeparams, data
975
977
 
976
- def __extract_favicons(self, rendered_content: Union[str, bytes]) -> Optional[Tuple[Set[str], Set[bytes]]]:
978
+ def __extract_favicons(self, rendered_content: str | bytes) -> tuple[set[str], set[bytes]] | None:
977
979
  if isinstance(rendered_content, bytes):
978
980
  rendered_content = str(from_bytes(rendered_content).best())
979
981
  if not rendered_content:
@@ -1018,7 +1020,7 @@ class Capture():
1018
1020
  # print(favicons_urls)
1019
1021
  return favicons_urls, favicons
1020
1022
 
1021
- def get_favicons(self, rendered_url: str, rendered_content: str) -> Set[bytes]:
1023
+ def get_favicons(self, rendered_url: str, rendered_content: str) -> set[bytes]:
1022
1024
  """This method will be deprecated as soon as Playwright will be able to fetch favicons (https://github.com/microsoft/playwright/issues/7493).
1023
1025
  In the meantime, we try to get all the potential ones in this method.
1024
1026
  Method inspired by https://github.com/ail-project/ail-framework/blob/master/bin/lib/crawlers.py
@@ -1,7 +1,9 @@
1
1
  #!/usr/bin/env python3
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  from collections import defaultdict
4
- from typing import TypedDict, Dict
6
+ from typing import TypedDict
5
7
 
6
8
  from playwright.sync_api import sync_playwright
7
9
 
@@ -11,17 +13,17 @@ from .exceptions import UnknownPlaywrightDeviceType
11
13
  class PlaywrightDevice(TypedDict):
12
14
 
13
15
  user_agent: str
14
- viewport: Dict[str, int]
16
+ viewport: dict[str, int]
15
17
  device_scale_factor: int
16
18
  is_mobile: bool
17
19
  has_touch: bool
18
20
  default_browser_type: str
19
21
 
20
22
 
21
- def get_devices(in_testsuite: bool=False) -> Dict[str, Dict[str, Dict[str, PlaywrightDevice]]]:
22
- to_return: Dict[str, Dict[str, Dict[str, PlaywrightDevice]]] = {'desktop': defaultdict(dict), 'mobile': defaultdict(dict)}
23
+ def get_devices(in_testsuite: bool=False) -> dict[str, dict[str, dict[str, PlaywrightDevice]]]:
24
+ to_return: dict[str, dict[str, dict[str, PlaywrightDevice]]] = {'desktop': defaultdict(dict), 'mobile': defaultdict(dict)}
23
25
  playwright = sync_playwright().start()
24
- devices: Dict[str, PlaywrightDevice] = playwright.devices
26
+ devices: dict[str, PlaywrightDevice] = playwright.devices
25
27
  playwright.stop()
26
28
  for device_name, settings in devices.items():
27
29
  splitted_name = device_name.split(' ')
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.22.5"
3
+ version = "1.22.7"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -19,15 +19,15 @@ classifiers=[
19
19
 
20
20
  [tool.poetry.dependencies]
21
21
  python = "^3.8"
22
- playwright = "^1.40.0"
22
+ playwright = "^1.41.0"
23
23
  dateparser = "^1.2.0"
24
- beautifulsoup4 = {version= "^4.12.2", extras = ["lxml"]}
24
+ beautifulsoup4 = {version= "^4.12.3", extras = ["lxml", "charset_normalizer"]}
25
25
  w3lib = "^2.1.2"
26
26
  requests = {extras = ["socks"], version = "^2.31.0"}
27
27
  pydub = {version = "^0.25.1", optional = true}
28
28
  SpeechRecognition = {version = "^3.10.1", optional = true}
29
29
  pytz = {"version" = "^2023.3.post1", python = "<3.9"}
30
- tzdata = "^2023.3"
30
+ tzdata = "^2023.4"
31
31
  playwright-stealth = "^1.0.6"
32
32
  setuptools = "^69.0.3"
33
33
 
@@ -38,34 +38,14 @@ recaptcha = ["requests", "pydub", "SpeechRecognition"]
38
38
  optional = true
39
39
 
40
40
  [tool.poetry.group.dev.dependencies]
41
- types-beautifulsoup4 = "^4.12.0.7"
42
- pytest = "^7.4.3"
41
+ types-beautifulsoup4 = "^4.12.0.20240106"
42
+ pytest = "^7.4.4"
43
43
  mypy = "^1.8.0"
44
- types-dateparser = "^1.1.4.10"
45
- types-requests = "^2.31.0.10"
44
+ types-dateparser = "^1.1.4.20240106"
45
+ types-requests = "^2.31.0.20240106"
46
46
  types-pytz = "^2023.3.1.1"
47
47
 
48
48
 
49
49
  [build-system]
50
50
  requires = ["poetry-core"]
51
51
  build-backend = "poetry.core.masonry.api"
52
-
53
- [tool.mypy]
54
- disallow_untyped_calls = true
55
- disallow_untyped_defs = true
56
- disallow_incomplete_defs = true
57
- check_untyped_defs = true
58
- disallow_any_generics = true
59
- python_version = 3.8
60
- ignore_errors = false
61
- ignore_missing_imports = false
62
- strict_optional = true
63
- no_implicit_optional = true
64
- warn_return_any = true
65
- warn_unused_ignores = true
66
- warn_redundant_casts = true
67
- warn_unused_configs = true
68
- warn_unreachable = true
69
-
70
- show_error_context = true
71
- pretty = true