PlaywrightCapture 1.25.8__py3-none-any.whl → 1.25.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,7 +18,7 @@ from io import BytesIO
18
18
  from logging import LoggerAdapter, Logger
19
19
  from tempfile import NamedTemporaryFile
20
20
  from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping, Generator
21
- from urllib.parse import urlparse, unquote, urljoin
21
+ from urllib.parse import urlparse, unquote, urljoin, urlsplit, urlunsplit
22
22
  from zipfile import ZipFile
23
23
 
24
24
  import aiohttp
@@ -96,28 +96,40 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
96
96
  return msg, kwargs
97
97
 
98
98
 
99
+ # good test pages:
100
+ # https://bot.incolumitas.com/
101
+ # https://fingerprintjs.github.io/BotD/main/
102
+
99
103
  @dataclass
100
104
  class PCStealthConfig(StealthConfig): # type: ignore[misc]
101
105
 
102
106
  @property
103
107
  def enabled_scripts(self) -> Generator[str, None, None]:
104
- self.webdriver = True
105
- self.webgl_vendor = True
106
108
  self.chrome_app = True
107
109
  self.chrome_csi = True
108
- self.chrome_load_times = True
109
110
  self.chrome_runtime = True
111
+ self.chrome_load_times = True
112
+ self.navigator_plugins = True
113
+ self.hairline = True
110
114
  self.iframe_content_window = True
111
115
  self.media_codecs = True
116
+
117
+ # permissions are handled directly in playwright
118
+ self.navigator_permissions = False
119
+ # Platform is correct now
120
+ self.navigator_platform = False
121
+ # probably useless, but it will fall back to 4 regardless
112
122
  self.navigator_hardware_concurrency = 4
123
+ # Webgl vendor is correct now
124
+ self.webgl_vendor = False
125
+ # Set by the viewport
126
+ self.outerdimensions = False
127
+
128
+ # Not working with Playwright 1.45+
113
129
  self.navigator_languages = False # Causes issue
114
- self.navigator_permissions = True
115
- self.navigator_platform = True
116
- self.navigator_plugins = True
117
130
  self.navigator_user_agent = False # Causes issues
118
131
  self.navigator_vendor = False # Causes issues
119
- self.outerdimensions = True
120
- self.hairline = True
132
+
121
133
  yield from super().enabled_scripts
122
134
 
123
135
 
@@ -164,7 +176,7 @@ class Capture():
164
176
  self.proxy: ProxySettings = {}
165
177
  if proxy:
166
178
  if isinstance(proxy, str):
167
- self.proxy = {'server': proxy}
179
+ self.proxy = self.__prepare_proxy_playwright(proxy)
168
180
  elif isinstance(proxy, dict):
169
181
  self.proxy = {'server': proxy['server'], 'bypass': proxy.get('bypass', ''),
170
182
  'username': proxy.get('username', ''),
@@ -184,9 +196,22 @@ class Capture():
184
196
  self._viewport: ViewportSize | None = None
185
197
  self._user_agent: str = ''
186
198
  self._timezone_id: str = ''
187
- self._locale: str = ''
199
+ self._locale: str = 'en-US'
188
200
  self._color_scheme: Literal['dark', 'light', 'no-preference', 'null'] | None = None
189
201
 
202
def __prepare_proxy_playwright(self, proxy: str) -> ProxySettings:
    '''Turn a proxy URL into the settings mapping used for the browser.

    When the URL embeds both a username and a password, they are moved to
    dedicated ``username`` / ``password`` keys and ``server`` is rebuilt
    without the userinfo part. Otherwise the URL is returned unchanged
    under the ``server`` key.

    :param proxy: proxy URL, e.g. ``socks5://user:pass@host:1080``
    :return: mapping with ``server`` and, if the URL carried them, the credentials
    '''
    splitted = urlsplit(proxy)
    if not (splitted.username and splitted.password):
        return {'server': proxy}

    # Strip only the userinfo from the netloc instead of re-assembling it
    # from hostname/port: a URL without an explicit port would otherwise
    # become "host:None", and an IPv6 literal would lose its brackets.
    host_and_port = splitted.netloc.rpartition('@')[2]
    # NOTE(review): credentials are passed through exactly as written in the
    # URL (no percent-decoding) -- confirm this is what the consumer expects.
    return {'username': splitted.username, 'password': splitted.password,
            'server': urlunsplit((splitted.scheme, host_and_port, splitted.path,
                                  splitted.query, splitted.fragment))}
208
+
209
def __prepare_proxy_aiohttp(self, proxy: ProxySettings) -> str:
    '''Build the single proxy URL handed to the aiohttp proxy connector.

    The credentials stored under ``username`` / ``password`` are injected
    into the netloc of ``proxy['server']``. If no username is set, the
    server URL is returned as is.

    :param proxy: mapping with a ``server`` URL and optional credentials
    :return: proxy URL, with ``user:password@`` prepended to the host when
             a username is available
    '''
    username = proxy.get('username')
    if not username:
        # The keys may be present but empty (the constructor fills missing
        # credentials with ''); testing presence only would yield an
        # invalid "scheme://:@host" URL.
        return proxy['server']

    password = proxy.get('password') or ''
    splitted = urlsplit(proxy['server'])
    # Keep only host[:port] so that a server URL already carrying userinfo
    # does not end up with two "@" sections.
    host_and_port = splitted.netloc.rpartition('@')[2]
    return urlunsplit((splitted.scheme, f'{username}:{password}@{host_and_port}',
                       splitted.path, splitted.query, splitted.fragment))
214
+
190
215
  async def __aenter__(self) -> Capture:
191
216
  '''Launch the browser'''
192
217
  self._temp_harfile = NamedTemporaryFile(delete=False)
@@ -449,21 +474,22 @@ class Capture():
449
474
  # NOTE: Which perms are supported by which browsers varies
450
475
  # See https://github.com/microsoft/playwright/issues/16577
451
476
  chromium_permissions = [
452
- 'geolocation',
453
- 'midi',
454
- 'midi-sysex',
455
- 'notifications',
456
- 'camera',
457
- 'microphone',
458
- 'background-sync',
459
- 'ambient-light-sensor',
460
477
  'accelerometer',
461
- 'gyroscope',
462
- 'magnetometer',
463
478
  'accessibility-events',
479
+ 'ambient-light-sensor',
480
+ 'background-sync',
481
+ 'camera',
464
482
  'clipboard-read',
465
483
  'clipboard-write',
466
- 'payment-handler'
484
+ 'geolocation',
485
+ 'gyroscope',
486
+ 'magnetometer',
487
+ 'microphone',
488
+ 'midi-sysex',
489
+ 'midi',
490
+ 'notifications',
491
+ 'payment-handler',
492
+ 'storage-access'
467
493
  ]
468
494
 
469
495
  firefox_permissions = ['geolocation', 'notifications']
@@ -707,7 +733,6 @@ class Capture():
707
733
  await self.__dialog_clickthrough(page)
708
734
 
709
735
  await stealth_async(page, PCStealthConfig())
710
- # await stealth_async(page)
711
736
 
712
737
  page.set_default_timeout((self._capture_timeout - 2) * 1000)
713
738
  # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
@@ -786,6 +811,8 @@ class Capture():
786
811
  self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
787
812
  except Error as e:
788
813
  self.logger.warning(f'Error while resolving captcha on {url}: {e}')
814
+ except (TimeoutError, asyncio.TimeoutError) as e:
815
+ self.logger.warning(f'[Timeout] Error while resolving captcha on {url}: {e}')
789
816
  except Exception as e:
790
817
  self.logger.exception(f'General error with captcha solving on {url}: {e}')
791
818
  # ======
@@ -892,8 +919,13 @@ class Capture():
892
919
  to_return['html'] = content
893
920
 
894
921
  if 'html' in to_return and to_return['html'] is not None and with_favicon:
895
- to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html'])
896
- got_favicons = True
922
+ try:
923
+ to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html'])
924
+ got_favicons = True
925
+ except (TimeoutError, asyncio.TimeoutError) as e:
926
+ self.logger.warning(f'[Timeout] Unable to get favicons: {e}')
927
+ except Exception as e:
928
+ self.logger.warning(f'Unable to get favicons: {e}')
897
929
 
898
930
  to_return['last_redirected_url'] = page.url
899
931
  to_return['png'] = await self._failsafe_get_screenshot(page)
@@ -1196,7 +1228,8 @@ class Capture():
1196
1228
  if self.proxy and self.proxy.get('server'):
1197
1229
  connector = ProxyConnector.from_url(self.proxy['server'])
1198
1230
 
1199
- async with aiohttp.ClientSession(connector=connector) as session:
1231
+ timeout = aiohttp.ClientTimeout(total=10)
1232
+ async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
1200
1233
  while True:
1201
1234
  try:
1202
1235
  href = await main_frame.get_by_role("link", name="Alternatively, download audio as MP3").get_attribute("href")
@@ -1206,7 +1239,7 @@ class Capture():
1206
1239
  if not href:
1207
1240
  self.logger.warning('Unable to find download link for captcha.')
1208
1241
  return False
1209
- async with session.get(href, timeout=10, ssl=False) as response:
1242
+ async with session.get(href, ssl=False) as response:
1210
1243
  response.raise_for_status()
1211
1244
  mp3_content = await response.read()
1212
1245
  with NamedTemporaryFile() as mp3_file, NamedTemporaryFile() as wav_file:
@@ -1395,16 +1428,17 @@ class Capture():
1395
1428
  Method inspired by https://github.com/ail-project/ail-framework/blob/master/bin/lib/crawlers.py
1396
1429
  """
1397
1430
  connector = None
1398
- if self.proxy and self.proxy.get('server'):
1431
+ if self.proxy:
1399
1432
  # NOTE 2024-05-17: switch to async to fetch, the lib uses socks5h by default
1400
- connector = ProxyConnector.from_url(self.proxy['server'])
1433
+ connector = ProxyConnector.from_url(self.__prepare_proxy_aiohttp(self.proxy))
1401
1434
 
1402
1435
  extracted_favicons = self.__extract_favicons(rendered_content)
1403
1436
  if not extracted_favicons:
1404
1437
  return set()
1405
1438
  to_fetch, to_return = extracted_favicons
1406
1439
  to_fetch.add('/favicon.ico')
1407
- async with aiohttp.ClientSession(connector=connector) as session:
1440
+ timeout = aiohttp.ClientTimeout(total=10)
1441
+ async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
1408
1442
  session.headers['user-agent'] = self.user_agent
1409
1443
  for u in to_fetch:
1410
1444
  try:
@@ -1414,7 +1448,7 @@ class Capture():
1414
1448
  if url_to_fetch in self._requests:
1415
1449
  favicon = self._requests[url_to_fetch]
1416
1450
  if not favicon:
1417
- async with session.get(url_to_fetch, timeout=5, ssl=False) as favicon_response:
1451
+ async with session.get(url_to_fetch, ssl=False) as favicon_response:
1418
1452
  favicon_response.raise_for_status()
1419
1453
  favicon = await favicon_response.read()
1420
1454
  if favicon:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.25.8
3
+ Version: 1.25.10
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -21,17 +21,17 @@ Classifier: Topic :: Internet
21
21
  Classifier: Topic :: Security
22
22
  Provides-Extra: recaptcha
23
23
  Requires-Dist: SpeechRecognition (>=3.10.4,<4.0.0) ; extra == "recaptcha"
24
- Requires-Dist: aiohttp-socks (>=0.8.4,<0.9.0)
25
- Requires-Dist: aiohttp[speedups] (>=3.9.5,<4.0.0)
24
+ Requires-Dist: aiohttp-socks (>=0.9,<0.10)
25
+ Requires-Dist: aiohttp[speedups] (>=3.10.1,<4.0.0)
26
26
  Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
27
27
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
28
28
  Requires-Dist: dateparser (>=1.2.0,<2.0.0)
29
- Requires-Dist: playwright (>=1.45.0,<2.0.0)
29
+ Requires-Dist: playwright (>=1.45.1,<2.0.0)
30
30
  Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
31
31
  Requires-Dist: puremagic (>=1.26,<2.0)
32
32
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
33
33
  Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
34
- Requires-Dist: setuptools (>=71.1.0,<72.0.0)
34
+ Requires-Dist: setuptools (>=72.1.0,<73.0.0)
35
35
  Requires-Dist: tzdata (>=2024.1,<2025.0)
36
36
  Requires-Dist: w3lib (>=2.2.1,<3.0.0)
37
37
  Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
@@ -0,0 +1,9 @@
1
+ playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
+ playwrightcapture/capture.py,sha256=AayQPmsjzcN2TIBi4uuwDmV0kqIVLDdAfEaNPUg8TXY,72606
3
+ playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
+ playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
5
+ playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ playwrightcapture-1.25.10.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
+ playwrightcapture-1.25.10.dist-info/METADATA,sha256=3nVVIzM7kcR62_XYA1mRUxfoflPZwfLS5B8KDPy3Hks,3172
8
+ playwrightcapture-1.25.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
9
+ playwrightcapture-1.25.10.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
- playwrightcapture/capture.py,sha256=DzAx7xwF5IbYfIOt-4gzppc9Qzc1jadk93zN8PtVJZQ,70846
3
- playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
- playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
5
- playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- playwrightcapture-1.25.8.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
- playwrightcapture-1.25.8.dist-info/METADATA,sha256=508SCjk5btHdQCfQrrivoGA7s-X-LY1TAv6Qp_HyeOE,3173
8
- playwrightcapture-1.25.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
9
- playwrightcapture-1.25.8.dist-info/RECORD,,