PlaywrightCapture 1.25.8__py3-none-any.whl → 1.25.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- playwrightcapture/capture.py +65 -31
- {playwrightcapture-1.25.8.dist-info → playwrightcapture-1.25.10.dist-info}/METADATA +5 -5
- playwrightcapture-1.25.10.dist-info/RECORD +9 -0
- playwrightcapture-1.25.8.dist-info/RECORD +0 -9
- {playwrightcapture-1.25.8.dist-info → playwrightcapture-1.25.10.dist-info}/LICENSE +0 -0
- {playwrightcapture-1.25.8.dist-info → playwrightcapture-1.25.10.dist-info}/WHEEL +0 -0
playwrightcapture/capture.py
CHANGED
@@ -18,7 +18,7 @@ from io import BytesIO
|
|
18
18
|
from logging import LoggerAdapter, Logger
|
19
19
|
from tempfile import NamedTemporaryFile
|
20
20
|
from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping, Generator
|
21
|
-
from urllib.parse import urlparse, unquote, urljoin
|
21
|
+
from urllib.parse import urlparse, unquote, urljoin, urlsplit, urlunsplit
|
22
22
|
from zipfile import ZipFile
|
23
23
|
|
24
24
|
import aiohttp
|
@@ -96,28 +96,40 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
|
96
96
|
return msg, kwargs
|
97
97
|
|
98
98
|
|
99
|
+
# good test pages:
|
100
|
+
# https://bot.incolumitas.com/
|
101
|
+
# https://fingerprintjs.github.io/BotD/main/
|
102
|
+
|
99
103
|
@dataclass
|
100
104
|
class PCStealthConfig(StealthConfig): # type: ignore[misc]
|
101
105
|
|
102
106
|
@property
|
103
107
|
def enabled_scripts(self) -> Generator[str, None, None]:
|
104
|
-
self.webdriver = True
|
105
|
-
self.webgl_vendor = True
|
106
108
|
self.chrome_app = True
|
107
109
|
self.chrome_csi = True
|
108
|
-
self.chrome_load_times = True
|
109
110
|
self.chrome_runtime = True
|
111
|
+
self.chrome_load_times = True
|
112
|
+
self.navigator_plugins = True
|
113
|
+
self.hairline = True
|
110
114
|
self.iframe_content_window = True
|
111
115
|
self.media_codecs = True
|
116
|
+
|
117
|
+
# permissions are handled directly in playwright
|
118
|
+
self.navigator_permissions = False
|
119
|
+
# Platform is correct now
|
120
|
+
self.navigator_platform = False
|
121
|
+
# probably useless, but it will fallback to 4 regardless
|
112
122
|
self.navigator_hardware_concurrency = 4
|
123
|
+
# Webgl vendor is correct now
|
124
|
+
self.webgl_vendor = False
|
125
|
+
# Set by the viewport
|
126
|
+
self.outerdimensions = False
|
127
|
+
|
128
|
+
# Not working with Playwright 1.45+
|
113
129
|
self.navigator_languages = False # Causes issue
|
114
|
-
self.navigator_permissions = True
|
115
|
-
self.navigator_platform = True
|
116
|
-
self.navigator_plugins = True
|
117
130
|
self.navigator_user_agent = False # Causes issues
|
118
131
|
self.navigator_vendor = False # Causes issues
|
119
|
-
|
120
|
-
self.hairline = True
|
132
|
+
|
121
133
|
yield from super().enabled_scripts
|
122
134
|
|
123
135
|
|
@@ -164,7 +176,7 @@ class Capture():
|
|
164
176
|
self.proxy: ProxySettings = {}
|
165
177
|
if proxy:
|
166
178
|
if isinstance(proxy, str):
|
167
|
-
self.proxy =
|
179
|
+
self.proxy = self.__prepare_proxy_playwright(proxy)
|
168
180
|
elif isinstance(proxy, dict):
|
169
181
|
self.proxy = {'server': proxy['server'], 'bypass': proxy.get('bypass', ''),
|
170
182
|
'username': proxy.get('username', ''),
|
@@ -184,9 +196,22 @@ class Capture():
|
|
184
196
|
self._viewport: ViewportSize | None = None
|
185
197
|
self._user_agent: str = ''
|
186
198
|
self._timezone_id: str = ''
|
187
|
-
self._locale: str = ''
|
199
|
+
self._locale: str = 'en-US'
|
188
200
|
self._color_scheme: Literal['dark', 'light', 'no-preference', 'null'] | None = None
|
189
201
|
|
202
|
+
def __prepare_proxy_playwright(self, proxy: str) -> ProxySettings:
|
203
|
+
splitted = urlsplit(proxy)
|
204
|
+
if splitted.username and splitted.password:
|
205
|
+
return {'username': splitted.username, 'password': splitted.password,
|
206
|
+
'server': urlunsplit((splitted.scheme, f'{splitted.hostname}:{splitted.port}', splitted.path, splitted.query, splitted.fragment))}
|
207
|
+
return {'server': proxy}
|
208
|
+
|
209
|
+
def __prepare_proxy_aiohttp(self, proxy: ProxySettings) -> str:
|
210
|
+
if 'username' in proxy and 'password' in proxy:
|
211
|
+
splitted = urlsplit(proxy['server'])
|
212
|
+
return urlunsplit((splitted.scheme, f'{proxy["username"]}:{proxy["password"]}@{splitted.netloc}', splitted.path, splitted.query, splitted.fragment))
|
213
|
+
return proxy['server']
|
214
|
+
|
190
215
|
async def __aenter__(self) -> Capture:
|
191
216
|
'''Launch the browser'''
|
192
217
|
self._temp_harfile = NamedTemporaryFile(delete=False)
|
@@ -449,21 +474,22 @@ class Capture():
|
|
449
474
|
# NOTE: Which perms are supported by which browsers varies
|
450
475
|
# See https://github.com/microsoft/playwright/issues/16577
|
451
476
|
chromium_permissions = [
|
452
|
-
'geolocation',
|
453
|
-
'midi',
|
454
|
-
'midi-sysex',
|
455
|
-
'notifications',
|
456
|
-
'camera',
|
457
|
-
'microphone',
|
458
|
-
'background-sync',
|
459
|
-
'ambient-light-sensor',
|
460
477
|
'accelerometer',
|
461
|
-
'gyroscope',
|
462
|
-
'magnetometer',
|
463
478
|
'accessibility-events',
|
479
|
+
'ambient-light-sensor',
|
480
|
+
'background-sync',
|
481
|
+
'camera',
|
464
482
|
'clipboard-read',
|
465
483
|
'clipboard-write',
|
466
|
-
'
|
484
|
+
'geolocation',
|
485
|
+
'gyroscope',
|
486
|
+
'magnetometer',
|
487
|
+
'microphone',
|
488
|
+
'midi-sysex',
|
489
|
+
'midi',
|
490
|
+
'notifications',
|
491
|
+
'payment-handler',
|
492
|
+
'storage-access'
|
467
493
|
]
|
468
494
|
|
469
495
|
firefox_permissions = ['geolocation', 'notifications']
|
@@ -707,7 +733,6 @@ class Capture():
|
|
707
733
|
await self.__dialog_clickthrough(page)
|
708
734
|
|
709
735
|
await stealth_async(page, PCStealthConfig())
|
710
|
-
# await stealth_async(page)
|
711
736
|
|
712
737
|
page.set_default_timeout((self._capture_timeout - 2) * 1000)
|
713
738
|
# trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
|
@@ -786,6 +811,8 @@ class Capture():
|
|
786
811
|
self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
|
787
812
|
except Error as e:
|
788
813
|
self.logger.warning(f'Error while resolving captcha on {url}: {e}')
|
814
|
+
except (TimeoutError, asyncio.TimeoutError) as e:
|
815
|
+
self.logger.warning(f'[Timeout] Error while resolving captcha on {url}: {e}')
|
789
816
|
except Exception as e:
|
790
817
|
self.logger.exception(f'General error with captcha solving on {url}: {e}')
|
791
818
|
# ======
|
@@ -892,8 +919,13 @@ class Capture():
|
|
892
919
|
to_return['html'] = content
|
893
920
|
|
894
921
|
if 'html' in to_return and to_return['html'] is not None and with_favicon:
|
895
|
-
|
896
|
-
|
922
|
+
try:
|
923
|
+
to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html'])
|
924
|
+
got_favicons = True
|
925
|
+
except (TimeoutError, asyncio.TimeoutError) as e:
|
926
|
+
self.logger.warning(f'[Timeout] Unable to get favicons: {e}')
|
927
|
+
except Exception as e:
|
928
|
+
self.logger.warning(f'Unable to get favicons: {e}')
|
897
929
|
|
898
930
|
to_return['last_redirected_url'] = page.url
|
899
931
|
to_return['png'] = await self._failsafe_get_screenshot(page)
|
@@ -1196,7 +1228,8 @@ class Capture():
|
|
1196
1228
|
if self.proxy and self.proxy.get('server'):
|
1197
1229
|
connector = ProxyConnector.from_url(self.proxy['server'])
|
1198
1230
|
|
1199
|
-
|
1231
|
+
timeout = aiohttp.ClientTimeout(total=10)
|
1232
|
+
async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
|
1200
1233
|
while True:
|
1201
1234
|
try:
|
1202
1235
|
href = await main_frame.get_by_role("link", name="Alternatively, download audio as MP3").get_attribute("href")
|
@@ -1206,7 +1239,7 @@ class Capture():
|
|
1206
1239
|
if not href:
|
1207
1240
|
self.logger.warning('Unable to find download link for captcha.')
|
1208
1241
|
return False
|
1209
|
-
async with session.get(href,
|
1242
|
+
async with session.get(href, ssl=False) as response:
|
1210
1243
|
response.raise_for_status()
|
1211
1244
|
mp3_content = await response.read()
|
1212
1245
|
with NamedTemporaryFile() as mp3_file, NamedTemporaryFile() as wav_file:
|
@@ -1395,16 +1428,17 @@ class Capture():
|
|
1395
1428
|
Method inspired by https://github.com/ail-project/ail-framework/blob/master/bin/lib/crawlers.py
|
1396
1429
|
"""
|
1397
1430
|
connector = None
|
1398
|
-
if self.proxy and self.proxy.get('server'):
|
1431
|
+
if self.proxy:
|
1399
1432
|
# NOTE 2024-05-17: switch to async to fetch, the lib uses socks5h by default
|
1400
|
-
connector = ProxyConnector.from_url(self.proxy['server'])
|
1433
|
+
connector = ProxyConnector.from_url(self.__prepare_proxy_aiohttp(self.proxy))
|
1401
1434
|
|
1402
1435
|
extracted_favicons = self.__extract_favicons(rendered_content)
|
1403
1436
|
if not extracted_favicons:
|
1404
1437
|
return set()
|
1405
1438
|
to_fetch, to_return = extracted_favicons
|
1406
1439
|
to_fetch.add('/favicon.ico')
|
1407
|
-
|
1440
|
+
timeout = aiohttp.ClientTimeout(total=10)
|
1441
|
+
async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
|
1408
1442
|
session.headers['user-agent'] = self.user_agent
|
1409
1443
|
for u in to_fetch:
|
1410
1444
|
try:
|
@@ -1414,7 +1448,7 @@ class Capture():
|
|
1414
1448
|
if url_to_fetch in self._requests:
|
1415
1449
|
favicon = self._requests[url_to_fetch]
|
1416
1450
|
if not favicon:
|
1417
|
-
async with session.get(url_to_fetch,
|
1451
|
+
async with session.get(url_to_fetch, ssl=False) as favicon_response:
|
1418
1452
|
favicon_response.raise_for_status()
|
1419
1453
|
favicon = await favicon_response.read()
|
1420
1454
|
if favicon:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.25.8
|
3
|
+
Version: 1.25.10
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -21,17 +21,17 @@ Classifier: Topic :: Internet
|
|
21
21
|
Classifier: Topic :: Security
|
22
22
|
Provides-Extra: recaptcha
|
23
23
|
Requires-Dist: SpeechRecognition (>=3.10.4,<4.0.0) ; extra == "recaptcha"
|
24
|
-
Requires-Dist: aiohttp-socks (>=0.
|
25
|
-
Requires-Dist: aiohttp[speedups] (>=3.
|
24
|
+
Requires-Dist: aiohttp-socks (>=0.9,<0.10)
|
25
|
+
Requires-Dist: aiohttp[speedups] (>=3.10.1,<4.0.0)
|
26
26
|
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
27
27
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
|
28
28
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
29
|
-
Requires-Dist: playwright (>=1.45.
|
29
|
+
Requires-Dist: playwright (>=1.45.1,<2.0.0)
|
30
30
|
Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
31
31
|
Requires-Dist: puremagic (>=1.26,<2.0)
|
32
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
33
33
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
34
|
-
Requires-Dist: setuptools (>=
|
34
|
+
Requires-Dist: setuptools (>=72.1.0,<73.0.0)
|
35
35
|
Requires-Dist: tzdata (>=2024.1,<2025.0)
|
36
36
|
Requires-Dist: w3lib (>=2.2.1,<3.0.0)
|
37
37
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
@@ -0,0 +1,9 @@
|
|
1
|
+
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
+
playwrightcapture/capture.py,sha256=AayQPmsjzcN2TIBi4uuwDmV0kqIVLDdAfEaNPUg8TXY,72606
|
3
|
+
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
|
+
playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
|
5
|
+
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
playwrightcapture-1.25.10.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
+
playwrightcapture-1.25.10.dist-info/METADATA,sha256=3nVVIzM7kcR62_XYA1mRUxfoflPZwfLS5B8KDPy3Hks,3172
|
8
|
+
playwrightcapture-1.25.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
+
playwrightcapture-1.25.10.dist-info/RECORD,,
|
@@ -1,9 +0,0 @@
|
|
1
|
-
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
-
playwrightcapture/capture.py,sha256=DzAx7xwF5IbYfIOt-4gzppc9Qzc1jadk93zN8PtVJZQ,70846
|
3
|
-
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
|
-
playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
|
5
|
-
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
playwrightcapture-1.25.8.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
-
playwrightcapture-1.25.8.dist-info/METADATA,sha256=508SCjk5btHdQCfQrrivoGA7s-X-LY1TAv6Qp_HyeOE,3173
|
8
|
-
playwrightcapture-1.25.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
-
playwrightcapture-1.25.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|