PlaywrightCapture 1.24.7__tar.gz → 1.24.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.24.7 → playwrightcapture-1.24.9}/PKG-INFO +5 -4
- {playwrightcapture-1.24.7 → playwrightcapture-1.24.9}/playwrightcapture/capture.py +84 -75
- {playwrightcapture-1.24.7 → playwrightcapture-1.24.9}/pyproject.toml +7 -7
- {playwrightcapture-1.24.7 → playwrightcapture-1.24.9}/LICENSE +0 -0
- {playwrightcapture-1.24.7 → playwrightcapture-1.24.9}/README.md +0 -0
- {playwrightcapture-1.24.7 → playwrightcapture-1.24.9}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.24.7 → playwrightcapture-1.24.9}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.24.7 → playwrightcapture-1.24.9}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.24.7 → playwrightcapture-1.24.9}/playwrightcapture/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.24.7
|
3
|
+
Version: 1.24.9
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -20,16 +20,17 @@ Classifier: Programming Language :: Python :: 3.12
|
|
20
20
|
Classifier: Topic :: Internet
|
21
21
|
Classifier: Topic :: Security
|
22
22
|
Provides-Extra: recaptcha
|
23
|
-
Requires-Dist: SpeechRecognition (>=3.10.
|
23
|
+
Requires-Dist: SpeechRecognition (>=3.10.4,<4.0.0) ; extra == "recaptcha"
|
24
|
+
Requires-Dist: aiohttp-socks (>=0.8.4,<0.9.0)
|
25
|
+
Requires-Dist: aiohttp[speedups] (>=3.9.5,<4.0.0)
|
24
26
|
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
25
27
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
|
26
28
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
27
29
|
Requires-Dist: playwright (>=1.43.0,<2.0.0)
|
28
30
|
Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
29
|
-
Requires-Dist: puremagic (>=1.
|
31
|
+
Requires-Dist: puremagic (>=1.23,<2.0)
|
30
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
31
33
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
32
|
-
Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
|
33
34
|
Requires-Dist: setuptools (>=69.5.1,<70.0.0)
|
34
35
|
Requires-Dist: tzdata (>=2024.1,<2025.0)
|
35
36
|
Requires-Dist: w3lib (>=2.1.2,<3.0.0)
|
@@ -20,11 +20,11 @@ from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping
|
|
20
20
|
from urllib.parse import urlparse, unquote, urljoin
|
21
21
|
from zipfile import ZipFile
|
22
22
|
|
23
|
+
import aiohttp
|
23
24
|
import dateparser
|
24
|
-
import requests
|
25
25
|
import urllib3
|
26
26
|
|
27
|
-
|
27
|
+
from aiohttp_socks import ProxyConnector # type: ignore[import-untyped]
|
28
28
|
from bs4 import BeautifulSoup
|
29
29
|
from charset_normalizer import from_bytes
|
30
30
|
from playwright._impl._errors import TargetClosedError
|
@@ -638,7 +638,7 @@ class Capture():
|
|
638
638
|
except Exception:
|
639
639
|
pass
|
640
640
|
except Exception as e:
|
641
|
-
self.logger.
|
641
|
+
self.logger.info(f'Unable to store request: {e}')
|
642
642
|
|
643
643
|
if page is not None:
|
644
644
|
capturing_sub = True
|
@@ -658,7 +658,7 @@ class Capture():
|
|
658
658
|
await self.__dialog_clickthrough(page)
|
659
659
|
|
660
660
|
await stealth_async(page)
|
661
|
-
page.set_default_timeout(self._capture_timeout * 1000)
|
661
|
+
page.set_default_timeout((self._capture_timeout - 2) * 1000)
|
662
662
|
# trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
|
663
663
|
page.on("requestfinished", store_request)
|
664
664
|
|
@@ -809,7 +809,7 @@ class Capture():
|
|
809
809
|
to_return['html'] = content
|
810
810
|
|
811
811
|
if 'html' in to_return and to_return['html'] is not None and with_favicon:
|
812
|
-
to_return['potential_favicons'] = self.get_favicons(page.url, to_return['html'])
|
812
|
+
to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html'])
|
813
813
|
got_favicons = True
|
814
814
|
|
815
815
|
to_return['last_redirected_url'] = page.url
|
@@ -949,6 +949,7 @@ class Capture():
|
|
949
949
|
to_return['error'] = f'Unable to get the cookies: {e}'
|
950
950
|
# frames_tree = self.make_frame_tree(page.main_frame)
|
951
951
|
try:
|
952
|
+
page.remove_listener("requestfinished", store_request)
|
952
953
|
await page.close()
|
953
954
|
await self.context.close() # context needs to be closed to generate the HAR
|
954
955
|
self.logger.debug('Context closed.')
|
@@ -1096,36 +1097,44 @@ class Capture():
|
|
1096
1097
|
|
1097
1098
|
# click on audio challenge button
|
1098
1099
|
await main_frame.get_by_role("button", name="Get an audio challenge").click()
|
1099
|
-
|
1100
|
-
|
1101
|
-
|
1102
|
-
|
1103
|
-
|
1104
|
-
|
1105
|
-
|
1106
|
-
|
1107
|
-
|
1108
|
-
|
1109
|
-
|
1110
|
-
|
1111
|
-
|
1112
|
-
|
1113
|
-
|
1114
|
-
with
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
self.
|
1100
|
+
|
1101
|
+
connector = None
|
1102
|
+
if self.proxy and self.proxy.get('server'):
|
1103
|
+
connector = ProxyConnector.from_url(self.proxy['server'])
|
1104
|
+
|
1105
|
+
async with aiohttp.ClientSession(connector=connector) as session:
|
1106
|
+
while True:
|
1107
|
+
try:
|
1108
|
+
href = await main_frame.get_by_role("link", name="Alternatively, download audio as MP3").get_attribute("href")
|
1109
|
+
except Exception as e:
|
1110
|
+
self.logger.warning(f'Google caught the browser as a robot, sorry: {e}')
|
1111
|
+
return False
|
1112
|
+
if not href:
|
1113
|
+
self.logger.warning('Unable to find download link for captcha.')
|
1114
|
+
return False
|
1115
|
+
async with session.get(href, timeout=10, ssl=False) as response:
|
1116
|
+
response.raise_for_status()
|
1117
|
+
mp3_content = await response.read()
|
1118
|
+
with NamedTemporaryFile() as mp3_file, NamedTemporaryFile() as wav_file:
|
1119
|
+
mp3_file.write(mp3_content)
|
1120
|
+
pydub.AudioSegment.from_mp3(mp3_file.name).export(wav_file.name, format="wav")
|
1121
|
+
recognizer = Recognizer()
|
1122
|
+
recaptcha_audio = AudioFile(wav_file.name)
|
1123
|
+
with recaptcha_audio as source:
|
1124
|
+
audio = recognizer.record(source)
|
1125
|
+
text = recognizer.recognize_google(audio)
|
1126
|
+
await main_frame.get_by_role("textbox", name="Enter what you hear").fill(text)
|
1127
|
+
await main_frame.get_by_role("button", name="Verify").click()
|
1128
|
+
await self._safe_wait(page, 5)
|
1129
|
+
await self._wait_for_random_timeout(page, random.randint(3, 6))
|
1130
|
+
try:
|
1131
|
+
if await recaptcha_init_frame.locator("//span[@id='recaptcha-anchor']").first.is_checked(timeout=5000):
|
1132
|
+
self.logger.info('Captcha solved successfully')
|
1133
|
+
return True
|
1134
|
+
elif await main_frame.get_by_role("textbox", name="Enter what you hear").is_editable(timeout=5000):
|
1135
|
+
self.logger.info('Unable to find checkbox, needs to solve more captchas')
|
1136
|
+
except PlaywrightTimeoutError as e:
|
1137
|
+
self.logger.info(f'Unexpected timeout: {e}')
|
1129
1138
|
|
1130
1139
|
def _update_exceptions(self, exception: Error) -> None:
|
1131
1140
|
if '\n' in exception.message:
|
@@ -1275,56 +1284,56 @@ class Capture():
|
|
1275
1284
|
self.logger.info(f'Not processing {tag}')
|
1276
1285
|
return favicons_urls, favicons
|
1277
1286
|
|
1278
|
-
def get_favicons(self, rendered_url: str, rendered_content: str) -> set[bytes]:
|
1287
|
+
async def get_favicons(self, rendered_url: str, rendered_content: str) -> set[bytes]:
|
1279
1288
|
"""This method will be deprecated as soon as Playwright will be able to fetch favicons (https://github.com/microsoft/playwright/issues/7493).
|
1280
1289
|
In the meantime, we try to get all the potential ones in this method.
|
1281
1290
|
Method inspired by https://github.com/ail-project/ail-framework/blob/master/bin/lib/crawlers.py
|
1282
1291
|
"""
|
1292
|
+
connector = None
|
1293
|
+
if self.proxy and self.proxy.get('server'):
|
1294
|
+
# NOTE 2024-05-17: switch to async to fetch, the lib uses socks5h by default
|
1295
|
+
connector = ProxyConnector.from_url(self.proxy['server'])
|
1296
|
+
|
1283
1297
|
extracted_favicons = self.__extract_favicons(rendered_content)
|
1284
1298
|
if not extracted_favicons:
|
1285
1299
|
return set()
|
1286
1300
|
to_fetch, to_return = extracted_favicons
|
1287
1301
|
to_fetch.add('/favicon.ico')
|
1288
|
-
|
1289
|
-
|
1290
|
-
|
1291
|
-
|
1292
|
-
|
1293
|
-
|
1294
|
-
|
1295
|
-
|
1296
|
-
|
1297
|
-
|
1298
|
-
|
1299
|
-
|
1300
|
-
|
1301
|
-
favicon
|
1302
|
-
|
1303
|
-
|
1304
|
-
|
1305
|
-
|
1306
|
-
|
1307
|
-
try:
|
1308
|
-
mimetype = from_string(favicon, mime=True)
|
1309
|
-
except PureError:
|
1310
|
-
# unable to identify the mimetype
|
1311
|
-
self.logger.debug(f'Unable to identify the mimetype for favicon from {u}')
|
1312
|
-
else:
|
1313
|
-
if not mimetype:
|
1314
|
-
# empty, ignore
|
1315
|
-
pass
|
1316
|
-
elif mimetype.startswith('image'):
|
1317
|
-
to_return.add(favicon)
|
1318
|
-
elif mimetype.startswith('text'):
|
1319
|
-
# Just ignore, it's probably a 404 page
|
1320
|
-
pass
|
1302
|
+
async with aiohttp.ClientSession(connector=connector) as session:
|
1303
|
+
session.headers['user-agent'] = self.user_agent
|
1304
|
+
for u in to_fetch:
|
1305
|
+
try:
|
1306
|
+
self.logger.debug(f'Attempting to fetch favicon from {u}.')
|
1307
|
+
url_to_fetch = urljoin(rendered_url, u)
|
1308
|
+
favicon = b''
|
1309
|
+
if url_to_fetch in self._requests:
|
1310
|
+
favicon = self._requests[url_to_fetch]
|
1311
|
+
if not favicon:
|
1312
|
+
async with session.get(url_to_fetch, timeout=5, ssl=False) as favicon_response:
|
1313
|
+
favicon_response.raise_for_status()
|
1314
|
+
favicon = await favicon_response.read()
|
1315
|
+
if favicon:
|
1316
|
+
try:
|
1317
|
+
mimetype = from_string(favicon, mime=True)
|
1318
|
+
except PureError:
|
1319
|
+
# unable to identify the mimetype
|
1320
|
+
self.logger.debug(f'Unable to identify the mimetype for favicon from {u}')
|
1321
1321
|
else:
|
1322
|
-
|
1323
|
-
|
1324
|
-
|
1325
|
-
|
1326
|
-
|
1327
|
-
|
1322
|
+
if not mimetype:
|
1323
|
+
# empty, ignore
|
1324
|
+
pass
|
1325
|
+
elif mimetype.startswith('image'):
|
1326
|
+
to_return.add(favicon)
|
1327
|
+
elif mimetype.startswith('text'):
|
1328
|
+
# Just ignore, it's probably a 404 page
|
1329
|
+
pass
|
1330
|
+
else:
|
1331
|
+
self.logger.warning(f'Unexpected mimetype for favicon from {u}: {mimetype}')
|
1332
|
+
self.logger.debug(f'Done with favicon from {u}.')
|
1333
|
+
except aiohttp.ClientError as e:
|
1334
|
+
self.logger.debug(f'Unable to fetch favicon from {u}: {e}')
|
1335
|
+
except Exception as e:
|
1336
|
+
self.logger.info(f'Unexpectedly unable to fetch favicon from {u}: {e}')
|
1328
1337
|
return to_return
|
1329
1338
|
|
1330
1339
|
# END FAVICON EXTRACTOR
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.24.7"
|
3
|
+
version = "1.24.9"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -23,28 +23,28 @@ playwright = "^1.43.0"
|
|
23
23
|
dateparser = "^1.2.0"
|
24
24
|
beautifulsoup4 = {version= "^4.12.3", extras = ["lxml", "charset_normalizer"]}
|
25
25
|
w3lib = "^2.1.2"
|
26
|
-
requests = {extras = ["socks"], version = "^2.31.0"}
|
27
26
|
pydub = {version = "^0.25.1", optional = true}
|
28
|
-
SpeechRecognition = {version = "^3.10.
|
27
|
+
SpeechRecognition = {version = "^3.10.4", optional = true}
|
29
28
|
pytz = {"version" = "^2024.1", python = "<3.9"}
|
30
29
|
tzdata = "^2024.1"
|
31
30
|
playwright-stealth = "^1.0.6"
|
32
31
|
setuptools = "^69.5.1"
|
33
|
-
puremagic = "^1.
|
32
|
+
puremagic = "^1.23"
|
34
33
|
async-timeout = {version = "^4.0.3", python = "<3.11"}
|
34
|
+
aiohttp = {extras = ["speedups"], version = "^3.9.5"}
|
35
|
+
aiohttp-socks = "^0.8.4"
|
35
36
|
|
36
37
|
[tool.poetry.extras]
|
37
|
-
recaptcha = ["
|
38
|
+
recaptcha = ["pydub", "SpeechRecognition"]
|
38
39
|
|
39
40
|
[tool.poetry.group.dev]
|
40
41
|
optional = true
|
41
42
|
|
42
43
|
[tool.poetry.group.dev.dependencies]
|
43
|
-
types-beautifulsoup4 = "^4.12.0.
|
44
|
+
types-beautifulsoup4 = "^4.12.0.20240511"
|
44
45
|
pytest = "^8.2.0"
|
45
46
|
mypy = "^1.10.0"
|
46
47
|
types-dateparser = "^1.2.0.20240420"
|
47
|
-
types-requests = "^2.31.0.20240406"
|
48
48
|
types-pytz = "^2024.1.0.20240417"
|
49
49
|
|
50
50
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|