PlaywrightCapture 1.24.8__tar.gz → 1.24.10__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.24.8
3
+ Version: 1.24.10
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -20,16 +20,17 @@ Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Topic :: Internet
21
21
  Classifier: Topic :: Security
22
22
  Provides-Extra: recaptcha
23
- Requires-Dist: SpeechRecognition (>=3.10.3,<4.0.0) ; extra == "recaptcha"
23
+ Requires-Dist: SpeechRecognition (>=3.10.4,<4.0.0) ; extra == "recaptcha"
24
+ Requires-Dist: aiohttp-socks (>=0.8.4,<0.9.0)
25
+ Requires-Dist: aiohttp[speedups] (>=3.9.5,<4.0.0)
24
26
  Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
25
27
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
26
28
  Requires-Dist: dateparser (>=1.2.0,<2.0.0)
27
- Requires-Dist: playwright (>=1.43.0,<2.0.0)
29
+ Requires-Dist: playwright (>=1.44.0,<2.0.0)
28
30
  Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
29
- Requires-Dist: puremagic (>=1.22,<2.0)
31
+ Requires-Dist: puremagic (>=1.23,<2.0)
30
32
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
31
33
  Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
32
- Requires-Dist: requests[socks] (>=2.31.0,<3.0.0) ; extra == "recaptcha"
33
34
  Requires-Dist: setuptools (>=69.5.1,<70.0.0)
34
35
  Requires-Dist: tzdata (>=2024.1,<2025.0)
35
36
  Requires-Dist: w3lib (>=2.1.2,<3.0.0)
@@ -20,11 +20,11 @@ from typing import Any, TypedDict, Literal, TYPE_CHECKING, MutableMapping
20
20
  from urllib.parse import urlparse, unquote, urljoin
21
21
  from zipfile import ZipFile
22
22
 
23
+ import aiohttp
23
24
  import dateparser
24
- import requests
25
25
  import urllib3
26
26
 
27
-
27
+ from aiohttp_socks import ProxyConnector # type: ignore[import-untyped]
28
28
  from bs4 import BeautifulSoup
29
29
  from charset_normalizer import from_bytes
30
30
  from playwright._impl._errors import TargetClosedError
@@ -638,7 +638,7 @@ class Capture():
638
638
  except Exception:
639
639
  pass
640
640
  except Exception as e:
641
- self.logger.warning(f'Unable to store request: {e}')
641
+ self.logger.info(f'Unable to store request: {e}')
642
642
 
643
643
  if page is not None:
644
644
  capturing_sub = True
@@ -658,7 +658,7 @@ class Capture():
658
658
  await self.__dialog_clickthrough(page)
659
659
 
660
660
  await stealth_async(page)
661
- page.set_default_timeout(self._capture_timeout * 1000)
661
+ page.set_default_timeout((self._capture_timeout - 2) * 1000)
662
662
  # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
663
663
  page.on("requestfinished", store_request)
664
664
 
@@ -809,7 +809,7 @@ class Capture():
809
809
  to_return['html'] = content
810
810
 
811
811
  if 'html' in to_return and to_return['html'] is not None and with_favicon:
812
- to_return['potential_favicons'] = self.get_favicons(page.url, to_return['html'])
812
+ to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html'])
813
813
  got_favicons = True
814
814
 
815
815
  to_return['last_redirected_url'] = page.url
@@ -949,6 +949,7 @@ class Capture():
949
949
  to_return['error'] = f'Unable to get the cookies: {e}'
950
950
  # frames_tree = self.make_frame_tree(page.main_frame)
951
951
  try:
952
+ page.remove_listener("requestfinished", store_request)
952
953
  await page.close()
953
954
  await self.context.close() # context needs to be closed to generate the HAR
954
955
  self.logger.debug('Context closed.')
@@ -1096,36 +1097,44 @@ class Capture():
1096
1097
 
1097
1098
  # click on audio challenge button
1098
1099
  await main_frame.get_by_role("button", name="Get an audio challenge").click()
1099
- while True:
1100
- try:
1101
- href = await main_frame.get_by_role("link", name="Alternatively, download audio as MP3").get_attribute("href")
1102
- except Exception as e:
1103
- self.logger.warning(f'Google caught the browser as a robot, sorry: {e}')
1104
- return False
1105
- if not href:
1106
- self.logger.warning('Unable to find download link for captcha.')
1107
- return False
1108
- r = requests.get(href, allow_redirects=True)
1109
- with NamedTemporaryFile() as mp3_file, NamedTemporaryFile() as wav_file:
1110
- mp3_file.write(r.content)
1111
- pydub.AudioSegment.from_mp3(mp3_file.name).export(wav_file.name, format="wav")
1112
- recognizer = Recognizer()
1113
- recaptcha_audio = AudioFile(wav_file.name)
1114
- with recaptcha_audio as source:
1115
- audio = recognizer.record(source)
1116
- text = recognizer.recognize_google(audio)
1117
- await main_frame.get_by_role("textbox", name="Enter what you hear").fill(text)
1118
- await main_frame.get_by_role("button", name="Verify").click()
1119
- await self._safe_wait(page, 5)
1120
- await self._wait_for_random_timeout(page, random.randint(3, 6))
1121
- try:
1122
- if await recaptcha_init_frame.locator("//span[@id='recaptcha-anchor']").first.is_checked(timeout=5000):
1123
- self.logger.info('Captcha solved successfully')
1124
- return True
1125
- elif await main_frame.get_by_role("textbox", name="Enter what you hear").is_editable(timeout=5000):
1126
- self.logger.info('Unable to find checkbox, needs to solve more captchas')
1127
- except PlaywrightTimeoutError as e:
1128
- self.logger.info(f'Unexpected timeout: {e}')
1100
+
1101
+ connector = None
1102
+ if self.proxy and self.proxy.get('server'):
1103
+ connector = ProxyConnector.from_url(self.proxy['server'])
1104
+
1105
+ async with aiohttp.ClientSession(connector=connector) as session:
1106
+ while True:
1107
+ try:
1108
+ href = await main_frame.get_by_role("link", name="Alternatively, download audio as MP3").get_attribute("href")
1109
+ except Exception as e:
1110
+ self.logger.warning(f'Google caught the browser as a robot, sorry: {e}')
1111
+ return False
1112
+ if not href:
1113
+ self.logger.warning('Unable to find download link for captcha.')
1114
+ return False
1115
+ async with session.get(href, timeout=10, ssl=False) as response:
1116
+ response.raise_for_status()
1117
+ mp3_content = await response.read()
1118
+ with NamedTemporaryFile() as mp3_file, NamedTemporaryFile() as wav_file:
1119
+ mp3_file.write(mp3_content)
1120
+ pydub.AudioSegment.from_mp3(mp3_file.name).export(wav_file.name, format="wav")
1121
+ recognizer = Recognizer()
1122
+ recaptcha_audio = AudioFile(wav_file.name)
1123
+ with recaptcha_audio as source:
1124
+ audio = recognizer.record(source)
1125
+ text = recognizer.recognize_google(audio)
1126
+ await main_frame.get_by_role("textbox", name="Enter what you hear").fill(text)
1127
+ await main_frame.get_by_role("button", name="Verify").click()
1128
+ await self._safe_wait(page, 5)
1129
+ await self._wait_for_random_timeout(page, random.randint(3, 6))
1130
+ try:
1131
+ if await recaptcha_init_frame.locator("//span[@id='recaptcha-anchor']").first.is_checked(timeout=5000):
1132
+ self.logger.info('Captcha solved successfully')
1133
+ return True
1134
+ elif await main_frame.get_by_role("textbox", name="Enter what you hear").is_editable(timeout=5000):
1135
+ self.logger.info('Unable to find checkbox, needs to solve more captchas')
1136
+ except PlaywrightTimeoutError as e:
1137
+ self.logger.info(f'Unexpected timeout: {e}')
1129
1138
 
1130
1139
  def _update_exceptions(self, exception: Error) -> None:
1131
1140
  if '\n' in exception.message:
@@ -1275,63 +1284,56 @@ class Capture():
1275
1284
  self.logger.info(f'Not processing {tag}')
1276
1285
  return favicons_urls, favicons
1277
1286
 
1278
- def get_favicons(self, rendered_url: str, rendered_content: str) -> set[bytes]:
1287
+ async def get_favicons(self, rendered_url: str, rendered_content: str) -> set[bytes]:
1279
1288
  """This method will be deprecated as soon as Playwright will be able to fetch favicons (https://github.com/microsoft/playwright/issues/7493).
1280
1289
  In the meantime, we try to get all the potential ones in this method.
1281
1290
  Method inspired by https://github.com/ail-project/ail-framework/blob/master/bin/lib/crawlers.py
1282
1291
  """
1292
+ connector = None
1293
+ if self.proxy and self.proxy.get('server'):
1294
+ # NOTE 2024-05-17: switch to async to fetch, the lib uses socks5h by default
1295
+ connector = ProxyConnector.from_url(self.proxy['server'])
1296
+
1283
1297
  extracted_favicons = self.__extract_favicons(rendered_content)
1284
1298
  if not extracted_favicons:
1285
1299
  return set()
1286
1300
  to_fetch, to_return = extracted_favicons
1287
1301
  to_fetch.add('/favicon.ico')
1288
- session = requests.session()
1289
- session.verify = False
1290
- session.headers['user-agent'] = self.user_agent
1291
- if self.proxy and self.proxy.get('server'):
1292
- proxy_server = self.proxy['server']
1293
- # Make sure the DNS desolution is done remotely
1294
- # https://urllib3.readthedocs.io/en/stable/advanced-usage.html#socks-proxies
1295
- if proxy_server.startswith('socks5://'):
1296
- proxy_server = proxy_server.replace('socks5://', 'socks5h://')
1297
- if proxy_server.startswith('socks4://'):
1298
- proxy_server = proxy_server.replace('socks4://', 'socks4a://')
1299
-
1300
- proxies = {'http': proxy_server, 'https': proxy_server}
1301
- session.proxies.update(proxies)
1302
- for u in to_fetch:
1303
- try:
1304
- self.logger.debug(f'Attempting to fetch favicon from {u}.')
1305
- url_to_fetch = urljoin(rendered_url, u)
1306
- favicon = b''
1307
- if url_to_fetch in self._requests:
1308
- favicon = self._requests[url_to_fetch]
1309
- if not favicon:
1310
- favicon_response = session.get(url_to_fetch, timeout=5)
1311
- favicon_response.raise_for_status()
1312
- favicon = favicon_response.content
1313
- if favicon:
1314
- try:
1315
- mimetype = from_string(favicon, mime=True)
1316
- except PureError:
1317
- # unable to identify the mimetype
1318
- self.logger.debug(f'Unable to identify the mimetype for favicon from {u}')
1319
- else:
1320
- if not mimetype:
1321
- # empty, ignore
1322
- pass
1323
- elif mimetype.startswith('image'):
1324
- to_return.add(favicon)
1325
- elif mimetype.startswith('text'):
1326
- # Just ignore, it's probably a 404 page
1327
- pass
1302
+ async with aiohttp.ClientSession(connector=connector) as session:
1303
+ session.headers['user-agent'] = self.user_agent
1304
+ for u in to_fetch:
1305
+ try:
1306
+ self.logger.debug(f'Attempting to fetch favicon from {u}.')
1307
+ url_to_fetch = urljoin(rendered_url, u)
1308
+ favicon = b''
1309
+ if url_to_fetch in self._requests:
1310
+ favicon = self._requests[url_to_fetch]
1311
+ if not favicon:
1312
+ async with session.get(url_to_fetch, timeout=5, ssl=False) as favicon_response:
1313
+ favicon_response.raise_for_status()
1314
+ favicon = await favicon_response.read()
1315
+ if favicon:
1316
+ try:
1317
+ mimetype = from_string(favicon, mime=True)
1318
+ except PureError:
1319
+ # unable to identify the mimetype
1320
+ self.logger.debug(f'Unable to identify the mimetype for favicon from {u}')
1328
1321
  else:
1329
- self.logger.warning(f'Unexpected mimetype for favicon from {u}: {mimetype}')
1330
- self.logger.debug(f'Done with favicon from {u}.')
1331
- except requests.HTTPError as e:
1332
- self.logger.debug(f'Unable to fetch favicon from {u}: {e}')
1333
- except Exception as e:
1334
- self.logger.info(f'Unexpectedly unable to fetch favicon from {u}: {e}')
1322
+ if not mimetype:
1323
+ # empty, ignore
1324
+ pass
1325
+ elif mimetype.startswith('image'):
1326
+ to_return.add(favicon)
1327
+ elif mimetype.startswith('text'):
1328
+ # Just ignore, it's probably a 404 page
1329
+ pass
1330
+ else:
1331
+ self.logger.warning(f'Unexpected mimetype for favicon from {u}: {mimetype}')
1332
+ self.logger.debug(f'Done with favicon from {u}.')
1333
+ except aiohttp.ClientError as e:
1334
+ self.logger.debug(f'Unable to fetch favicon from {u}: {e}')
1335
+ except Exception as e:
1336
+ self.logger.info(f'Unexpectedly unable to fetch favicon from {u}: {e}')
1335
1337
  return to_return
1336
1338
 
1337
1339
  # END FAVICON EXTRACTOR
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.24.8"
3
+ version = "1.24.10"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -19,32 +19,32 @@ classifiers=[
19
19
 
20
20
  [tool.poetry.dependencies]
21
21
  python = "^3.8"
22
- playwright = "^1.43.0"
22
+ playwright = "^1.44.0"
23
23
  dateparser = "^1.2.0"
24
24
  beautifulsoup4 = {version= "^4.12.3", extras = ["lxml", "charset_normalizer"]}
25
25
  w3lib = "^2.1.2"
26
- requests = {extras = ["socks"], version = "^2.31.0"}
27
26
  pydub = {version = "^0.25.1", optional = true}
28
- SpeechRecognition = {version = "^3.10.3", optional = true}
27
+ SpeechRecognition = {version = "^3.10.4", optional = true}
29
28
  pytz = {"version" = "^2024.1", python = "<3.9"}
30
29
  tzdata = "^2024.1"
31
30
  playwright-stealth = "^1.0.6"
32
31
  setuptools = "^69.5.1"
33
- puremagic = "^1.22"
32
+ puremagic = "^1.23"
34
33
  async-timeout = {version = "^4.0.3", python = "<3.11"}
34
+ aiohttp = {extras = ["speedups"], version = "^3.9.5"}
35
+ aiohttp-socks = "^0.8.4"
35
36
 
36
37
  [tool.poetry.extras]
37
- recaptcha = ["requests", "pydub", "SpeechRecognition"]
38
+ recaptcha = ["pydub", "SpeechRecognition"]
38
39
 
39
40
  [tool.poetry.group.dev]
40
41
  optional = true
41
42
 
42
43
  [tool.poetry.group.dev.dependencies]
43
- types-beautifulsoup4 = "^4.12.0.20240229"
44
- pytest = "^8.2.0"
44
+ types-beautifulsoup4 = "^4.12.0.20240511"
45
+ pytest = "^8.2.1"
45
46
  mypy = "^1.10.0"
46
47
  types-dateparser = "^1.2.0.20240420"
47
- types-requests = "^2.31.0.20240406"
48
48
  types-pytz = "^2024.1.0.20240417"
49
49
 
50
50