PlaywrightCapture 1.28.6__tar.gz → 1.29.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.28.6 → playwrightcapture-1.29.1}/PKG-INFO +7 -5
- {playwrightcapture-1.28.6 → playwrightcapture-1.29.1}/playwrightcapture/capture.py +38 -0
- playwrightcapture-1.29.1/playwrightcapture/socks5dnslookup.py +111 -0
- {playwrightcapture-1.28.6 → playwrightcapture-1.29.1}/pyproject.toml +8 -6
- {playwrightcapture-1.28.6 → playwrightcapture-1.29.1}/LICENSE +0 -0
- {playwrightcapture-1.28.6 → playwrightcapture-1.29.1}/README.md +0 -0
- {playwrightcapture-1.28.6 → playwrightcapture-1.29.1}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.28.6 → playwrightcapture-1.29.1}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.28.6 → playwrightcapture-1.29.1}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.28.6 → playwrightcapture-1.29.1}/playwrightcapture/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.28.6
|
3
|
+
Version: 1.29.1
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
License: BSD-3-Clause
|
6
6
|
Author: Raphaël Vinot
|
@@ -20,15 +20,17 @@ Classifier: Topic :: Security
|
|
20
20
|
Provides-Extra: recaptcha
|
21
21
|
Requires-Dist: SpeechRecognition (>=3.14.2) ; extra == "recaptcha"
|
22
22
|
Requires-Dist: aiohttp-socks (>=0.10.1)
|
23
|
-
Requires-Dist: aiohttp[speedups] (>=3.11.
|
23
|
+
Requires-Dist: aiohttp[speedups] (>=3.11.18)
|
24
24
|
Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
|
25
|
-
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.
|
25
|
+
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)
|
26
26
|
Requires-Dist: dateparser (>=1.2.1)
|
27
|
-
Requires-Dist:
|
27
|
+
Requires-Dist: dnspython (>=2.7.0,<3.0.0)
|
28
|
+
Requires-Dist: playwright (>=1.52.0)
|
28
29
|
Requires-Dist: playwright-stealth (>=1.0.6)
|
29
30
|
Requires-Dist: puremagic (>=1.28)
|
30
31
|
Requires-Dist: pydub (>=0.25.1) ; extra == "recaptcha"
|
31
|
-
Requires-Dist:
|
32
|
+
Requires-Dist: python-socks (>=2.7.1,<3.0.0)
|
33
|
+
Requires-Dist: setuptools (>=80.0.1)
|
32
34
|
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0) ; python_version < "3.12"
|
33
35
|
Requires-Dist: tzdata (>=2025.2)
|
34
36
|
Requires-Dist: w3lib (>=2.3.1)
|
@@ -37,6 +37,7 @@ from w3lib.html import strip_html5_whitespace
|
|
37
37
|
from w3lib.url import canonicalize_url, safe_url_string
|
38
38
|
|
39
39
|
from .exceptions import UnknownPlaywrightBrowser, UnknownPlaywrightDevice, InvalidPlaywrightParameter
|
40
|
+
from .socks5dnslookup import Socks5Resolver
|
40
41
|
|
41
42
|
from zoneinfo import available_timezones
|
42
43
|
all_timezones_set = available_timezones()
|
@@ -144,6 +145,7 @@ class Capture():
|
|
144
145
|
|
145
146
|
def __init__(self, browser: BROWSER | None=None, device_name: str | None=None,
|
146
147
|
proxy: str | dict[str, str] | None=None,
|
148
|
+
socks5_dns_resolver: str | list[str] | None=None,
|
147
149
|
general_timeout_in_sec: int | None=None, loglevel: str | int='INFO',
|
148
150
|
uuid: str | None=None, headless: bool=True):
|
149
151
|
"""Captures a page with Playwright.
|
@@ -151,6 +153,7 @@ class Capture():
|
|
151
153
|
:param browser: The browser to use for the capture.
|
152
154
|
:param device_name: The pre-defined device to use for the capture (from playwright).
|
153
155
|
:param proxy: The external proxy to use for the capture.
|
156
|
+
:param socks5_dns_resolver: DNS resolver to use for the socks5 proxy and fill the HAR file.
|
154
157
|
:param general_timeout_in_sec: The general timeout for the capture, including children.
|
155
158
|
:param loglevel: Python loglevel
|
156
159
|
:param uuid: The UUID of the capture.
|
@@ -177,6 +180,7 @@ class Capture():
|
|
177
180
|
self.device_name: str | None = device_name
|
178
181
|
self.headless: bool = headless
|
179
182
|
self.proxy: ProxySettings = {}
|
183
|
+
self.socks5_dns_resolver: str | list[str] | None = socks5_dns_resolver
|
180
184
|
if proxy:
|
181
185
|
if isinstance(proxy, str):
|
182
186
|
self.proxy = self.__prepare_proxy_playwright(proxy)
|
@@ -1238,6 +1242,12 @@ class Capture():
|
|
1238
1242
|
with open(self._temp_harfile.name) as _har:
|
1239
1243
|
to_return['har'] = json.load(_har)
|
1240
1244
|
self.logger.debug('Got HAR.')
|
1245
|
+
if (to_return.get('har') and self.proxy and self.proxy.get('server')
|
1246
|
+
and self.proxy['server'].startswith('socks5')):
|
1247
|
+
# Only if the capture was done via a socks5 proxy
|
1248
|
+
if har := to_return['har']: # Could be None
|
1249
|
+
async with timeout(30):
|
1250
|
+
await self.socks5_resolver(har)
|
1241
1251
|
except (TimeoutError, asyncio.TimeoutError):
|
1242
1252
|
self.logger.warning("Unable to close page and context at the end of the capture.")
|
1243
1253
|
errors.append("Unable to close page and context at the end of the capture.")
|
@@ -1713,3 +1723,31 @@ class Capture():
|
|
1713
1723
|
return to_return
|
1714
1724
|
|
1715
1725
|
# END FAVICON EXTRACTOR
|
1726
|
+
|
1727
|
+
# ##### Run DNS resolution over socks5 proxy #####
|
1728
|
+
# This is only used when the capture is done over a socks5 proxy, and not on a .onion
|
1729
|
+
# We get the HAR file, iterate over the entries and update the IPs
|
1730
|
+
|
1731
|
+
async def socks5_resolver(self, harfile: dict[str, Any]) -> None:
    """Resolve the hosts found in a HAR file through the socks5 proxy.

    The HAR is modified in place: for every entry whose request URL has a
    resolvable, non-``.onion`` host, ``serverIPAddress`` is overwritten with
    one of the addresses returned by the resolver. Entries whose host could
    not be resolved are left untouched.

    :param harfile: The HAR content (as loaded from JSON). Modified in place.
    """
    resolver = Socks5Resolver(logger=self.logger, socks5_proxy=self.proxy['server'],
                              dns_resolver=self.socks5_dns_resolver)

    # Parse every URL once and remember which entry maps to which host.
    # ``hostname`` is used rather than ``netloc``: netloc still contains the
    # optional ``user:pass@`` prefix and ``:port`` suffix, which is not a valid
    # DNS name and also defeats the .onion suffix check.
    entries_to_update: list[tuple[dict[str, Any], str]] = []
    for entry in harfile['log']['entries']:
        url = entry['request']['url']
        if not url:
            continue
        hostname = urlparse(url).hostname
        if hostname and not hostname.endswith('.onion'):
            entries_to_update.append((entry, hostname))

    # use the same technique as in lookyloo to resolve many domains in parallel
    semaphore = asyncio.Semaphore(20)
    unique_hostnames = {hostname for _, hostname in entries_to_update}
    await asyncio.gather(*(resolver.resolve(hostname, semaphore) for hostname in unique_hostnames))
    self.logger.debug('Resolved all domains through the proxy.')

    for entry, hostname in entries_to_update:
        answer = resolver.get_cache(hostname)
        if answer:
            # Several records may be returned; pick one deterministically so
            # the same answer always yields the same HAR content.
            entry['serverIPAddress'] = min(str(rdata) for rdata in answer)
    self.logger.debug('Done updating HAR file')
|
@@ -0,0 +1,111 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import asyncio
|
6
|
+
import socket
|
7
|
+
|
8
|
+
from typing import TYPE_CHECKING
|
9
|
+
|
10
|
+
import dns
|
11
|
+
|
12
|
+
from dns.asyncresolver import Resolver
|
13
|
+
from dns.resolver import Cache
|
14
|
+
from dns._asyncio_backend import _maybe_wait_for, StreamSocket, Backend
|
15
|
+
from python_socks.async_.asyncio import Proxy
|
16
|
+
|
17
|
+
|
18
|
+
if TYPE_CHECKING:
|
19
|
+
from .capture import PlaywrightCaptureLogAdapter
|
20
|
+
from logging import Logger
|
21
|
+
|
22
|
+
|
23
|
+
class Socks5Backend(Backend):
    """dnspython asyncio backend whose stream sockets are opened through a SOCKS5 proxy."""

    def __init__(self, socks5_proxy_url: str):
        """
        :param socks5_proxy_url: URL of the proxy, parsed with ``Proxy.from_url``.
        """
        super().__init__()
        self.proxy = Proxy.from_url(socks5_proxy_url)

    def name(self) -> str:
        """Return the name identifying this backend."""
        return "asyncio socks5"

    async def make_socket(  # type: ignore[no-untyped-def]
        self,
        af,
        socktype,
        proto=0,
        source=None,
        destination=None,
        timeout=None,
        ssl_context=None,
        server_hostname=None,
    ):
        """Open a stream socket to ``destination`` via the proxy.

        :param af: Address family, forwarded to asyncio and to ``StreamSocket``.
        :param socktype: Socket type, only ``socket.SOCK_STREAM`` is supported.
        :param proto: Protocol number, forwarded to asyncio.
        :param source: Local address, forwarded to asyncio.
        :param destination: ``(host, port)`` of the server to reach through the proxy.
        :param timeout: Applied to the proxy connection and then to the stream setup.
        :param ssl_context: Optional SSL context, forwarded to asyncio.
        :param server_hostname: Optional TLS server name, forwarded to asyncio.
        :raises ValueError: If no destination is given for a stream socket.
        :raises NotImplementedError: For any socket type other than SOCK_STREAM.
        """
        if socktype == socket.SOCK_STREAM:
            if destination is None:
                # This shouldn't happen, but we check to make code analysis software
                # happier.
                raise ValueError("destination required for stream sockets")
            # Forward the caller's timeout so the SOCKS5 connection is bounded
            # by it too instead of the proxy library's own default.
            sock = await self.proxy.connect(dest_host=destination[0], dest_port=destination[1],
                                            timeout=timeout)
            try:
                (r, w) = await _maybe_wait_for(  # type: ignore[no-untyped-call]
                    asyncio.open_connection(
                        None,
                        None,
                        sock=sock,
                        ssl=ssl_context,
                        family=af,
                        proto=proto,
                        local_addr=source,
                        server_hostname=server_hostname,
                    ),
                    timeout,
                )
            except BaseException:
                # Do not leak the proxied socket when the stream setup fails,
                # times out or is cancelled. Closing twice is harmless.
                sock.close()
                raise
            return StreamSocket(af, r, w)  # type: ignore[no-untyped-call]
        raise NotImplementedError(
            "unsupported socket " + f"type {socktype}"
        )  # pragma: no cover
|
66
|
+
|
67
|
+
|
68
|
+
class Socks5Resolver:
    """Asynchronous DNS resolver sending its queries over TCP through a SOCKS5 proxy."""

    def __init__(self, logger: Logger | PlaywrightCaptureLogAdapter, socks5_proxy: str, dns_resolver: str | list[str] | None=None):
        """
        :param logger: Logger used to report timeouts and resolution errors.
        :param socks5_proxy: URL of the SOCKS5 proxy the queries go through.
        :param dns_resolver: Nameserver(s) to query. Defaults to 1.1.1.1 when empty.
        """
        self.logger = logger

        # Normalise the nameservers to a list, falling back to 1.1.1.1
        if not dns_resolver:
            nameservers = ['1.1.1.1']
        else:
            nameservers = [dns_resolver] if isinstance(dns_resolver, str) else dns_resolver

        # configure=False: do not load resolv.conf
        self.resolver = Resolver(configure=False)
        self.resolver.cache = Cache(900)
        self.resolver.timeout = 2
        self.resolver.lifetime = 4
        self.resolver.nameservers = nameservers

        self.backend = Socks5Backend(socks5_proxy_url=socks5_proxy)

    def get_cache(self, domain: str, rdatatype: dns.rdatatype.RdataType=dns.rdatatype.A) -> dns.resolver.Answer | None:
        """Return the cached answer for ``domain`` and ``rdatatype`` (class IN), if any."""
        cache_key = (dns.name.from_text(domain), rdatatype, dns.rdataclass.IN)
        return self.resolver.cache.get(cache_key)

    async def resolve(self, domain: str, semaphore: asyncio.Semaphore, rdatatype: dns.rdatatype.RdataType=dns.rdatatype.A) -> dns.resolver.Answer | None:
        """Resolve ``domain`` over TCP through the proxy backend.

        Up to three attempts are made when the resolver lifetime expires, with
        a one second pause between attempts. Any other error stops the
        resolution immediately.

        :param domain: The name to resolve.
        :param semaphore: Held for the whole resolution, retries included.
        :param rdatatype: Record type to query, A by default.
        :return: The answer, or None if the resolution failed.
        """
        attempts = 3
        async with semaphore:
            # ``remaining`` is the number of attempts left after the current one
            for remaining in range(attempts - 1, -1, -1):
                try:
                    return await self.resolver.resolve(domain, rdatatype,
                                                       tcp=True, backend=self.backend)
                except dns.resolver.LifetimeTimeout:
                    # Retry a few times on timeout, it happens.
                    if remaining:
                        self.logger.debug(f"[Socks5] Timeout resolving {domain}, retrying.")
                        await asyncio.sleep(1)
                        continue
                    self.logger.info(f"[Socks5] Timeout resolving {domain}.")
                except Exception as e:
                    self.logger.info(f"[Socks5] Error resolving {domain}: {e}")
                    break
        return None
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.28.6"
|
3
|
+
version = "1.29.1"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = [
|
6
6
|
{name="Raphaël Vinot", email= "raphael.vinot@circl.lu"}
|
@@ -12,18 +12,20 @@ requires-python = ">=3.9"
|
|
12
12
|
dynamic = [ "classifiers" ]
|
13
13
|
|
14
14
|
dependencies = [
|
15
|
-
"playwright (>=1.
|
15
|
+
"playwright (>=1.52.0)",
|
16
16
|
"dateparser (>=1.2.1)",
|
17
|
-
"beautifulsoup4[charset-normalizer,lxml] (>=4.13.
|
17
|
+
"beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)",
|
18
18
|
"w3lib (>=2.3.1)",
|
19
19
|
"tzdata (>=2025.2)",
|
20
20
|
"playwright-stealth (>=1.0.6)",
|
21
|
-
"setuptools (>=
|
21
|
+
"setuptools (>=80.0.1)",
|
22
22
|
"puremagic (>=1.28)",
|
23
23
|
"async-timeout (>=5.0.1) ; python_version < \"3.11\"",
|
24
|
-
"aiohttp[speedups] (>=3.11.
|
24
|
+
"aiohttp[speedups] (>=3.11.18)",
|
25
25
|
"aiohttp-socks (>=0.10.1)",
|
26
|
-
"typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\""
|
26
|
+
"typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\"",
|
27
|
+
"dnspython (>=2.7.0,<3.0.0)",
|
28
|
+
"python-socks (>=2.7.1,<3.0.0)"
|
27
29
|
]
|
28
30
|
|
29
31
|
[project.urls]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|