PlaywrightCapture 1.31.3__tar.gz → 1.31.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.31.3 → playwrightcapture-1.31.5}/PKG-INFO +4 -4
- {playwrightcapture-1.31.3 → playwrightcapture-1.31.5}/playwrightcapture/capture.py +20 -6
- {playwrightcapture-1.31.3 → playwrightcapture-1.31.5}/pyproject.toml +6 -6
- {playwrightcapture-1.31.3 → playwrightcapture-1.31.5}/LICENSE +0 -0
- {playwrightcapture-1.31.3 → playwrightcapture-1.31.5}/README.md +0 -0
- {playwrightcapture-1.31.3 → playwrightcapture-1.31.5}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.31.3 → playwrightcapture-1.31.5}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.31.3 → playwrightcapture-1.31.5}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.31.3 → playwrightcapture-1.31.5}/playwrightcapture/py.typed +0 -0
- {playwrightcapture-1.31.3 → playwrightcapture-1.31.5}/playwrightcapture/socks5dnslookup.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.31.3
|
3
|
+
Version: 1.31.5
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
License: BSD-3-Clause
|
6
6
|
Author: Raphaël Vinot
|
@@ -20,14 +20,14 @@ Classifier: Topic :: Security
|
|
20
20
|
Provides-Extra: recaptcha
|
21
21
|
Requires-Dist: SpeechRecognition (>=3.14.3) ; extra == "recaptcha"
|
22
22
|
Requires-Dist: aiohttp-socks (>=0.10.1)
|
23
|
-
Requires-Dist: aiohttp[speedups] (>=3.12.
|
23
|
+
Requires-Dist: aiohttp[speedups] (>=3.12.14)
|
24
24
|
Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
|
25
25
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)
|
26
26
|
Requires-Dist: dateparser (>=1.2.2)
|
27
27
|
Requires-Dist: dnspython (>=2.7.0,<3.0.0)
|
28
|
-
Requires-Dist: playwright (>=1.
|
28
|
+
Requires-Dist: playwright (>=1.54.0)
|
29
29
|
Requires-Dist: playwright-stealth (>=2)
|
30
|
-
Requires-Dist: puremagic (>=1.
|
30
|
+
Requires-Dist: puremagic (>=1.30)
|
31
31
|
Requires-Dist: pydub (>=0.25.1) ; (python_version < "3.10") and (extra == "recaptcha")
|
32
32
|
Requires-Dist: pydub-ng (>=0.2.0) ; (python_version >= "3.10") and (extra == "recaptcha")
|
33
33
|
Requires-Dist: python-socks (>=2.7.1,<3.0.0)
|
@@ -118,7 +118,8 @@ class Capture():
|
|
118
118
|
proxy: str | dict[str, str] | None=None,
|
119
119
|
socks5_dns_resolver: str | list[str] | None=None,
|
120
120
|
general_timeout_in_sec: int | None=None, loglevel: str | int='INFO',
|
121
|
-
uuid: str | None=None, headless: bool=True):
|
121
|
+
uuid: str | None=None, headless: bool=True,
|
122
|
+
*, init_script: str | None=None):
|
122
123
|
"""Captures a page with Playwright.
|
123
124
|
|
124
125
|
:param browser: The browser to use for the capture.
|
@@ -129,6 +130,7 @@ class Capture():
|
|
129
130
|
:param loglevel: Python loglevel
|
130
131
|
:param uuid: The UUID of the capture.
|
131
132
|
:param headless: Whether to run the browser in headless mode. WARNING: requires to run in a graphical environment.
|
133
|
+
:param init_script: An optional JavaScript snippet that will be executed on each page - See https://playwright.dev/python/docs/api/class-browsercontext#browser-context-add-init-script
|
132
134
|
"""
|
133
135
|
master_logger = logging.getLogger('playwrightcapture')
|
134
136
|
master_logger.setLevel(loglevel)
|
@@ -179,6 +181,8 @@ class Capture():
|
|
179
181
|
self._color_scheme: Literal['dark', 'light', 'no-preference', 'null'] | None = None
|
180
182
|
self._java_script_enabled = True
|
181
183
|
|
184
|
+
self._init_script = init_script
|
185
|
+
|
182
186
|
def __prepare_proxy_playwright(self, proxy: str) -> ProxySettings:
|
183
187
|
splitted = urlsplit(proxy)
|
184
188
|
if splitted.username and splitted.password:
|
@@ -460,6 +464,9 @@ class Capture():
|
|
460
464
|
)
|
461
465
|
self.context.set_default_timeout(self._capture_timeout * 1000)
|
462
466
|
|
467
|
+
if self._init_script:
|
468
|
+
await self.context.add_init_script(script=self._init_script)
|
469
|
+
|
463
470
|
# very quick and dirty get a platform from the UA so it's not always Win32
|
464
471
|
# This is deprecated and not very important.
|
465
472
|
# Ref: https://developer.mozilla.org/en-US/docs/Web/API/Navigator/platform
|
@@ -491,7 +498,7 @@ class Capture():
|
|
491
498
|
# 'navigator_vendor': False, # It's set correctly by playwright
|
492
499
|
'navigator_webdriver': True,
|
493
500
|
# 'sec_ch_ua': True,
|
494
|
-
|
501
|
+
'webgl_vendor': True, # It's not set correctly by playwright in headless mode.
|
495
502
|
|
496
503
|
# ## Overwrite the default values
|
497
504
|
'navigator_languages_override': None,
|
@@ -501,10 +508,10 @@ class Capture():
|
|
501
508
|
# 'sec_ch_ua_override': Stealth._get_greased_chrome_sec_ua_ch(ua),
|
502
509
|
# 'webgl_renderer_override': None,
|
503
510
|
# 'webgl_vendor_override': None,
|
504
|
-
})
|
505
511
|
|
506
|
-
|
507
|
-
|
512
|
+
# For testing
|
513
|
+
# 'script_logging': True,
|
514
|
+
})
|
508
515
|
|
509
516
|
if self.cookies:
|
510
517
|
try:
|
@@ -552,6 +559,9 @@ class Capture():
|
|
552
559
|
elif self.browser_name == 'chromium':
|
553
560
|
await self.context.grant_permissions(chromium_permissions)
|
554
561
|
|
562
|
+
# Apply stealth
|
563
|
+
await stealth.apply_stealth_async(self.context)
|
564
|
+
|
555
565
|
async def __cloudflare_bypass_attempt(self, page: Page) -> None:
|
556
566
|
# This method aims to bypass cloudflare checks, but it mostly doesn't work.
|
557
567
|
max_tries = 5
|
@@ -1354,7 +1364,11 @@ class Capture():
|
|
1354
1364
|
return href
|
1355
1365
|
|
1356
1366
|
urls: set[str] = set()
|
1357
|
-
|
1367
|
+
try:
|
1368
|
+
soup = BeautifulSoup(rendered_html, "lxml")
|
1369
|
+
except Exception as e:
|
1370
|
+
self.logger.info(f'Unable to parse HTML: {e}')
|
1371
|
+
soup = BeautifulSoup(rendered_html, "html.parser")
|
1358
1372
|
|
1359
1373
|
rendered_hostname = urlparse(rendered_url).hostname
|
1360
1374
|
# The simple ones: the links.
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.31.3"
|
3
|
+
version = "1.31.5"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = [
|
6
6
|
{name="Raphaël Vinot", email= "raphael.vinot@circl.lu"}
|
@@ -12,16 +12,16 @@ requires-python = ">=3.9,<4.0"
|
|
12
12
|
dynamic = [ "classifiers" ]
|
13
13
|
|
14
14
|
dependencies = [
|
15
|
-
"playwright (>=1.
|
15
|
+
"playwright (>=1.54.0)",
|
16
16
|
"dateparser (>=1.2.2)",
|
17
17
|
"beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)",
|
18
18
|
"w3lib (>=2.3.1)",
|
19
19
|
"tzdata (>=2025.2)",
|
20
20
|
"playwright-stealth (>=2)",
|
21
21
|
"setuptools (>=80.9.0)",
|
22
|
-
"puremagic (>=1.
|
22
|
+
"puremagic (>=1.30)",
|
23
23
|
"async-timeout (>=5.0.1) ; python_version < \"3.11\"",
|
24
|
-
"aiohttp[speedups] (>=3.12.
|
24
|
+
"aiohttp[speedups] (>=3.12.14)",
|
25
25
|
"aiohttp-socks (>=0.10.1)",
|
26
26
|
"typing-extensions (>=4.12.2,<5.0.0) ; python_version < \"3.12\"",
|
27
27
|
"dnspython (>=2.7.0,<3.0.0)",
|
@@ -51,8 +51,8 @@ recaptcha = [
|
|
51
51
|
[tool.poetry.group.dev.dependencies]
|
52
52
|
types-beautifulsoup4 = "^4.12.0.20250516"
|
53
53
|
pytest = "^8.4.1"
|
54
|
-
mypy = "^1.
|
55
|
-
types-dateparser = "^1.2.
|
54
|
+
mypy = "^1.17.0"
|
55
|
+
types-dateparser = "^1.2.2.20250627"
|
56
56
|
types-pytz = "^2025.2.0.20250516"
|
57
57
|
|
58
58
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|