PlaywrightCapture 1.31.0__py3-none-any.whl → 1.31.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- playwrightcapture/capture.py +54 -39
- {playwrightcapture-1.31.0.dist-info → playwrightcapture-1.31.2.dist-info}/METADATA +6 -6
- {playwrightcapture-1.31.0.dist-info → playwrightcapture-1.31.2.dist-info}/RECORD +5 -5
- {playwrightcapture-1.31.0.dist-info → playwrightcapture-1.31.2.dist-info}/LICENSE +0 -0
- {playwrightcapture-1.31.0.dist-info → playwrightcapture-1.31.2.dist-info}/WHEEL +0 -0
playwrightcapture/capture.py
CHANGED
@@ -13,12 +13,11 @@ import sys
|
|
13
13
|
import time
|
14
14
|
|
15
15
|
from base64 import b64decode
|
16
|
-
from dataclasses import dataclass
|
17
16
|
from io import BytesIO
|
18
17
|
from logging import LoggerAdapter, Logger
|
19
18
|
from tempfile import NamedTemporaryFile
|
20
19
|
from typing import Any, Literal, TYPE_CHECKING
|
21
|
-
from collections.abc import MutableMapping
|
20
|
+
from collections.abc import MutableMapping
|
22
21
|
from urllib.parse import urlparse, unquote, urljoin, urlsplit, urlunsplit
|
23
22
|
from zipfile import ZipFile
|
24
23
|
|
@@ -31,7 +30,7 @@ from charset_normalizer import from_bytes
|
|
31
30
|
from playwright._impl._errors import TargetClosedError
|
32
31
|
from playwright.async_api import async_playwright, Frame, Error, Page, Download, Request
|
33
32
|
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
34
|
-
from playwright_stealth import stealth_async, StealthConfig
|
33
|
+
from playwright_stealth import Stealth, ALL_EVASIONS_DISABLED_KWARGS # type: ignore[attr-defined]
|
35
34
|
from puremagic import PureError, from_string
|
36
35
|
from w3lib.html import strip_html5_whitespace
|
37
36
|
from w3lib.url import canonicalize_url, safe_url_string
|
@@ -101,42 +100,11 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
|
101
100
|
|
102
101
|
|
103
102
|
# good test pages:
|
104
|
-
# https://
|
103
|
+
# https://kaliiiiiiiiii.github.io/brotector/?crash=false
|
104
|
+
# https://www.browserscan.net/bot-detection
|
105
|
+
# https://fingerprint.com/products/bot-detection/
|
105
106
|
# https://fingerprintjs.github.io/BotD/main/
|
106
107
|
|
107
|
-
@dataclass
|
108
|
-
class PCStealthConfig(StealthConfig):
|
109
|
-
|
110
|
-
@property
|
111
|
-
def enabled_scripts(self) -> Iterator[str]:
|
112
|
-
self.chrome_app = True
|
113
|
-
self.chrome_csi = True
|
114
|
-
self.chrome_runtime = True
|
115
|
-
self.chrome_load_times = True
|
116
|
-
self.navigator_plugins = True
|
117
|
-
self.hairline = True
|
118
|
-
self.iframe_content_window = True
|
119
|
-
self.media_codecs = True
|
120
|
-
|
121
|
-
# permissions are handled directly in playwright
|
122
|
-
self.navigator_permissions = False
|
123
|
-
# Platform is correct now
|
124
|
-
self.navigator_platform = False
|
125
|
-
# probably useless, but it will fallback to 4 regardless
|
126
|
-
self.navigator_hardware_concurrency = 4
|
127
|
-
# Webgl vendor is correct now
|
128
|
-
self.webgl_vendor = False
|
129
|
-
# Set by the viewport
|
130
|
-
self.outerdimensions = False
|
131
|
-
|
132
|
-
# Not working with Playwright 1.45+
|
133
|
-
self.navigator_languages = False # Causes issue
|
134
|
-
self.navigator_user_agent = False # Causes issues
|
135
|
-
self.navigator_vendor = False # Causes issues
|
136
|
-
|
137
|
-
yield from super().enabled_scripts
|
138
|
-
|
139
|
-
|
140
108
|
class Capture():
|
141
109
|
|
142
110
|
_browsers: list[BROWSER] = ['chromium', 'firefox', 'webkit']
|
@@ -492,6 +460,52 @@ class Capture():
|
|
492
460
|
)
|
493
461
|
self.context.set_default_timeout(self._capture_timeout * 1000)
|
494
462
|
|
463
|
+
# Quick and dirty: derive a platform from the UA so it's not always Win32
|
464
|
+
# This is deprecated and not very important.
|
465
|
+
# Ref: https://developer.mozilla.org/en-US/docs/Web/API/Navigator/platform
|
466
|
+
if any(x in ua.lower() for x in ['windows', 'win32', 'win64']):
|
467
|
+
_platform = 'Win32'
|
468
|
+
elif any(x in ua.lower() for x in ['macintosh', 'mac os x', 'macos']):
|
469
|
+
_platform = 'MacIntel'
|
470
|
+
elif any(x in ua.lower() for x in ['linux', 'ubuntu']):
|
471
|
+
_platform = 'Linux x86_64'
|
472
|
+
else:
|
473
|
+
_platform = 'Win32'
|
474
|
+
|
475
|
+
# Enable stealth mode
|
476
|
+
stealth = Stealth(
|
477
|
+
**{**ALL_EVASIONS_DISABLED_KWARGS, # type: ignore[arg-type]
|
478
|
+
'chrome_app': True,
|
479
|
+
'chrome_csi': True,
|
480
|
+
'chrome_load_times': True,
|
481
|
+
'chrome_runtime': True,
|
482
|
+
'hairline': True,
|
483
|
+
'iframe_content_window': True,
|
484
|
+
'media_codecs': True,
|
485
|
+
# 'navigator_hardware_concurrency': False,
|
486
|
+
# 'navigator_languages': False, # handled by playwright directly
|
487
|
+
# 'navigator_permissions': False, # handled by playwright directly
|
488
|
+
'navigator_platform': True,
|
489
|
+
'navigator_plugins': True,
|
490
|
+
# 'navigator_user_agent': True, # Set by playwright
|
491
|
+
# 'navigator_vendor': False, # It's set correctly by playwright
|
492
|
+
'navigator_webdriver': True,
|
493
|
+
# 'sec_ch_ua': True,
|
494
|
+
# 'webgl_vendor': False, # It's set correctly by playwright
|
495
|
+
|
496
|
+
# ## Overwrite the default values
|
497
|
+
'navigator_languages_override': None,
|
498
|
+
'navigator_platform_override': _platform,
|
499
|
+
# 'navigator_user_agent_override': ua, # Already Set in playwright context
|
500
|
+
# 'navigator_vendor_override': None,
|
501
|
+
# 'sec_ch_ua_override': Stealth._get_greased_chrome_sec_ua_ch(ua),
|
502
|
+
# 'webgl_renderer_override': None,
|
503
|
+
# 'webgl_vendor_override': None,
|
504
|
+
})
|
505
|
+
|
506
|
+
# stealth.hook_playwright_context(self.playwright)
|
507
|
+
await stealth.apply_stealth_async(self.context)
|
508
|
+
|
495
509
|
if self.cookies:
|
496
510
|
try:
|
497
511
|
await self.context.add_cookies(self.cookies)
|
@@ -1002,6 +1016,8 @@ class Capture():
|
|
1002
1016
|
capturing_sub = False
|
1003
1017
|
try:
|
1004
1018
|
page = await self.context.new_page()
|
1019
|
+
# client = await page.context.new_cdp_session(page)
|
1020
|
+
# await client.detach()
|
1005
1021
|
except Error as e:
|
1006
1022
|
self.logger.warning(f'Unable to create new page, the context is in a broken state: {e}')
|
1007
1023
|
self.should_retry = True
|
@@ -1021,8 +1037,6 @@ class Capture():
|
|
1021
1037
|
await self.__dialog_clickthrough(page)
|
1022
1038
|
await self.__dialog_tarteaucitron_clickthrough(page)
|
1023
1039
|
|
1024
|
-
await stealth_async(page, PCStealthConfig())
|
1025
|
-
|
1026
1040
|
page.set_default_timeout((self._capture_timeout - 2) * 1000)
|
1027
1041
|
# trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
|
1028
1042
|
page.on("requestfinished", store_request)
|
@@ -1547,6 +1561,7 @@ class Capture():
|
|
1547
1561
|
'net::ERR_INVALID_REDIRECT',
|
1548
1562
|
'net::ERR_NAME_NOT_RESOLVED',
|
1549
1563
|
'net::ERR_NETWORK_ACCESS_DENIED',
|
1564
|
+
'net::ERR_PROXY_CONNECTION_FAILED',
|
1550
1565
|
'net::ERR_QUIC_PROTOCOL_ERROR',
|
1551
1566
|
'net::ERR_SOCKET_NOT_CONNECTED',
|
1552
1567
|
'net::ERR_SOCKS_CONNECTION_FAILED',
|
@@ -1,11 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.31.0
|
3
|
+
Version: 1.31.2
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
License: BSD-3-Clause
|
6
6
|
Author: Raphaël Vinot
|
7
7
|
Author-email: raphael.vinot@circl.lu
|
8
|
-
Requires-Python: >=3.9
|
8
|
+
Requires-Python: >=3.9,<4.0
|
9
9
|
Classifier: Intended Audience :: Science/Research
|
10
10
|
Classifier: Intended Audience :: Telecommunications Industry
|
11
11
|
Classifier: License :: OSI Approved :: BSD License
|
@@ -20,13 +20,13 @@ Classifier: Topic :: Security
|
|
20
20
|
Provides-Extra: recaptcha
|
21
21
|
Requires-Dist: SpeechRecognition (>=3.14.3) ; extra == "recaptcha"
|
22
22
|
Requires-Dist: aiohttp-socks (>=0.10.1)
|
23
|
-
Requires-Dist: aiohttp[speedups] (>=3.12.
|
23
|
+
Requires-Dist: aiohttp[speedups] (>=3.12.13)
|
24
24
|
Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
|
25
25
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)
|
26
|
-
Requires-Dist: dateparser (>=1.2.
|
26
|
+
Requires-Dist: dateparser (>=1.2.2)
|
27
27
|
Requires-Dist: dnspython (>=2.7.0,<3.0.0)
|
28
|
-
Requires-Dist: playwright (>=1.
|
29
|
-
Requires-Dist: playwright-stealth (>=
|
28
|
+
Requires-Dist: playwright (>=1.53.0)
|
29
|
+
Requires-Dist: playwright-stealth (>=2)
|
30
30
|
Requires-Dist: puremagic (>=1.29)
|
31
31
|
Requires-Dist: pydub (>=0.25.1) ; (python_version < "3.10") and (extra == "recaptcha")
|
32
32
|
Requires-Dist: pydub-ng (>=0.2.0) ; (python_version >= "3.10") and (extra == "recaptcha")
|
@@ -1,10 +1,10 @@
|
|
1
1
|
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
-
playwrightcapture/capture.py,sha256=
|
2
|
+
playwrightcapture/capture.py,sha256=lXcBWWEWRmQem6Rs_yj51PB74JFlRL4k1kggFCMxFTA,86586
|
3
3
|
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
4
|
playwrightcapture/helpers.py,sha256=Xqs09zHhzAWnpBtQ0A9YAxg80P3Lj7aBj5M2WuEr0so,1843
|
5
5
|
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
6
|
playwrightcapture/socks5dnslookup.py,sha256=ZpOf8tgsRQZi-WDcn9JbbG1bKz9DSfK_jz1l53UI1Ho,4058
|
7
|
-
playwrightcapture-1.31.0.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
8
|
-
playwrightcapture-1.31.
|
9
|
-
playwrightcapture-1.31.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
10
|
-
playwrightcapture-1.31.0.dist-info/RECORD,,
|
7
|
+
playwrightcapture-1.31.2.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
8
|
+
playwrightcapture-1.31.2.dist-info/METADATA,sha256=6JDSFOTJaWk1oER6gis-AbR-vKAaX3vXwOPa9sV-bFA,3285
|
9
|
+
playwrightcapture-1.31.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
10
|
+
playwrightcapture-1.31.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|