PlaywrightCapture 1.31.0__py3-none-any.whl → 1.31.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,12 +13,11 @@ import sys
13
13
  import time
14
14
 
15
15
  from base64 import b64decode
16
- from dataclasses import dataclass
17
16
  from io import BytesIO
18
17
  from logging import LoggerAdapter, Logger
19
18
  from tempfile import NamedTemporaryFile
20
19
  from typing import Any, Literal, TYPE_CHECKING
21
- from collections.abc import MutableMapping, Iterator
20
+ from collections.abc import MutableMapping
22
21
  from urllib.parse import urlparse, unquote, urljoin, urlsplit, urlunsplit
23
22
  from zipfile import ZipFile
24
23
 
@@ -31,7 +30,7 @@ from charset_normalizer import from_bytes
31
30
  from playwright._impl._errors import TargetClosedError
32
31
  from playwright.async_api import async_playwright, Frame, Error, Page, Download, Request
33
32
  from playwright.async_api import TimeoutError as PlaywrightTimeoutError
34
- from playwright_stealth import stealth_async, StealthConfig # type: ignore[attr-defined]
33
+ from playwright_stealth import Stealth, ALL_EVASIONS_DISABLED_KWARGS # type: ignore[attr-defined]
35
34
  from puremagic import PureError, from_string
36
35
  from w3lib.html import strip_html5_whitespace
37
36
  from w3lib.url import canonicalize_url, safe_url_string
@@ -101,42 +100,11 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
101
100
 
102
101
 
103
102
  # good test pages:
104
- # https://bot.incolumitas.com/
103
+ # https://kaliiiiiiiiii.github.io/brotector/?crash=false
104
+ # https://www.browserscan.net/bot-detection
105
+ # https://fingerprint.com/products/bot-detection/
105
106
  # https://fingerprintjs.github.io/BotD/main/
106
107
 
107
- @dataclass
108
- class PCStealthConfig(StealthConfig):
109
-
110
- @property
111
- def enabled_scripts(self) -> Iterator[str]:
112
- self.chrome_app = True
113
- self.chrome_csi = True
114
- self.chrome_runtime = True
115
- self.chrome_load_times = True
116
- self.navigator_plugins = True
117
- self.hairline = True
118
- self.iframe_content_window = True
119
- self.media_codecs = True
120
-
121
- # permissions are handled directly in playwright
122
- self.navigator_permissions = False
123
- # Platform is correct now
124
- self.navigator_platform = False
125
- # probably useless, but it will fallback to 4 regardless
126
- self.navigator_hardware_concurrency = 4
127
- # Webgl vendor is correct now
128
- self.webgl_vendor = False
129
- # Set by the viewport
130
- self.outerdimensions = False
131
-
132
- # Not working with Playwright 1.45+
133
- self.navigator_languages = False # Causes issue
134
- self.navigator_user_agent = False # Causes issues
135
- self.navigator_vendor = False # Causes issues
136
-
137
- yield from super().enabled_scripts
138
-
139
-
140
108
  class Capture():
141
109
 
142
110
  _browsers: list[BROWSER] = ['chromium', 'firefox', 'webkit']
@@ -492,6 +460,52 @@ class Capture():
492
460
  )
493
461
  self.context.set_default_timeout(self._capture_timeout * 1000)
494
462
 
463
+ # Very quick and dirty: derive a platform from the UA so it's not always Win32
464
+ # navigator.platform is deprecated and not very important.
465
+ # Ref: https://developer.mozilla.org/en-US/docs/Web/API/Navigator/platform
466
+ if any(x in ua.lower() for x in ['windows', 'win32', 'win64']):
467
+ _platform = 'Win32'
468
+ elif any(x in ua.lower() for x in ['macintosh', 'mac os x', 'macos']):
469
+ _platform = 'MacIntel'
470
+ elif any(x in ua.lower() for x in ['linux', 'ubuntu']):
471
+ _platform = 'Linux x86_64'
472
+ else:
473
+ _platform = 'Win32'
474
+
475
+ # Enable stealth mode
476
+ stealth = Stealth(
477
+ **{**ALL_EVASIONS_DISABLED_KWARGS, # type: ignore[arg-type]
478
+ 'chrome_app': True,
479
+ 'chrome_csi': True,
480
+ 'chrome_load_times': True,
481
+ 'chrome_runtime': True,
482
+ 'hairline': True,
483
+ 'iframe_content_window': True,
484
+ 'media_codecs': True,
485
+ # 'navigator_hardware_concurrency': False,
486
+ # 'navigator_languages': False, # handled by playwright directly
487
+ # 'navigator_permissions': False, # handled by playwright directly
488
+ 'navigator_platform': True,
489
+ 'navigator_plugins': True,
490
+ # 'navigator_user_agent': True, # Set by playwright
491
+ # 'navigator_vendor': False, # It's set correctly by playwright
492
+ 'navigator_webdriver': True,
493
+ # 'sec_ch_ua': True,
494
+ # 'webgl_vendor': False, # It's set correctly by playwright
495
+
496
+ # ## Overwrite the default values
497
+ 'navigator_languages_override': None,
498
+ 'navigator_platform_override': _platform,
499
+ # 'navigator_user_agent_override': ua, # Already Set in playwright context
500
+ # 'navigator_vendor_override': None,
501
+ # 'sec_ch_ua_override': Stealth._get_greased_chrome_sec_ua_ch(ua),
502
+ # 'webgl_renderer_override': None,
503
+ # 'webgl_vendor_override': None,
504
+ })
505
+
506
+ # stealth.hook_playwright_context(self.playwright)
507
+ await stealth.apply_stealth_async(self.context)
508
+
495
509
  if self.cookies:
496
510
  try:
497
511
  await self.context.add_cookies(self.cookies)
@@ -1002,6 +1016,8 @@ class Capture():
1002
1016
  capturing_sub = False
1003
1017
  try:
1004
1018
  page = await self.context.new_page()
1019
+ # client = await page.context.new_cdp_session(page)
1020
+ # await client.detach()
1005
1021
  except Error as e:
1006
1022
  self.logger.warning(f'Unable to create new page, the context is in a broken state: {e}')
1007
1023
  self.should_retry = True
@@ -1021,8 +1037,6 @@ class Capture():
1021
1037
  await self.__dialog_clickthrough(page)
1022
1038
  await self.__dialog_tarteaucitron_clickthrough(page)
1023
1039
 
1024
- await stealth_async(page, PCStealthConfig())
1025
-
1026
1040
  page.set_default_timeout((self._capture_timeout - 2) * 1000)
1027
1041
  # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
1028
1042
  page.on("requestfinished", store_request)
@@ -1547,6 +1561,7 @@ class Capture():
1547
1561
  'net::ERR_INVALID_REDIRECT',
1548
1562
  'net::ERR_NAME_NOT_RESOLVED',
1549
1563
  'net::ERR_NETWORK_ACCESS_DENIED',
1564
+ 'net::ERR_PROXY_CONNECTION_FAILED',
1550
1565
  'net::ERR_QUIC_PROTOCOL_ERROR',
1551
1566
  'net::ERR_SOCKET_NOT_CONNECTED',
1552
1567
  'net::ERR_SOCKS_CONNECTION_FAILED',
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: PlaywrightCapture
3
- Version: 1.31.0
3
+ Version: 1.31.1
4
4
  Summary: A simple library to capture websites using playwright
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
7
7
  Author-email: raphael.vinot@circl.lu
8
- Requires-Python: >=3.9
8
+ Requires-Python: >=3.9,<4.0
9
9
  Classifier: Intended Audience :: Science/Research
10
10
  Classifier: Intended Audience :: Telecommunications Industry
11
11
  Classifier: License :: OSI Approved :: BSD License
@@ -20,13 +20,13 @@ Classifier: Topic :: Security
20
20
  Provides-Extra: recaptcha
21
21
  Requires-Dist: SpeechRecognition (>=3.14.3) ; extra == "recaptcha"
22
22
  Requires-Dist: aiohttp-socks (>=0.10.1)
23
- Requires-Dist: aiohttp[speedups] (>=3.12.12)
23
+ Requires-Dist: aiohttp[speedups] (>=3.12.13)
24
24
  Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
25
25
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)
26
26
  Requires-Dist: dateparser (>=1.2.1)
27
27
  Requires-Dist: dnspython (>=2.7.0,<3.0.0)
28
28
  Requires-Dist: playwright (>=1.52.0)
29
- Requires-Dist: playwright-stealth (>=1.0.6)
29
+ Requires-Dist: playwright-stealth (>=2)
30
30
  Requires-Dist: puremagic (>=1.29)
31
31
  Requires-Dist: pydub (>=0.25.1) ; (python_version < "3.10") and (extra == "recaptcha")
32
32
  Requires-Dist: pydub-ng (>=0.2.0) ; (python_version >= "3.10") and (extra == "recaptcha")
@@ -1,10 +1,10 @@
1
1
  playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
- playwrightcapture/capture.py,sha256=6taJ5k4Aolpp4Z_YRbbgt9AksWnQPoOeb3GR2x1TmCc,85259
2
+ playwrightcapture/capture.py,sha256=lXcBWWEWRmQem6Rs_yj51PB74JFlRL4k1kggFCMxFTA,86586
3
3
  playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
4
  playwrightcapture/helpers.py,sha256=Xqs09zHhzAWnpBtQ0A9YAxg80P3Lj7aBj5M2WuEr0so,1843
5
5
  playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  playwrightcapture/socks5dnslookup.py,sha256=ZpOf8tgsRQZi-WDcn9JbbG1bKz9DSfK_jz1l53UI1Ho,4058
7
- playwrightcapture-1.31.0.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
8
- playwrightcapture-1.31.0.dist-info/METADATA,sha256=kzY4EECi1UvBriPhS60Oh1plT4XbeH_hsKkXHqwHOZo,3284
9
- playwrightcapture-1.31.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
- playwrightcapture-1.31.0.dist-info/RECORD,,
7
+ playwrightcapture-1.31.1.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
8
+ playwrightcapture-1.31.1.dist-info/METADATA,sha256=9dN2yvPavtQN-iheDOgUT15lxoFvM_MzPPnM_skV3sA,3285
9
+ playwrightcapture-1.31.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
+ playwrightcapture-1.31.1.dist-info/RECORD,,