PlaywrightCapture 1.31.3__py3-none-any.whl → 1.31.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -118,7 +118,8 @@ class Capture():
118
118
  proxy: str | dict[str, str] | None=None,
119
119
  socks5_dns_resolver: str | list[str] | None=None,
120
120
  general_timeout_in_sec: int | None=None, loglevel: str | int='INFO',
121
- uuid: str | None=None, headless: bool=True):
121
+ uuid: str | None=None, headless: bool=True,
122
+ *, init_script: str | None=None):
122
123
  """Captures a page with Playwright.
123
124
 
124
125
  :param browser: The browser to use for the capture.
@@ -129,6 +130,7 @@ class Capture():
129
130
  :param loglevel: Python loglevel
130
131
  :param uuid: The UUID of the capture.
131
132
  :param headless: Whether to run the browser in headless mode. WARNING: disabling headless mode requires running in a graphical environment.
133
+ :param init_script: An optional JavaScript snippet that will be executed on each page. See https://playwright.dev/python/docs/api/class-browsercontext#browser-context-add-init-script
132
134
  """
133
135
  master_logger = logging.getLogger('playwrightcapture')
134
136
  master_logger.setLevel(loglevel)
@@ -179,6 +181,8 @@ class Capture():
179
181
  self._color_scheme: Literal['dark', 'light', 'no-preference', 'null'] | None = None
180
182
  self._java_script_enabled = True
181
183
 
184
+ self._init_script = init_script
185
+
182
186
  def __prepare_proxy_playwright(self, proxy: str) -> ProxySettings:
183
187
  splitted = urlsplit(proxy)
184
188
  if splitted.username and splitted.password:
@@ -460,6 +464,9 @@ class Capture():
460
464
  )
461
465
  self.context.set_default_timeout(self._capture_timeout * 1000)
462
466
 
467
+ if self._init_script:
468
+ await self.context.add_init_script(script=self._init_script)
469
+
463
470
  # Quick and dirty: get a platform from the UA so it's not always Win32
464
471
  # This is deprecated and not very important.
465
472
  # Ref: https://developer.mozilla.org/en-US/docs/Web/API/Navigator/platform
@@ -491,7 +498,7 @@ class Capture():
491
498
  # 'navigator_vendor': False, # It's set correctly by playwright
492
499
  'navigator_webdriver': True,
493
500
  # 'sec_ch_ua': True,
494
- # 'webgl_vendor': False, # It's set correctly by playwright
501
+ 'webgl_vendor': True, # It's not set correctly by playwright in headless mode.
495
502
 
496
503
  # ## Overwrite the default values
497
504
  'navigator_languages_override': None,
@@ -501,10 +508,10 @@ class Capture():
501
508
  # 'sec_ch_ua_override': Stealth._get_greased_chrome_sec_ua_ch(ua),
502
509
  # 'webgl_renderer_override': None,
503
510
  # 'webgl_vendor_override': None,
504
- })
505
511
 
506
- # stealth.hook_playwright_context(self.playwright)
507
- await stealth.apply_stealth_async(self.context)
512
+ # For testing
513
+ # 'script_logging': True,
514
+ })
508
515
 
509
516
  if self.cookies:
510
517
  try:
@@ -552,6 +559,9 @@ class Capture():
552
559
  elif self.browser_name == 'chromium':
553
560
  await self.context.grant_permissions(chromium_permissions)
554
561
 
562
+ # Apply stealth
563
+ await stealth.apply_stealth_async(self.context)
564
+
555
565
  async def __cloudflare_bypass_attempt(self, page: Page) -> None:
556
566
  # This method aims to bypass cloudflare checks, but it mostly doesn't work.
557
567
  max_tries = 5
@@ -1354,7 +1364,11 @@ class Capture():
1354
1364
  return href
1355
1365
 
1356
1366
  urls: set[str] = set()
1357
- soup = BeautifulSoup(rendered_html, "lxml")
1367
+ try:
1368
+ soup = BeautifulSoup(rendered_html, "lxml")
1369
+ except Exception as e:
1370
+ self.logger.info(f'Unable to parse HTML: {e}')
1371
+ soup = BeautifulSoup(rendered_html, "html.parser")
1358
1372
 
1359
1373
  rendered_hostname = urlparse(rendered_url).hostname
1360
1374
  # The simple ones: the links.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: PlaywrightCapture
3
- Version: 1.31.3
3
+ Version: 1.31.5
4
4
  Summary: A simple library to capture websites using playwright
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
@@ -20,14 +20,14 @@ Classifier: Topic :: Security
20
20
  Provides-Extra: recaptcha
21
21
  Requires-Dist: SpeechRecognition (>=3.14.3) ; extra == "recaptcha"
22
22
  Requires-Dist: aiohttp-socks (>=0.10.1)
23
- Requires-Dist: aiohttp[speedups] (>=3.12.13)
23
+ Requires-Dist: aiohttp[speedups] (>=3.12.14)
24
24
  Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
25
25
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.13.4)
26
26
  Requires-Dist: dateparser (>=1.2.2)
27
27
  Requires-Dist: dnspython (>=2.7.0,<3.0.0)
28
- Requires-Dist: playwright (>=1.53.0)
28
+ Requires-Dist: playwright (>=1.54.0)
29
29
  Requires-Dist: playwright-stealth (>=2)
30
- Requires-Dist: puremagic (>=1.29)
30
+ Requires-Dist: puremagic (>=1.30)
31
31
  Requires-Dist: pydub (>=0.25.1) ; (python_version < "3.10") and (extra == "recaptcha")
32
32
  Requires-Dist: pydub-ng (>=0.2.0) ; (python_version >= "3.10") and (extra == "recaptcha")
33
33
  Requires-Dist: python-socks (>=2.7.1,<3.0.0)
@@ -1,10 +1,10 @@
1
1
  playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
- playwrightcapture/capture.py,sha256=p5N16ymdpSXDu738YGtn_KjYChzcqtkqWMyTNQfbhKU,86562
2
+ playwrightcapture/capture.py,sha256=CNV-TbnUj8HXFaEmxIxKTeuXsKkVFsA7bCdNJKCTvnI,87166
3
3
  playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
4
  playwrightcapture/helpers.py,sha256=Xqs09zHhzAWnpBtQ0A9YAxg80P3Lj7aBj5M2WuEr0so,1843
5
5
  playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  playwrightcapture/socks5dnslookup.py,sha256=ZpOf8tgsRQZi-WDcn9JbbG1bKz9DSfK_jz1l53UI1Ho,4058
7
- playwrightcapture-1.31.3.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
8
- playwrightcapture-1.31.3.dist-info/METADATA,sha256=p6R6YfXdVpPfpNMBIljl4v_Lu30N6ZdZ-RC3UGa83OY,3285
9
- playwrightcapture-1.31.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
- playwrightcapture-1.31.3.dist-info/RECORD,,
7
+ playwrightcapture-1.31.5.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
8
+ playwrightcapture-1.31.5.dist-info/METADATA,sha256=44oFmm1jYLUz1vPX2HJgCe3CJSO-BqMDzdIZfGBNtnw,3285
9
+ playwrightcapture-1.31.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
+ playwrightcapture-1.31.5.dist-info/RECORD,,