PlaywrightCapture 1.25.9__py3-none-any.whl → 1.25.11__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -96,28 +96,40 @@ class PlaywrightCaptureLogAdapter(LoggerAdapter): # type: ignore[type-arg]
96
96
  return msg, kwargs
97
97
 
98
98
 
99
+ # good test pages:
100
+ # https://bot.incolumitas.com/
101
+ # https://fingerprintjs.github.io/BotD/main/
102
+
99
103
  @dataclass
100
104
  class PCStealthConfig(StealthConfig): # type: ignore[misc]
101
105
 
102
106
  @property
103
107
  def enabled_scripts(self) -> Generator[str, None, None]:
104
- self.webdriver = True
105
- self.webgl_vendor = True
106
108
  self.chrome_app = True
107
109
  self.chrome_csi = True
108
- self.chrome_load_times = True
109
110
  self.chrome_runtime = True
111
+ self.chrome_load_times = True
112
+ self.navigator_plugins = True
113
+ self.hairline = True
110
114
  self.iframe_content_window = True
111
115
  self.media_codecs = True
116
+
117
+ # permissions are handled directly in playwright
118
+ self.navigator_permissions = False
119
+ # Platform is correct now
120
+ self.navigator_platform = False
121
+ # probably useless, but it will fallback to 4 regardless
112
122
  self.navigator_hardware_concurrency = 4
123
+ # Webgl vendor is correct now
124
+ self.webgl_vendor = False
125
+ # Set by the viewport
126
+ self.outerdimensions = False
127
+
128
+ # Not working with Playwright 1.45+
113
129
  self.navigator_languages = False # Causes issue
114
- self.navigator_permissions = True
115
- self.navigator_platform = True
116
- self.navigator_plugins = True
117
130
  self.navigator_user_agent = False # Causes issues
118
131
  self.navigator_vendor = False # Causes issues
119
- self.outerdimensions = True
120
- self.hairline = True
132
+
121
133
  yield from super().enabled_scripts
122
134
 
123
135
 
@@ -184,7 +196,7 @@ class Capture():
184
196
  self._viewport: ViewportSize | None = None
185
197
  self._user_agent: str = ''
186
198
  self._timezone_id: str = ''
187
- self._locale: str = ''
199
+ self._locale: str = 'en-US'
188
200
  self._color_scheme: Literal['dark', 'light', 'no-preference', 'null'] | None = None
189
201
 
190
202
  def __prepare_proxy_playwright(self, proxy: str) -> ProxySettings:
@@ -462,21 +474,22 @@ class Capture():
462
474
  # NOTE: Which perms are supported by which browsers varies
463
475
  # See https://github.com/microsoft/playwright/issues/16577
464
476
  chromium_permissions = [
465
- 'geolocation',
466
- 'midi',
467
- 'midi-sysex',
468
- 'notifications',
469
- 'camera',
470
- 'microphone',
471
- 'background-sync',
472
- 'ambient-light-sensor',
473
477
  'accelerometer',
474
- 'gyroscope',
475
- 'magnetometer',
476
478
  'accessibility-events',
479
+ 'ambient-light-sensor',
480
+ 'background-sync',
481
+ 'camera',
477
482
  'clipboard-read',
478
483
  'clipboard-write',
479
- 'payment-handler'
484
+ 'geolocation',
485
+ 'gyroscope',
486
+ 'magnetometer',
487
+ 'microphone',
488
+ 'midi-sysex',
489
+ 'midi',
490
+ 'notifications',
491
+ 'payment-handler',
492
+ 'storage-access'
480
493
  ]
481
494
 
482
495
  firefox_permissions = ['geolocation', 'notifications']
@@ -588,6 +601,8 @@ class Capture():
588
601
  elif await page.get_by_test_id("uc-accept-all-button").is_visible():
589
602
  self.logger.info('Consent window found, clicking through.')
590
603
  await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
604
+ elif await page.locator('#axeptio_btn_acceptAll').is_visible():
605
+ await page.locator('#axeptio_btn_acceptAll').click(timeout=2000)
591
606
  else:
592
607
  self.logger.info('Consent window found (dialog), but no button to click through.')
593
608
  await page.add_locator_handler(
@@ -633,6 +648,24 @@ class Capture():
633
648
  )
634
649
  self.logger.info('Piwik handler added')
635
650
 
651
+ async def __frame_consent(self, frame: Frame) -> bool:
652
+ """Search & Click content in iframes. Cannot easily use the locator handler for this without having many many handlers.
653
+ And the iframes don't have a title or a role to easily identify them so we just try with generic locators that vary by language."""
654
+ got_button: bool = False
655
+ if await frame.get_by_label("Alle akzeptieren").is_visible():
656
+ got_button = True
657
+ await frame.get_by_label("Alle akzeptieren").click(timeout=2000)
658
+ elif await frame.get_by_label("Accept & continue").is_visible():
659
+ got_button = True
660
+ await frame.get_by_label("Accept & continue").click(timeout=2000)
661
+ elif await frame.get_by_label("Accepter et continuer").is_visible():
662
+ got_button = True
663
+ await frame.get_by_label("Accepter et continuer").click(timeout=2000)
664
+ elif await frame.get_by_label("Accepteer").is_visible():
665
+ got_button = True
666
+ await frame.get_by_label("Accepteer").click(timeout=2000)
667
+ return got_button
668
+
636
669
  async def capture_page(self, url: str, *, max_depth_capture_time: int,
637
670
  referer: str | None=None,
638
671
  page: Page | None=None, depth: int=0,
@@ -720,7 +753,6 @@ class Capture():
720
753
  await self.__dialog_clickthrough(page)
721
754
 
722
755
  await stealth_async(page, PCStealthConfig())
723
- # await stealth_async(page)
724
756
 
725
757
  page.set_default_timeout((self._capture_timeout - 2) * 1000)
726
758
  # trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
@@ -799,6 +831,8 @@ class Capture():
799
831
  self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
800
832
  except Error as e:
801
833
  self.logger.warning(f'Error while resolving captcha on {url}: {e}')
834
+ except (TimeoutError, asyncio.TimeoutError) as e:
835
+ self.logger.warning(f'[Timeout] Error while resolving captcha on {url}: {e}')
802
836
  except Exception as e:
803
837
  self.logger.exception(f'General error with captcha solving on {url}: {e}')
804
838
  # ======
@@ -897,16 +931,32 @@ class Capture():
897
931
  # self.logger.warning('Unable to move time forward.')
898
932
 
899
933
  self.logger.debug('Done with instrumentation, waiting for network idle.')
934
+ if allow_tracking:
935
+ self.logger.debug('Check iFrames for button')
936
+ for frame in page.frames:
937
+ frame_title = await frame.title()
938
+ self.logger.debug(f'Check button on {frame_title}')
939
+ if await self.__frame_consent(frame):
940
+ self.logger.debug(f'Got button on {frame_title}')
941
+ await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
942
+ self.logger.debug('Done with iFrames.')
943
+
900
944
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
901
945
  await self._safe_wait(page)
946
+
902
947
  self.logger.debug('Done with instrumentation, done with waiting.')
903
948
 
904
949
  if content := await self._failsafe_get_content(page):
905
950
  to_return['html'] = content
906
951
 
907
952
  if 'html' in to_return and to_return['html'] is not None and with_favicon:
908
- to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html'])
909
- got_favicons = True
953
+ try:
954
+ to_return['potential_favicons'] = await self.get_favicons(page.url, to_return['html'])
955
+ got_favicons = True
956
+ except (TimeoutError, asyncio.TimeoutError) as e:
957
+ self.logger.warning(f'[Timeout] Unable to get favicons: {e}')
958
+ except Exception as e:
959
+ self.logger.warning(f'Unable to get favicons: {e}')
910
960
 
911
961
  to_return['last_redirected_url'] = page.url
912
962
  to_return['png'] = await self._failsafe_get_screenshot(page)
@@ -1209,7 +1259,8 @@ class Capture():
1209
1259
  if self.proxy and self.proxy.get('server'):
1210
1260
  connector = ProxyConnector.from_url(self.proxy['server'])
1211
1261
 
1212
- async with aiohttp.ClientSession(connector=connector) as session:
1262
+ timeout = aiohttp.ClientTimeout(total=10)
1263
+ async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
1213
1264
  while True:
1214
1265
  try:
1215
1266
  href = await main_frame.get_by_role("link", name="Alternatively, download audio as MP3").get_attribute("href")
@@ -1219,7 +1270,7 @@ class Capture():
1219
1270
  if not href:
1220
1271
  self.logger.warning('Unable to find download link for captcha.')
1221
1272
  return False
1222
- async with session.get(href, timeout=10, ssl=False) as response:
1273
+ async with session.get(href, ssl=False) as response:
1223
1274
  response.raise_for_status()
1224
1275
  mp3_content = await response.read()
1225
1276
  with NamedTemporaryFile() as mp3_file, NamedTemporaryFile() as wav_file:
@@ -1417,7 +1468,8 @@ class Capture():
1417
1468
  return set()
1418
1469
  to_fetch, to_return = extracted_favicons
1419
1470
  to_fetch.add('/favicon.ico')
1420
- async with aiohttp.ClientSession(connector=connector) as session:
1471
+ timeout = aiohttp.ClientTimeout(total=10)
1472
+ async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
1421
1473
  session.headers['user-agent'] = self.user_agent
1422
1474
  for u in to_fetch:
1423
1475
  try:
@@ -1427,7 +1479,7 @@ class Capture():
1427
1479
  if url_to_fetch in self._requests:
1428
1480
  favicon = self._requests[url_to_fetch]
1429
1481
  if not favicon:
1430
- async with session.get(url_to_fetch, timeout=5, ssl=False) as favicon_response:
1482
+ async with session.get(url_to_fetch, ssl=False) as favicon_response:
1431
1483
  favicon_response.raise_for_status()
1432
1484
  favicon = await favicon_response.read()
1433
1485
  if favicon:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.25.9
3
+ Version: 1.25.11
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -21,17 +21,17 @@ Classifier: Topic :: Internet
21
21
  Classifier: Topic :: Security
22
22
  Provides-Extra: recaptcha
23
23
  Requires-Dist: SpeechRecognition (>=3.10.4,<4.0.0) ; extra == "recaptcha"
24
- Requires-Dist: aiohttp-socks (>=0.8.4,<0.9.0)
25
- Requires-Dist: aiohttp[speedups] (>=3.9.5,<4.0.0)
24
+ Requires-Dist: aiohttp-socks (>=0.9,<0.10)
25
+ Requires-Dist: aiohttp[speedups] (>=3.10.3,<4.0.0)
26
26
  Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
27
27
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
28
28
  Requires-Dist: dateparser (>=1.2.0,<2.0.0)
29
- Requires-Dist: playwright (>=1.45.1,<2.0.0)
29
+ Requires-Dist: playwright (>=1.46.0,<2.0.0)
30
30
  Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
31
- Requires-Dist: puremagic (>=1.26,<2.0)
31
+ Requires-Dist: puremagic (>=1.27,<2.0)
32
32
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
33
33
  Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
34
- Requires-Dist: setuptools (>=71.1.0,<72.0.0)
34
+ Requires-Dist: setuptools (>=72.1.0,<73.0.0)
35
35
  Requires-Dist: tzdata (>=2024.1,<2025.0)
36
36
  Requires-Dist: w3lib (>=2.2.1,<3.0.0)
37
37
  Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
@@ -0,0 +1,9 @@
1
+ playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
+ playwrightcapture/capture.py,sha256=Rmo_EVRlR9btsgE2H99OtGPRZwIe8RVq-JCc2GzUWiI,74446
3
+ playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
+ playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
5
+ playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ playwrightcapture-1.25.11.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
+ playwrightcapture-1.25.11.dist-info/METADATA,sha256=nGuO6TAlz2lKM15HiIgZJ4iERLBO_AXNBBpgqo8nfhM,3172
8
+ playwrightcapture-1.25.11.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
9
+ playwrightcapture-1.25.11.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
- playwrightcapture/capture.py,sha256=uS8e87-7jl8F7TgfzhKhlV4pGf8n6twu9rVzzlqIhXM,71671
3
- playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
- playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
5
- playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- playwrightcapture-1.25.9.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
- playwrightcapture-1.25.9.dist-info/METADATA,sha256=7ds0ymzTNfkYLajoJTc-t1G4wOhPHRbxmg6CCZkMtUE,3173
8
- playwrightcapture-1.25.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
9
- playwrightcapture-1.25.9.dist-info/RECORD,,