PlaywrightCapture 1.25.10__tar.gz → 1.25.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.25.10
3
+ Version: 1.25.12
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -22,16 +22,16 @@ Classifier: Topic :: Security
22
22
  Provides-Extra: recaptcha
23
23
  Requires-Dist: SpeechRecognition (>=3.10.4,<4.0.0) ; extra == "recaptcha"
24
24
  Requires-Dist: aiohttp-socks (>=0.9,<0.10)
25
- Requires-Dist: aiohttp[speedups] (>=3.10.1,<4.0.0)
25
+ Requires-Dist: aiohttp[speedups] (>=3.10.3,<4.0.0)
26
26
  Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
27
27
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
28
28
  Requires-Dist: dateparser (>=1.2.0,<2.0.0)
29
- Requires-Dist: playwright (>=1.45.1,<2.0.0)
29
+ Requires-Dist: playwright (>=1.46.0,<2.0.0)
30
30
  Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
31
- Requires-Dist: puremagic (>=1.26,<2.0)
31
+ Requires-Dist: puremagic (>=1.27,<2.0)
32
32
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
33
33
  Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
34
- Requires-Dist: setuptools (>=72.1.0,<73.0.0)
34
+ Requires-Dist: setuptools (>=72.2.0,<73.0.0)
35
35
  Requires-Dist: tzdata (>=2024.1,<2025.0)
36
36
  Requires-Dist: w3lib (>=2.2.1,<3.0.0)
37
37
  Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
@@ -532,7 +532,7 @@ class Capture():
532
532
  async def handler() -> None:
533
533
  self.logger.debug('Didomi dialog found, clicking through.')
534
534
  if await page.locator("#didomi-notice-agree-button").is_visible():
535
- await page.locator("#didomi-notice-agree-button").click(timeout=30000)
535
+ await page.locator("#didomi-notice-agree-button").click(timeout=3000)
536
536
 
537
537
  await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler, times=1, no_wait_after=True)
538
538
  self.logger.info('Didomi handler added')
@@ -575,10 +575,7 @@ class Capture():
575
575
 
576
576
  async def __dialog_alert_dialog_clickthrough(self, page: Page) -> None:
577
577
  async def handler() -> None:
578
- if await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").is_visible():
579
- self.logger.info('Consent window found, clicking through.')
580
- await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").click(timeout=2000)
581
- elif await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
578
+ if await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
582
579
  await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").click(timeout=1000)
583
580
  else:
584
581
  self.logger.info('Consent window found (alert dialog), but no button to click through.')
@@ -601,6 +598,11 @@ class Capture():
601
598
  elif await page.get_by_test_id("uc-accept-all-button").is_visible():
602
599
  self.logger.info('Consent window found, clicking through.')
603
600
  await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
601
+ elif await page.locator('#axeptio_btn_acceptAll').is_visible():
602
+ await page.locator('#axeptio_btn_acceptAll').click(timeout=2000)
603
+ elif await page.locator('.fc-cta-consent').is_visible():
604
+ # https://developers.google.com/funding-choices/fc-api-docs
605
+ await page.locator('.fc-cta-consent').click(timeout=2000)
604
606
  else:
605
607
  self.logger.info('Consent window found (dialog), but no button to click through.')
606
608
  await page.add_locator_handler(
@@ -646,6 +648,67 @@ class Capture():
646
648
  )
647
649
  self.logger.info('Piwik handler added')
648
650
 
651
async def __frame_consent(self, frame: Frame) -> bool:
    """Try to click a consent/accept button inside a single frame.

    Consent iframes rarely expose a title or role that would let a locator
    handler target them without registering many handlers, so this probes
    the frame with one known CSS selector and then with a list of common
    "accept" captions in several languages.

    Returns True as soon as a candidate button was seen as visible, even if
    the click that follows fails. Any exception raised while probing or
    clicking is logged at info level and not propagated.
    """
    labels_to_click: tuple[str, ...] = (
        # German
        "Alle akzeptieren",
        "Zustimmen & weiter",
        # French
        "Accepter et continuer",
        "Tout accepter",
        "Accepter",
        "Accepter les cookies",
        # English
        "Accept & continue",
        "Accept all",
        # Dutch
        "Accepteer",
        # Spanish
        "Aceptar todo",
        # Italian
        "Accetta tutto",
        # Arabic
        "قبول الكل",
        # Portuguese
        "Aceitar tudo",
        # Polish
        "Akceptuj wszystko",
    )

    clicked: bool = False
    try:
        # Dedicated "accept all" button identified by its CSS class.
        accept_all = frame.locator("button.button__acceptAll")
        if await accept_all.is_visible():
            self.logger.info('Consent window found, clicking through.')
            # Flag is raised before the click so a failing click still counts.
            clicked = True
            await accept_all.click(timeout=2000)

        # Generic fallback: first caption that matches (by label, then by
        # button role) gets clicked and ends the search.
        for label in labels_to_click:
            by_label = frame.get_by_label(label)
            if await by_label.is_visible():
                clicked = True
                self.logger.debug(f'Got button by label on frame: {label}')
                await by_label.click(timeout=2000)
                break

            by_role = frame.get_by_role("button", name=label)
            if await by_role.is_visible():
                clicked = True
                self.logger.debug(f'Got button by role on frame: {label}')
                await by_role.click(timeout=2000)
                break
    except Exception as e:
        # Best effort only: a broken or detached frame must not abort the capture.
        self.logger.info(f'Issue with frame consent: {e}')
    return clicked
701
+
702
async def _move_time_forward(self, page: Page, time: int) -> None:
    """Advance the page's clock by roughly ``time`` seconds.

    The requested duration is raised to at least 7 seconds, then a random
    amount within +/- 5 seconds of it (in milliseconds) is passed to
    ``page.clock.run_for``. The call is given 3 seconds to complete; on
    timeout a warning is logged and the method returns normally.
    """
    # With a floor of 7 the lower bound below is never less than 2 seconds.
    time = max(time, 7)
    shortest_ms = (time - 5) * 1000
    longest_ms = (time + 5) * 1000
    try:
        async with timeout(3):
            jump_ms = random.randint(shortest_ms, longest_ms)
            await page.clock.run_for(jump_ms)
            self.logger.debug(f'Moved time forward by ~{time}s.')
    except (TimeoutError, asyncio.TimeoutError):
        self.logger.warning('Unable to move time forward.')
711
+
649
712
  async def capture_page(self, url: str, *, max_depth_capture_time: int,
650
713
  referer: str | None=None,
651
714
  page: Page | None=None, depth: int=0,
@@ -714,7 +777,7 @@ class Capture():
714
777
  capturing_sub = False
715
778
  try:
716
779
  page = await self.context.new_page()
717
- # await page.clock.install()
780
+ await page.clock.install()
718
781
  except Error as e:
719
782
  self.logger.warning(f'The context is in a broken state: {e}')
720
783
  self.should_retry = True
@@ -839,6 +902,21 @@ class Capture():
839
902
  except Exception as e:
840
903
  self.logger.warning(f'Could not find body: {e}')
841
904
 
905
+ await self._wait_for_random_timeout(page, 5)
906
+ # triggering clicks on very generic frames is sometimes impossible, using button and common language.
907
+ self.logger.debug('Check other frames for button')
908
+ for frame in page.frames:
909
+ if await self.__frame_consent(frame):
910
+ await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
911
+ self.logger.debug('Done with frames.')
912
+
913
+ self.logger.debug('Check main frame for button')
914
+ if await self.__frame_consent(page.main_frame):
915
+ self.logger.debug('Got button on main frame')
916
+ await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
917
+
918
+ await self._move_time_forward(page, 10)
919
+
842
920
  if parsed_url.fragment:
843
921
  # We got a fragment, make sure we go to it and scroll only a little bit.
844
922
  fragment = unquote(parsed_url.fragment)
@@ -902,17 +980,13 @@ class Capture():
902
980
  z.writestr(f'{i}_{filename}', file_content)
903
981
  to_return["downloaded_file"] = mem_zip.getvalue()
904
982
 
905
- # fast forward 30s
906
- # try:
907
- # async with timeout(3):
908
- # await page.clock.run_for("47")
909
- # self.logger.debug('Moved time forward.')
910
- # except (TimeoutError, asyncio.TimeoutError):
911
- # self.logger.warning('Unable to move time forward.')
983
+ # fast forward ~30s
984
+ await self._move_time_forward(page, 30)
912
985
 
913
986
  self.logger.debug('Done with instrumentation, waiting for network idle.')
914
987
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
915
988
  await self._safe_wait(page)
989
+
916
990
  self.logger.debug('Done with instrumentation, done with waiting.')
917
991
 
918
992
  if content := await self._failsafe_get_content(page):
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.25.10"
3
+ version = "1.25.12"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -19,7 +19,7 @@ classifiers=[
19
19
 
20
20
  [tool.poetry.dependencies]
21
21
  python = "^3.8"
22
- playwright = "^1.45.1"
22
+ playwright = "^1.46.0"
23
23
  dateparser = "^1.2.0"
24
24
  beautifulsoup4 = {version= "^4.12.3", extras = ["lxml", "charset_normalizer"]}
25
25
  w3lib = "^2.2.1"
@@ -28,10 +28,10 @@ SpeechRecognition = {version = "^3.10.4", optional = true}
28
28
  pytz = {"version" = "^2024.1", python = "<3.9"}
29
29
  tzdata = "^2024.1"
30
30
  playwright-stealth = "^1.0.6"
31
- setuptools = "^72.1.0"
32
- puremagic = "^1.26"
31
+ setuptools = "^72.2.0"
32
+ puremagic = "^1.27"
33
33
  async-timeout = {version = "^4.0.3", python = "<3.11"}
34
- aiohttp = {extras = ["speedups"], version = "^3.10.1"}
34
+ aiohttp = {extras = ["speedups"], version = "^3.10.3"}
35
35
  aiohttp-socks = "^0.9"
36
36
 
37
37
  [tool.poetry.extras]