PlaywrightCapture 1.25.11__tar.gz → 1.25.13__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.25.11
3
+ Version: 1.25.13
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -31,7 +31,7 @@ Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
31
31
  Requires-Dist: puremagic (>=1.27,<2.0)
32
32
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
33
33
  Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
34
- Requires-Dist: setuptools (>=72.1.0,<73.0.0)
34
+ Requires-Dist: setuptools (>=72.2.0,<73.0.0)
35
35
  Requires-Dist: tzdata (>=2024.1,<2025.0)
36
36
  Requires-Dist: w3lib (>=2.2.1,<3.0.0)
37
37
  Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
@@ -532,7 +532,7 @@ class Capture():
532
532
  async def handler() -> None:
533
533
  self.logger.debug('Didomi dialog found, clicking through.')
534
534
  if await page.locator("#didomi-notice-agree-button").is_visible():
535
- await page.locator("#didomi-notice-agree-button").click(timeout=30000)
535
+ await page.locator("#didomi-notice-agree-button").click(timeout=3000)
536
536
 
537
537
  await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler, times=1, no_wait_after=True)
538
538
  self.logger.info('Didomi handler added')
@@ -575,10 +575,7 @@ class Capture():
575
575
 
576
576
  async def __dialog_alert_dialog_clickthrough(self, page: Page) -> None:
577
577
  async def handler() -> None:
578
- if await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").is_visible():
579
- self.logger.info('Consent window found, clicking through.')
580
- await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").click(timeout=2000)
581
- elif await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
578
+ if await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
582
579
  await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").click(timeout=1000)
583
580
  else:
584
581
  self.logger.info('Consent window found (alert dialog), but no button to click through.')
@@ -603,6 +600,9 @@ class Capture():
603
600
  await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
604
601
  elif await page.locator('#axeptio_btn_acceptAll').is_visible():
605
602
  await page.locator('#axeptio_btn_acceptAll').click(timeout=2000)
603
+ elif await page.locator('.fc-cta-consent').is_visible():
604
+ # https://developers.google.com/funding-choices/fc-api-docs
605
+ await page.locator('.fc-cta-consent').click(timeout=2000)
606
606
  else:
607
607
  self.logger.info('Consent window found (dialog), but no button to click through.')
608
608
  await page.add_locator_handler(
@@ -636,6 +636,28 @@ class Capture():
636
636
  )
637
637
  self.logger.info('Yahoo handler added')
638
638
 
639
+ async def __dialog_tarteaucitron_clickthrough(self, page: Page) -> None:
640
+ # https://github.com/AmauriC/tarteaucitron.js/
641
+ async def handler() -> None:
642
+ if await page.locator('#tarteaucitronAlertBig').locator('button.tarteaucitronAllow').is_visible():
643
+ self.logger.debug('Got TarteAuCitron big , clicking through.')
644
+ await page.locator('#tarteaucitronAlertBig').locator("button.tarteaucitronAllow").click(timeout=2000)
645
+ elif await page.locator('#tarteaucitronAlertSmall').locator('button.tarteaucitronAllow').is_visible():
646
+ self.logger.debug('Got TarteAuCitron small, clicking through.')
647
+ await page.locator('#tarteaucitronAlertSmall').locator("button.tarteaucitronAllow").click(timeout=2000)
648
+
649
+ await page.add_locator_handler(
650
+ page.locator('#tarteaucitronAlertBig'),
651
+ handler,
652
+ times=1, no_wait_after=True
653
+ )
654
+ await page.add_locator_handler(
655
+ page.locator('#tarteaucitronAlertSmall'),
656
+ handler,
657
+ times=1, no_wait_after=True
658
+ )
659
+ self.logger.info('TarteAuCitron handler added')
660
+
639
661
  async def __dialog_ppms_clickthrough(self, page: Page) -> None:
640
662
  async def handler() -> None:
641
663
  if await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").is_visible():
@@ -651,21 +673,81 @@ class Capture():
651
673
  async def __frame_consent(self, frame: Frame) -> bool:
652
674
  """Search & Click content in iframes. Cannot easily use the locator handler for this without having many many handlers.
653
675
  And the iframes don't have a title or a role to easily identify them so we just try with generic locators that vary by language."""
676
+
677
+ labels_to_click: list[str] = [
678
+ # German
679
+ "Alle akzeptieren",
680
+ "Zustimmen & weiter",
681
+ # French
682
+ "Accepter et continuer",
683
+ "Tout accepter",
684
+ "Accepter",
685
+ "Accepter les cookies",
686
+ "Autoriser",
687
+ # English
688
+ "Accept & continue",
689
+ "Accept all",
690
+ "Accept",
691
+ "Agree and close",
692
+ # Dutch
693
+ "Accepteer",
694
+ # Spanish
695
+ "Aceptar todo",
696
+ # Italian
697
+ "Accetta tutto",
698
+ # Arabic
699
+ "قبول الكل",
700
+ # Portuguese
701
+ "Aceitar tudo",
702
+ # Polish
703
+ "Akceptuj wszystko",
704
+ ]
705
+
654
706
  got_button: bool = False
655
- if await frame.get_by_label("Alle akzeptieren").is_visible():
656
- got_button = True
657
- await frame.get_by_label("Alle akzeptieren").click(timeout=2000)
658
- elif await frame.get_by_label("Accept & continue").is_visible():
659
- got_button = True
660
- await frame.get_by_label("Accept & continue").click(timeout=2000)
661
- elif await frame.get_by_label("Accepter et continuer").is_visible():
662
- got_button = True
663
- await frame.get_by_label("Accepter et continuer").click(timeout=2000)
664
- elif await frame.get_by_label("Accepteer").is_visible():
665
- got_button = True
666
- await frame.get_by_label("Accepteer").click(timeout=2000)
707
+ try:
708
+ try:
709
+ async with timeout(5):
710
+ if await frame.locator("button.button__acceptAll").is_visible():
711
+ self.logger.info('Consent window found, clicking through.')
712
+ got_button = True
713
+ await frame.locator("button.button__acceptAll").click(timeout=2000)
714
+ except (TimeoutError, asyncio.TimeoutError) as e:
715
+ self.logger.warning(f'Frame consent timeout: {e}')
716
+
717
+ for label in labels_to_click:
718
+ try:
719
+ async with timeout(5):
720
+ if await frame.get_by_label(label).is_visible():
721
+ got_button = True
722
+ self.logger.debug(f'Got button by label on frame: {label}')
723
+ await frame.get_by_label(label).click(timeout=2000)
724
+ break
725
+ except (TimeoutError, asyncio.TimeoutError) as e:
726
+ self.logger.warning(f'Frame consent timeout: {e}')
727
+
728
+ try:
729
+ async with timeout(5):
730
+ if await frame.get_by_role("button", name=label).is_visible():
731
+ got_button = True
732
+ self.logger.debug(f'Got button by role on frame: {label}')
733
+ await frame.get_by_role("button", name=label).click(timeout=2000)
734
+ break
735
+ except (TimeoutError, asyncio.TimeoutError) as e:
736
+ self.logger.warning(f'Frame consent timeout: {e}')
737
+ except Exception as e:
738
+ self.logger.info(f'Issue with frame consent: {e}')
667
739
  return got_button
668
740
 
741
+ async def _move_time_forward(self, page: Page, time: int) -> None:
742
+ time = max(time, 7)
743
+ try:
744
+ async with timeout(3):
745
+ await page.clock.run_for(random.randint((time - 5) * 1000,
746
+ (time + 5) * 1000))
747
+ self.logger.debug(f'Moved time forward by ~{time}s.')
748
+ except (TimeoutError, asyncio.TimeoutError):
749
+ self.logger.warning('Unable to move time forward.')
750
+
669
751
  async def capture_page(self, url: str, *, max_depth_capture_time: int,
670
752
  referer: str | None=None,
671
753
  page: Page | None=None, depth: int=0,
@@ -734,7 +816,7 @@ class Capture():
734
816
  capturing_sub = False
735
817
  try:
736
818
  page = await self.context.new_page()
737
- # await page.clock.install()
819
+ await page.clock.install()
738
820
  except Error as e:
739
821
  self.logger.warning(f'The context is in a broken state: {e}')
740
822
  self.should_retry = True
@@ -751,6 +833,7 @@ class Capture():
751
833
  await self.__dialog_ppms_clickthrough(page)
752
834
  await self.__dialog_alert_dialog_clickthrough(page)
753
835
  await self.__dialog_clickthrough(page)
836
+ await self.__dialog_tarteaucitron_clickthrough(page)
754
837
 
755
838
  await stealth_async(page, PCStealthConfig())
756
839
 
@@ -841,8 +924,13 @@ class Capture():
841
924
  self.logger.debug('Done with captcha.')
842
925
 
843
926
  # move mouse
844
- await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
845
- self.logger.debug('Moved mouse.')
927
+ try:
928
+ async with timeout(5):
929
+ await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
930
+ self.logger.debug('Moved mouse.')
931
+ except (asyncio.TimeoutError, TimeoutError):
932
+ self.logger.debug('Moving the mouse caused a timeout.')
933
+
846
934
  await self._wait_for_random_timeout(page, 5)
847
935
  self.logger.debug('Keep going after moving mouse.')
848
936
 
@@ -859,6 +947,21 @@ class Capture():
859
947
  except Exception as e:
860
948
  self.logger.warning(f'Could not find body: {e}')
861
949
 
950
+ await self._wait_for_random_timeout(page, 5)
951
+ # triggering clicks on very generic frames is sometimes impossible, using button and common language.
952
+ self.logger.debug('Check other frames for button')
953
+ for frame in page.frames:
954
+ if await self.__frame_consent(frame):
955
+ await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
956
+ self.logger.debug('Done with frames.')
957
+
958
+ self.logger.debug('Check main frame for button')
959
+ if await self.__frame_consent(page.main_frame):
960
+ self.logger.debug('Got button on main frame')
961
+ await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
962
+
963
+ await self._move_time_forward(page, 10)
964
+
862
965
  if parsed_url.fragment:
863
966
  # We got a fragment, make sure we go to it and scroll only a little bit.
864
967
  fragment = unquote(parsed_url.fragment)
@@ -897,11 +1000,14 @@ class Capture():
897
1000
  self.logger.debug('Keep going after moving on page.')
898
1001
 
899
1002
  try:
900
- await page.keyboard.press('PageUp')
901
- self.logger.debug('PageUp on keyboard')
902
- await self._wait_for_random_timeout(page, 3)
903
- await page.keyboard.press('PageDown')
904
- self.logger.debug('PageDown on keyboard')
1003
+ async with timeout(5):
1004
+ await page.keyboard.press('PageUp')
1005
+ self.logger.debug('PageUp on keyboard')
1006
+ await self._wait_for_random_timeout(page, 3)
1007
+ await page.keyboard.press('PageDown')
1008
+ self.logger.debug('PageDown on keyboard')
1009
+ except (asyncio.TimeoutError, TimeoutError):
1010
+ self.logger.debug('Using keyboard caused a timeout.')
905
1011
  except Error as e:
906
1012
  self.logger.debug(f'Unable to use keyboard: {e}')
907
1013
  if self.wait_for_download > 0:
@@ -922,25 +1028,10 @@ class Capture():
922
1028
  z.writestr(f'{i}_{filename}', file_content)
923
1029
  to_return["downloaded_file"] = mem_zip.getvalue()
924
1030
 
925
- # fast forward 30s
926
- # try:
927
- # async with timeout(3):
928
- # await page.clock.run_for("47")
929
- # self.logger.debug('Moved time forward.')
930
- # except (TimeoutError, asyncio.TimeoutError):
931
- # self.logger.warning('Unable to move time forward.')
1031
+ # fast forward ~30s
1032
+ await self._move_time_forward(page, 30)
932
1033
 
933
1034
  self.logger.debug('Done with instrumentation, waiting for network idle.')
934
- if allow_tracking:
935
- self.logger.debug('Check iFrames for button')
936
- for frame in page.frames:
937
- frame_title = await frame.title()
938
- self.logger.debug(f'Check button on {frame_title}')
939
- if await self.__frame_consent(frame):
940
- self.logger.debug(f'Got button on {frame_title}')
941
- await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
942
- self.logger.debug('Done with iFrames.')
943
-
944
1035
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
945
1036
  await self._safe_wait(page)
946
1037
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.25.11"
3
+ version = "1.25.13"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -28,7 +28,7 @@ SpeechRecognition = {version = "^3.10.4", optional = true}
28
28
  pytz = {"version" = "^2024.1", python = "<3.9"}
29
29
  tzdata = "^2024.1"
30
30
  playwright-stealth = "^1.0.6"
31
- setuptools = "^72.1.0"
31
+ setuptools = "^72.2.0"
32
32
  puremagic = "^1.27"
33
33
  async-timeout = {version = "^4.0.3", python = "<3.11"}
34
34
  aiohttp = {extras = ["speedups"], version = "^3.10.3"}