PlaywrightCapture 1.25.11__py3-none-any.whl → 1.25.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -532,7 +532,7 @@ class Capture():
532
532
  async def handler() -> None:
533
533
  self.logger.debug('Didomi dialog found, clicking through.')
534
534
  if await page.locator("#didomi-notice-agree-button").is_visible():
535
- await page.locator("#didomi-notice-agree-button").click(timeout=30000)
535
+ await page.locator("#didomi-notice-agree-button").click(timeout=3000)
536
536
 
537
537
  await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler, times=1, no_wait_after=True)
538
538
  self.logger.info('Didomi handler added')
@@ -575,10 +575,7 @@ class Capture():
575
575
 
576
576
  async def __dialog_alert_dialog_clickthrough(self, page: Page) -> None:
577
577
  async def handler() -> None:
578
- if await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").is_visible():
579
- self.logger.info('Consent window found, clicking through.')
580
- await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").click(timeout=2000)
581
- elif await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
578
+ if await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
582
579
  await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").click(timeout=1000)
583
580
  else:
584
581
  self.logger.info('Consent window found (alert dialog), but no button to click through.')
@@ -603,6 +600,9 @@ class Capture():
603
600
  await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
604
601
  elif await page.locator('#axeptio_btn_acceptAll').is_visible():
605
602
  await page.locator('#axeptio_btn_acceptAll').click(timeout=2000)
603
+ elif await page.locator('.fc-cta-consent').is_visible():
604
+ # https://developers.google.com/funding-choices/fc-api-docs
605
+ await page.locator('.fc-cta-consent').click(timeout=2000)
606
606
  else:
607
607
  self.logger.info('Consent window found (dialog), but no button to click through.')
608
608
  await page.add_locator_handler(
@@ -651,21 +651,64 @@ class Capture():
651
651
  async def __frame_consent(self, frame: Frame) -> bool:
652
652
  """Search & Click content in iframes. Cannot easily use the locator handler for this without having many many handlers.
653
653
  And the iframes don't have a title or a role to easily identify them so we just try with generic locators that vary by language."""
654
+
655
+ labels_to_click: list[str] = [
656
+ # German
657
+ "Alle akzeptieren",
658
+ "Zustimmen & weiter",
659
+ # French
660
+ "Accepter et continuer",
661
+ "Tout accepter",
662
+ "Accepter",
663
+ "Accepter les cookies",
664
+ # English
665
+ "Accept & continue",
666
+ "Accept all",
667
+ # Dutch
668
+ "Accepteer",
669
+ # Spanish
670
+ "Aceptar todo",
671
+ # Italian
672
+ "Accetta tutto",
673
+ # Arabic
674
+ "قبول الكل",
675
+ # Portuguese
676
+ "Aceitar tudo",
677
+ # Polish
678
+ "Akceptuj wszystko",
679
+ ]
680
+
654
681
  got_button: bool = False
655
- if await frame.get_by_label("Alle akzeptieren").is_visible():
656
- got_button = True
657
- await frame.get_by_label("Alle akzeptieren").click(timeout=2000)
658
- elif await frame.get_by_label("Accept & continue").is_visible():
659
- got_button = True
660
- await frame.get_by_label("Accept & continue").click(timeout=2000)
661
- elif await frame.get_by_label("Accepter et continuer").is_visible():
662
- got_button = True
663
- await frame.get_by_label("Accepter et continuer").click(timeout=2000)
664
- elif await frame.get_by_label("Accepteer").is_visible():
665
- got_button = True
666
- await frame.get_by_label("Accepteer").click(timeout=2000)
682
+ try:
683
+ if await frame.locator("button.button__acceptAll").is_visible():
684
+ self.logger.info('Consent window found, clicking through.')
685
+ got_button = True
686
+ await frame.locator("button.button__acceptAll").click(timeout=2000)
687
+ for label in labels_to_click:
688
+ if await frame.get_by_label(label).is_visible():
689
+ got_button = True
690
+ self.logger.debug(f'Got button by label on frame: {label}')
691
+ await frame.get_by_label(label).click(timeout=2000)
692
+ break
693
+ if await frame.get_by_role("button", name=label).is_visible():
694
+ got_button = True
695
+ self.logger.debug(f'Got button by role on frame: {label}')
696
+ await frame.get_by_role("button", name=label).click(timeout=2000)
697
+ break
698
+ except Exception as e:
699
+ self.logger.info(f'Issue with frame consent: {e}')
667
700
  return got_button
668
701
 
702
+ async def _move_time_forward(self, page: Page, time: int) -> None:
703
+ time = max(time, 7)
704
+ try:
705
+ async with timeout(3):
706
+ await page.clock.run_for(random.randint((time - 5) * 1000,
707
+ (time + 5) * 1000))
708
+ self.logger.debug(f'Moved time forward by ~{time}s.')
709
+ except (TimeoutError, asyncio.TimeoutError):
710
+ self.logger.warning('Unable to move time forward.')
711
+
669
712
  async def capture_page(self, url: str, *, max_depth_capture_time: int,
670
713
  referer: str | None=None,
671
714
  page: Page | None=None, depth: int=0,
@@ -734,7 +777,7 @@ class Capture():
734
777
  capturing_sub = False
735
778
  try:
736
779
  page = await self.context.new_page()
737
- # await page.clock.install()
780
+ await page.clock.install()
738
781
  except Error as e:
739
782
  self.logger.warning(f'The context is in a broken state: {e}')
740
783
  self.should_retry = True
@@ -859,6 +902,21 @@ class Capture():
859
902
  except Exception as e:
860
903
  self.logger.warning(f'Could not find body: {e}')
861
904
 
905
+ await self._wait_for_random_timeout(page, 5)
906
+ # triggering clicks on very generic frames is sometimes impossible, using button and common language.
907
+ self.logger.debug('Check other frames for button')
908
+ for frame in page.frames:
909
+ if await self.__frame_consent(frame):
910
+ await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
911
+ self.logger.debug('Done with frames.')
912
+
913
+ self.logger.debug('Check main frame for button')
914
+ if await self.__frame_consent(page.main_frame):
915
+ self.logger.debug('Got button on main frame')
916
+ await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
917
+
918
+ await self._move_time_forward(page, 10)
919
+
862
920
  if parsed_url.fragment:
863
921
  # We got a fragment, make sure we go to it and scroll only a little bit.
864
922
  fragment = unquote(parsed_url.fragment)
@@ -922,25 +980,10 @@ class Capture():
922
980
  z.writestr(f'{i}_{filename}', file_content)
923
981
  to_return["downloaded_file"] = mem_zip.getvalue()
924
982
 
925
- # fast forward 30s
926
- # try:
927
- # async with timeout(3):
928
- # await page.clock.run_for("47")
929
- # self.logger.debug('Moved time forward.')
930
- # except (TimeoutError, asyncio.TimeoutError):
931
- # self.logger.warning('Unable to move time forward.')
983
+ # fast forward ~30s
984
+ await self._move_time_forward(page, 30)
932
985
 
933
986
  self.logger.debug('Done with instrumentation, waiting for network idle.')
934
- if allow_tracking:
935
- self.logger.debug('Check iFrames for button')
936
- for frame in page.frames:
937
- frame_title = await frame.title()
938
- self.logger.debug(f'Check button on {frame_title}')
939
- if await self.__frame_consent(frame):
940
- self.logger.debug(f'Got button on {frame_title}')
941
- await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
942
- self.logger.debug('Done with iFrames.')
943
-
944
987
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
945
988
  await self._safe_wait(page)
946
989
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.25.11
3
+ Version: 1.25.12
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -31,7 +31,7 @@ Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
31
31
  Requires-Dist: puremagic (>=1.27,<2.0)
32
32
  Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
33
33
  Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
34
- Requires-Dist: setuptools (>=72.1.0,<73.0.0)
34
+ Requires-Dist: setuptools (>=72.2.0,<73.0.0)
35
35
  Requires-Dist: tzdata (>=2024.1,<2025.0)
36
36
  Requires-Dist: w3lib (>=2.2.1,<3.0.0)
37
37
  Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
@@ -1,9 +1,9 @@
1
1
  playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
2
- playwrightcapture/capture.py,sha256=Rmo_EVRlR9btsgE2H99OtGPRZwIe8RVq-JCc2GzUWiI,74446
2
+ playwrightcapture/capture.py,sha256=R8ayCKpD1Q78ewaQ8-4tuMP8XVE8YGOwhwVYZDs4a8g,75867
3
3
  playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
4
4
  playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
5
5
  playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- playwrightcapture-1.25.11.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
- playwrightcapture-1.25.11.dist-info/METADATA,sha256=nGuO6TAlz2lKM15HiIgZJ4iERLBO_AXNBBpgqo8nfhM,3172
8
- playwrightcapture-1.25.11.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
9
- playwrightcapture-1.25.11.dist-info/RECORD,,
6
+ playwrightcapture-1.25.12.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
7
+ playwrightcapture-1.25.12.dist-info/METADATA,sha256=Kw9CVpRSN9vLL1XSf_ZGIKAzX11VY4MJuXrO0OVCmHw,3172
8
+ playwrightcapture-1.25.12.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
9
+ playwrightcapture-1.25.12.dist-info/RECORD,,