PlaywrightCapture 1.25.11__tar.gz → 1.25.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.25.11 → playwrightcapture-1.25.13}/PKG-INFO +2 -2
- {playwrightcapture-1.25.11 → playwrightcapture-1.25.13}/playwrightcapture/capture.py +133 -42
- {playwrightcapture-1.25.11 → playwrightcapture-1.25.13}/pyproject.toml +2 -2
- {playwrightcapture-1.25.11 → playwrightcapture-1.25.13}/LICENSE +0 -0
- {playwrightcapture-1.25.11 → playwrightcapture-1.25.13}/README.md +0 -0
- {playwrightcapture-1.25.11 → playwrightcapture-1.25.13}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.25.11 → playwrightcapture-1.25.13}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.25.11 → playwrightcapture-1.25.13}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.25.11 → playwrightcapture-1.25.13}/playwrightcapture/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.25.11
|
3
|
+
Version: 1.25.13
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -31,7 +31,7 @@ Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
|
31
31
|
Requires-Dist: puremagic (>=1.27,<2.0)
|
32
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
33
33
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
34
|
-
Requires-Dist: setuptools (>=72.
|
34
|
+
Requires-Dist: setuptools (>=72.2.0,<73.0.0)
|
35
35
|
Requires-Dist: tzdata (>=2024.1,<2025.0)
|
36
36
|
Requires-Dist: w3lib (>=2.2.1,<3.0.0)
|
37
37
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
@@ -532,7 +532,7 @@ class Capture():
|
|
532
532
|
async def handler() -> None:
|
533
533
|
self.logger.debug('Didomi dialog found, clicking through.')
|
534
534
|
if await page.locator("#didomi-notice-agree-button").is_visible():
|
535
|
-
await page.locator("#didomi-notice-agree-button").click(timeout=
|
535
|
+
await page.locator("#didomi-notice-agree-button").click(timeout=3000)
|
536
536
|
|
537
537
|
await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler, times=1, no_wait_after=True)
|
538
538
|
self.logger.info('Didomi handler added')
|
@@ -575,10 +575,7 @@ class Capture():
|
|
575
575
|
|
576
576
|
async def __dialog_alert_dialog_clickthrough(self, page: Page) -> None:
|
577
577
|
async def handler() -> None:
|
578
|
-
if await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").is_visible():
|
579
|
-
self.logger.info('Consent window found, clicking through.')
|
580
|
-
await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").click(timeout=2000)
|
581
|
-
elif await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
|
578
|
+
if await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
|
582
579
|
await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").click(timeout=1000)
|
583
580
|
else:
|
584
581
|
self.logger.info('Consent window found (alert dialog), but no button to click through.')
|
@@ -603,6 +600,9 @@ class Capture():
|
|
603
600
|
await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
|
604
601
|
elif await page.locator('#axeptio_btn_acceptAll').is_visible():
|
605
602
|
await page.locator('#axeptio_btn_acceptAll').click(timeout=2000)
|
603
|
+
elif await page.locator('.fc-cta-consent').is_visible():
|
604
|
+
# https://developers.google.com/funding-choices/fc-api-docs
|
605
|
+
await page.locator('.fc-cta-consent').click(timeout=2000)
|
606
606
|
else:
|
607
607
|
self.logger.info('Consent window found (dialog), but no button to click through.')
|
608
608
|
await page.add_locator_handler(
|
@@ -636,6 +636,28 @@ class Capture():
|
|
636
636
|
)
|
637
637
|
self.logger.info('Yahoo handler added')
|
638
638
|
|
639
|
+
async def __dialog_tarteaucitron_clickthrough(self, page: Page) -> None:
|
640
|
+
# https://github.com/AmauriC/tarteaucitron.js/
|
641
|
+
async def handler() -> None:
|
642
|
+
if await page.locator('#tarteaucitronAlertBig').locator('button.tarteaucitronAllow').is_visible():
|
643
|
+
self.logger.debug('Got TarteAuCitron big , clicking through.')
|
644
|
+
await page.locator('#tarteaucitronAlertBig').locator("button.tarteaucitronAllow").click(timeout=2000)
|
645
|
+
elif await page.locator('#tarteaucitronAlertSmall').locator('button.tarteaucitronAllow').is_visible():
|
646
|
+
self.logger.debug('Got TarteAuCitron small, clicking through.')
|
647
|
+
await page.locator('#tarteaucitronAlertSmall').locator("button.tarteaucitronAllow").click(timeout=2000)
|
648
|
+
|
649
|
+
await page.add_locator_handler(
|
650
|
+
page.locator('#tarteaucitronAlertBig'),
|
651
|
+
handler,
|
652
|
+
times=1, no_wait_after=True
|
653
|
+
)
|
654
|
+
await page.add_locator_handler(
|
655
|
+
page.locator('#tarteaucitronAlertSmall'),
|
656
|
+
handler,
|
657
|
+
times=1, no_wait_after=True
|
658
|
+
)
|
659
|
+
self.logger.info('TarteAuCitron handler added')
|
660
|
+
|
639
661
|
async def __dialog_ppms_clickthrough(self, page: Page) -> None:
|
640
662
|
async def handler() -> None:
|
641
663
|
if await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").is_visible():
|
@@ -651,21 +673,81 @@ class Capture():
|
|
651
673
|
async def __frame_consent(self, frame: Frame) -> bool:
|
652
674
|
"""Search & Click content in iframes. Cannot easily use the locator handler for this without having many many handlers.
|
653
675
|
And the iframes don't have a title or a role to easily identify them so we just try with generic locators that vary by language."""
|
676
|
+
|
677
|
+
labels_to_click: list[str] = [
|
678
|
+
# German
|
679
|
+
"Alle akzeptieren",
|
680
|
+
"Zustimmen & weiter",
|
681
|
+
# French
|
682
|
+
"Accepter et continuer",
|
683
|
+
"Tout accepter",
|
684
|
+
"Accepter",
|
685
|
+
"Accepter les cookies",
|
686
|
+
"Autoriser",
|
687
|
+
# English
|
688
|
+
"Accept & continue",
|
689
|
+
"Accept all",
|
690
|
+
"Accept",
|
691
|
+
"Agree and close",
|
692
|
+
# Dutch
|
693
|
+
"Accepteer",
|
694
|
+
# Spanish
|
695
|
+
"Aceptar todo",
|
696
|
+
# Italian
|
697
|
+
"Accetta tutto",
|
698
|
+
# Arabic
|
699
|
+
"قبول الكل",
|
700
|
+
# Portuguese
|
701
|
+
"Aceitar tudo",
|
702
|
+
# Polish
|
703
|
+
"Akceptuj wszystko",
|
704
|
+
]
|
705
|
+
|
654
706
|
got_button: bool = False
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
707
|
+
try:
|
708
|
+
try:
|
709
|
+
async with timeout(5):
|
710
|
+
if await frame.locator("button.button__acceptAll").is_visible():
|
711
|
+
self.logger.info('Consent window found, clicking through.')
|
712
|
+
got_button = True
|
713
|
+
await frame.locator("button.button__acceptAll").click(timeout=2000)
|
714
|
+
except (TimeoutError, asyncio.TimeoutError) as e:
|
715
|
+
self.logger.warning(f'Frame consent timeout: {e}')
|
716
|
+
|
717
|
+
for label in labels_to_click:
|
718
|
+
try:
|
719
|
+
async with timeout(5):
|
720
|
+
if await frame.get_by_label(label).is_visible():
|
721
|
+
got_button = True
|
722
|
+
self.logger.debug(f'Got button by label on frame: {label}')
|
723
|
+
await frame.get_by_label(label).click(timeout=2000)
|
724
|
+
break
|
725
|
+
except (TimeoutError, asyncio.TimeoutError) as e:
|
726
|
+
self.logger.warning(f'Frame consent timeout: {e}')
|
727
|
+
|
728
|
+
try:
|
729
|
+
async with timeout(5):
|
730
|
+
if await frame.get_by_role("button", name=label).is_visible():
|
731
|
+
got_button = True
|
732
|
+
self.logger.debug(f'Got button by role on frame: {label}')
|
733
|
+
await frame.get_by_role("button", name=label).click(timeout=2000)
|
734
|
+
break
|
735
|
+
except (TimeoutError, asyncio.TimeoutError) as e:
|
736
|
+
self.logger.warning(f'Frame consent timeout: {e}')
|
737
|
+
except Exception as e:
|
738
|
+
self.logger.info(f'Issue with frame consent: {e}')
|
667
739
|
return got_button
|
668
740
|
|
741
|
+
async def _move_time_forward(self, page: Page, time: int) -> None:
|
742
|
+
time = max(time, 7)
|
743
|
+
try:
|
744
|
+
async with timeout(3):
|
745
|
+
await page.clock.run_for(random.randint((time - 5) * 1000,
|
746
|
+
(time + 5) * 1000))
|
747
|
+
self.logger.debug(f'Moved time forward by ~{time}s.')
|
748
|
+
except (TimeoutError, asyncio.TimeoutError):
|
749
|
+
self.logger.warning('Unable to move time forward.')
|
750
|
+
|
669
751
|
async def capture_page(self, url: str, *, max_depth_capture_time: int,
|
670
752
|
referer: str | None=None,
|
671
753
|
page: Page | None=None, depth: int=0,
|
@@ -734,7 +816,7 @@ class Capture():
|
|
734
816
|
capturing_sub = False
|
735
817
|
try:
|
736
818
|
page = await self.context.new_page()
|
737
|
-
|
819
|
+
await page.clock.install()
|
738
820
|
except Error as e:
|
739
821
|
self.logger.warning(f'The context is in a broken state: {e}')
|
740
822
|
self.should_retry = True
|
@@ -751,6 +833,7 @@ class Capture():
|
|
751
833
|
await self.__dialog_ppms_clickthrough(page)
|
752
834
|
await self.__dialog_alert_dialog_clickthrough(page)
|
753
835
|
await self.__dialog_clickthrough(page)
|
836
|
+
await self.__dialog_tarteaucitron_clickthrough(page)
|
754
837
|
|
755
838
|
await stealth_async(page, PCStealthConfig())
|
756
839
|
|
@@ -841,8 +924,13 @@ class Capture():
|
|
841
924
|
self.logger.debug('Done with captcha.')
|
842
925
|
|
843
926
|
# move mouse
|
844
|
-
|
845
|
-
|
927
|
+
try:
|
928
|
+
async with timeout(5):
|
929
|
+
await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
|
930
|
+
self.logger.debug('Moved mouse.')
|
931
|
+
except (asyncio.TimeoutError, TimeoutError):
|
932
|
+
self.logger.debug('Moving the mouse caused a timeout.')
|
933
|
+
|
846
934
|
await self._wait_for_random_timeout(page, 5)
|
847
935
|
self.logger.debug('Keep going after moving mouse.')
|
848
936
|
|
@@ -859,6 +947,21 @@ class Capture():
|
|
859
947
|
except Exception as e:
|
860
948
|
self.logger.warning(f'Could not find body: {e}')
|
861
949
|
|
950
|
+
await self._wait_for_random_timeout(page, 5)
|
951
|
+
# triggering clicks on very generic frames is sometimes impossible, using button and common language.
|
952
|
+
self.logger.debug('Check other frames for button')
|
953
|
+
for frame in page.frames:
|
954
|
+
if await self.__frame_consent(frame):
|
955
|
+
await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
|
956
|
+
self.logger.debug('Done with frames.')
|
957
|
+
|
958
|
+
self.logger.debug('Check main frame for button')
|
959
|
+
if await self.__frame_consent(page.main_frame):
|
960
|
+
self.logger.debug('Got button on main frame')
|
961
|
+
await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
|
962
|
+
|
963
|
+
await self._move_time_forward(page, 10)
|
964
|
+
|
862
965
|
if parsed_url.fragment:
|
863
966
|
# We got a fragment, make sure we go to it and scroll only a little bit.
|
864
967
|
fragment = unquote(parsed_url.fragment)
|
@@ -897,11 +1000,14 @@ class Capture():
|
|
897
1000
|
self.logger.debug('Keep going after moving on page.')
|
898
1001
|
|
899
1002
|
try:
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
1003
|
+
async with timeout(5):
|
1004
|
+
await page.keyboard.press('PageUp')
|
1005
|
+
self.logger.debug('PageUp on keyboard')
|
1006
|
+
await self._wait_for_random_timeout(page, 3)
|
1007
|
+
await page.keyboard.press('PageDown')
|
1008
|
+
self.logger.debug('PageDown on keyboard')
|
1009
|
+
except (asyncio.TimeoutError, TimeoutError):
|
1010
|
+
self.logger.debug('Using keyboard caused a timeout.')
|
905
1011
|
except Error as e:
|
906
1012
|
self.logger.debug(f'Unable to use keyboard: {e}')
|
907
1013
|
if self.wait_for_download > 0:
|
@@ -922,25 +1028,10 @@ class Capture():
|
|
922
1028
|
z.writestr(f'{i}_{filename}', file_content)
|
923
1029
|
to_return["downloaded_file"] = mem_zip.getvalue()
|
924
1030
|
|
925
|
-
# fast forward 30s
|
926
|
-
|
927
|
-
# async with timeout(3):
|
928
|
-
# await page.clock.run_for("47")
|
929
|
-
# self.logger.debug('Moved time forward.')
|
930
|
-
# except (TimeoutError, asyncio.TimeoutError):
|
931
|
-
# self.logger.warning('Unable to move time forward.')
|
1031
|
+
# fast forward ~30s
|
1032
|
+
await self._move_time_forward(page, 30)
|
932
1033
|
|
933
1034
|
self.logger.debug('Done with instrumentation, waiting for network idle.')
|
934
|
-
if allow_tracking:
|
935
|
-
self.logger.debug('Check iFrames for button')
|
936
|
-
for frame in page.frames:
|
937
|
-
frame_title = await frame.title()
|
938
|
-
self.logger.debug(f'Check button on {frame_title}')
|
939
|
-
if await self.__frame_consent(frame):
|
940
|
-
self.logger.debug(f'Got button on {frame_title}')
|
941
|
-
await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
|
942
|
-
self.logger.debug('Done with iFrames.')
|
943
|
-
|
944
1035
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
|
945
1036
|
await self._safe_wait(page)
|
946
1037
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.25.11"
|
3
|
+
version = "1.25.13"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -28,7 +28,7 @@ SpeechRecognition = {version = "^3.10.4", optional = true}
|
|
28
28
|
pytz = {"version" = "^2024.1", python = "<3.9"}
|
29
29
|
tzdata = "^2024.1"
|
30
30
|
playwright-stealth = "^1.0.6"
|
31
|
-
setuptools = "^72.
|
31
|
+
setuptools = "^72.2.0"
|
32
32
|
puremagic = "^1.27"
|
33
33
|
async-timeout = {version = "^4.0.3", python = "<3.11"}
|
34
34
|
aiohttp = {extras = ["speedups"], version = "^3.10.3"}
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|