PlaywrightCapture 1.25.11__py3-none-any.whl → 1.25.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- playwrightcapture/capture.py +78 -35
- {playwrightcapture-1.25.11.dist-info → playwrightcapture-1.25.12.dist-info}/METADATA +2 -2
- {playwrightcapture-1.25.11.dist-info → playwrightcapture-1.25.12.dist-info}/RECORD +5 -5
- {playwrightcapture-1.25.11.dist-info → playwrightcapture-1.25.12.dist-info}/LICENSE +0 -0
- {playwrightcapture-1.25.11.dist-info → playwrightcapture-1.25.12.dist-info}/WHEEL +0 -0
playwrightcapture/capture.py
CHANGED
@@ -532,7 +532,7 @@ class Capture():
|
|
532
532
|
async def handler() -> None:
|
533
533
|
self.logger.debug('Didomi dialog found, clicking through.')
|
534
534
|
if await page.locator("#didomi-notice-agree-button").is_visible():
|
535
|
-
await page.locator("#didomi-notice-agree-button").click(timeout=
|
535
|
+
await page.locator("#didomi-notice-agree-button").click(timeout=3000)
|
536
536
|
|
537
537
|
await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler, times=1, no_wait_after=True)
|
538
538
|
self.logger.info('Didomi handler added')
|
@@ -575,10 +575,7 @@ class Capture():
|
|
575
575
|
|
576
576
|
async def __dialog_alert_dialog_clickthrough(self, page: Page) -> None:
|
577
577
|
async def handler() -> None:
|
578
|
-
if await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").is_visible():
|
579
|
-
self.logger.info('Consent window found, clicking through.')
|
580
|
-
await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").click(timeout=2000)
|
581
|
-
elif await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
|
578
|
+
if await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
|
582
579
|
await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").click(timeout=1000)
|
583
580
|
else:
|
584
581
|
self.logger.info('Consent window found (alert dialog), but no button to click through.')
|
@@ -603,6 +600,9 @@ class Capture():
|
|
603
600
|
await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
|
604
601
|
elif await page.locator('#axeptio_btn_acceptAll').is_visible():
|
605
602
|
await page.locator('#axeptio_btn_acceptAll').click(timeout=2000)
|
603
|
+
elif await page.locator('.fc-cta-consent').is_visible():
|
604
|
+
# https://developers.google.com/funding-choices/fc-api-docs
|
605
|
+
await page.locator('.fc-cta-consent').click(timeout=2000)
|
606
606
|
else:
|
607
607
|
self.logger.info('Consent window found (dialog), but no button to click through.')
|
608
608
|
await page.add_locator_handler(
|
@@ -651,21 +651,64 @@ class Capture():
|
|
651
651
|
async def __frame_consent(self, frame: Frame) -> bool:
|
652
652
|
"""Search & Click content in iframes. Cannot easily use the locator handler for this without having many many handlers.
|
653
653
|
And the iframes don't have a title or a role to easily identify them so we just try with generic locators that vary by language."""
|
654
|
+
|
655
|
+
labels_to_click: list[str] = [
|
656
|
+
# German
|
657
|
+
"Alle akzeptieren",
|
658
|
+
"Zustimmen & weiter",
|
659
|
+
# French
|
660
|
+
"Accepter et continuer",
|
661
|
+
"Tout accepter",
|
662
|
+
"Accepter",
|
663
|
+
"Accepter les cookies",
|
664
|
+
# English
|
665
|
+
"Accept & continue",
|
666
|
+
"Accept all",
|
667
|
+
# Dutch
|
668
|
+
"Accepteer",
|
669
|
+
# Spanish
|
670
|
+
"Aceptar todo",
|
671
|
+
# Italian
|
672
|
+
"Accetta tutto",
|
673
|
+
# Arabic
|
674
|
+
"قبول الكل",
|
675
|
+
# Portuguese
|
676
|
+
"Aceitar tudo",
|
677
|
+
# Polish
|
678
|
+
"Akceptuj wszystko",
|
679
|
+
]
|
680
|
+
|
654
681
|
got_button: bool = False
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
682
|
+
try:
|
683
|
+
if await frame.locator("button.button__acceptAll").is_visible():
|
684
|
+
self.logger.info('Consent window found, clicking through.')
|
685
|
+
got_button = True
|
686
|
+
await frame.locator("button.button__acceptAll").click(timeout=2000)
|
687
|
+
for label in labels_to_click:
|
688
|
+
if await frame.get_by_label(label).is_visible():
|
689
|
+
got_button = True
|
690
|
+
self.logger.debug(f'Got button by label on frame: {label}')
|
691
|
+
await frame.get_by_label(label).click(timeout=2000)
|
692
|
+
break
|
693
|
+
if await frame.get_by_role("button", name=label).is_visible():
|
694
|
+
got_button = True
|
695
|
+
self.logger.debug(f'Got button by role on frame: {label}')
|
696
|
+
await frame.get_by_role("button", name=label).click(timeout=2000)
|
697
|
+
break
|
698
|
+
except Exception as e:
|
699
|
+
self.logger.info(f'Issue with frame consent: {e}')
|
667
700
|
return got_button
|
668
701
|
|
702
|
+
async def _move_time_forward(self, page: Page, time: int) -> None:
|
703
|
+
time = max(time, 7)
|
704
|
+
try:
|
705
|
+
async with timeout(3):
|
706
|
+
await page.clock.run_for(random.randint((time - 5) * 1000,
|
707
|
+
(time + 5) * 1000))
|
708
|
+
self.logger.debug(f'Moved time forward by ~{time}s.')
|
709
|
+
except (TimeoutError, asyncio.TimeoutError):
|
710
|
+
self.logger.warning('Unable to move time forward.')
|
711
|
+
|
669
712
|
async def capture_page(self, url: str, *, max_depth_capture_time: int,
|
670
713
|
referer: str | None=None,
|
671
714
|
page: Page | None=None, depth: int=0,
|
@@ -734,7 +777,7 @@ class Capture():
|
|
734
777
|
capturing_sub = False
|
735
778
|
try:
|
736
779
|
page = await self.context.new_page()
|
737
|
-
|
780
|
+
await page.clock.install()
|
738
781
|
except Error as e:
|
739
782
|
self.logger.warning(f'The context is in a broken state: {e}')
|
740
783
|
self.should_retry = True
|
@@ -859,6 +902,21 @@ class Capture():
|
|
859
902
|
except Exception as e:
|
860
903
|
self.logger.warning(f'Could not find body: {e}')
|
861
904
|
|
905
|
+
await self._wait_for_random_timeout(page, 5)
|
906
|
+
# triggering clicks on very generic frames is sometimes impossible, using button and common language.
|
907
|
+
self.logger.debug('Check other frames for button')
|
908
|
+
for frame in page.frames:
|
909
|
+
if await self.__frame_consent(frame):
|
910
|
+
await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
|
911
|
+
self.logger.debug('Done with frames.')
|
912
|
+
|
913
|
+
self.logger.debug('Check main frame for button')
|
914
|
+
if await self.__frame_consent(page.main_frame):
|
915
|
+
self.logger.debug('Got button on main frame')
|
916
|
+
await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
|
917
|
+
|
918
|
+
await self._move_time_forward(page, 10)
|
919
|
+
|
862
920
|
if parsed_url.fragment:
|
863
921
|
# We got a fragment, make sure we go to it and scroll only a little bit.
|
864
922
|
fragment = unquote(parsed_url.fragment)
|
@@ -922,25 +980,10 @@ class Capture():
|
|
922
980
|
z.writestr(f'{i}_{filename}', file_content)
|
923
981
|
to_return["downloaded_file"] = mem_zip.getvalue()
|
924
982
|
|
925
|
-
# fast forward 30s
|
926
|
-
|
927
|
-
# async with timeout(3):
|
928
|
-
# await page.clock.run_for("47")
|
929
|
-
# self.logger.debug('Moved time forward.')
|
930
|
-
# except (TimeoutError, asyncio.TimeoutError):
|
931
|
-
# self.logger.warning('Unable to move time forward.')
|
983
|
+
# fast forward ~30s
|
984
|
+
await self._move_time_forward(page, 30)
|
932
985
|
|
933
986
|
self.logger.debug('Done with instrumentation, waiting for network idle.')
|
934
|
-
if allow_tracking:
|
935
|
-
self.logger.debug('Check iFrames for button')
|
936
|
-
for frame in page.frames:
|
937
|
-
frame_title = await frame.title()
|
938
|
-
self.logger.debug(f'Check button on {frame_title}')
|
939
|
-
if await self.__frame_consent(frame):
|
940
|
-
self.logger.debug(f'Got button on {frame_title}')
|
941
|
-
await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
|
942
|
-
self.logger.debug('Done with iFrames.')
|
943
|
-
|
944
987
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
|
945
988
|
await self._safe_wait(page)
|
946
989
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.25.11
|
3
|
+
Version: 1.25.12
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -31,7 +31,7 @@ Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
|
31
31
|
Requires-Dist: puremagic (>=1.27,<2.0)
|
32
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
33
33
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
34
|
-
Requires-Dist: setuptools (>=72.
|
34
|
+
Requires-Dist: setuptools (>=72.2.0,<73.0.0)
|
35
35
|
Requires-Dist: tzdata (>=2024.1,<2025.0)
|
36
36
|
Requires-Dist: w3lib (>=2.2.1,<3.0.0)
|
37
37
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
@@ -1,9 +1,9 @@
|
|
1
1
|
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
-
playwrightcapture/capture.py,sha256=
|
2
|
+
playwrightcapture/capture.py,sha256=R8ayCKpD1Q78ewaQ8-4tuMP8XVE8YGOwhwVYZDs4a8g,75867
|
3
3
|
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
4
|
playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
|
5
5
|
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
playwrightcapture-1.25.11.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
-
playwrightcapture-1.25.
|
8
|
-
playwrightcapture-1.25.11.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
-
playwrightcapture-1.25.11.dist-info/RECORD,,
|
6
|
+
playwrightcapture-1.25.12.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
+
playwrightcapture-1.25.12.dist-info/METADATA,sha256=Kw9CVpRSN9vLL1XSf_ZGIKAzX11VY4MJuXrO0OVCmHw,3172
|
8
|
+
playwrightcapture-1.25.12.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
+
playwrightcapture-1.25.12.dist-info/RECORD,,
|
File without changes
|
File without changes
|