PlaywrightCapture 1.25.10__py3-none-any.whl → 1.25.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- playwrightcapture/capture.py +87 -13
- {playwrightcapture-1.25.10.dist-info → playwrightcapture-1.25.12.dist-info}/METADATA +5 -5
- {playwrightcapture-1.25.10.dist-info → playwrightcapture-1.25.12.dist-info}/RECORD +5 -5
- {playwrightcapture-1.25.10.dist-info → playwrightcapture-1.25.12.dist-info}/LICENSE +0 -0
- {playwrightcapture-1.25.10.dist-info → playwrightcapture-1.25.12.dist-info}/WHEEL +0 -0
playwrightcapture/capture.py
CHANGED
@@ -532,7 +532,7 @@ class Capture():
|
|
532
532
|
async def handler() -> None:
|
533
533
|
self.logger.debug('Didomi dialog found, clicking through.')
|
534
534
|
if await page.locator("#didomi-notice-agree-button").is_visible():
|
535
|
-
await page.locator("#didomi-notice-agree-button").click(timeout=
|
535
|
+
await page.locator("#didomi-notice-agree-button").click(timeout=3000)
|
536
536
|
|
537
537
|
await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler, times=1, no_wait_after=True)
|
538
538
|
self.logger.info('Didomi handler added')
|
@@ -575,10 +575,7 @@ class Capture():
|
|
575
575
|
|
576
576
|
async def __dialog_alert_dialog_clickthrough(self, page: Page) -> None:
|
577
577
|
async def handler() -> None:
|
578
|
-
if await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").is_visible():
|
579
|
-
self.logger.info('Consent window found, clicking through.')
|
580
|
-
await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").click(timeout=2000)
|
581
|
-
elif await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
|
578
|
+
if await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
|
582
579
|
await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").click(timeout=1000)
|
583
580
|
else:
|
584
581
|
self.logger.info('Consent window found (alert dialog), but no button to click through.')
|
@@ -601,6 +598,11 @@ class Capture():
|
|
601
598
|
elif await page.get_by_test_id("uc-accept-all-button").is_visible():
|
602
599
|
self.logger.info('Consent window found, clicking through.')
|
603
600
|
await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
|
601
|
+
elif await page.locator('#axeptio_btn_acceptAll').is_visible():
|
602
|
+
await page.locator('#axeptio_btn_acceptAll').click(timeout=2000)
|
603
|
+
elif await page.locator('.fc-cta-consent').is_visible():
|
604
|
+
# https://developers.google.com/funding-choices/fc-api-docs
|
605
|
+
await page.locator('.fc-cta-consent').click(timeout=2000)
|
604
606
|
else:
|
605
607
|
self.logger.info('Consent window found (dialog), but no button to click through.')
|
606
608
|
await page.add_locator_handler(
|
@@ -646,6 +648,67 @@ class Capture():
|
|
646
648
|
)
|
647
649
|
self.logger.info('Piwik handler added')
|
648
650
|
|
651
|
+
async def __frame_consent(self, frame: Frame) -> bool:
|
652
|
+
"""Search & Click content in iframes. Cannot easily use the locator handler for this without having many many handlers.
|
653
|
+
And the iframes don't have a title or a role to easily identify them so we just try with generic locators that vary by language."""
|
654
|
+
|
655
|
+
labels_to_click: list[str] = [
|
656
|
+
# German
|
657
|
+
"Alle akzeptieren",
|
658
|
+
"Zustimmen & weiter",
|
659
|
+
# French
|
660
|
+
"Accepter et continuer",
|
661
|
+
"Tout accepter",
|
662
|
+
"Accepter",
|
663
|
+
"Accepter les cookies",
|
664
|
+
# English
|
665
|
+
"Accept & continue",
|
666
|
+
"Accept all",
|
667
|
+
# Dutch
|
668
|
+
"Accepteer",
|
669
|
+
# Spanish
|
670
|
+
"Aceptar todo",
|
671
|
+
# Italian
|
672
|
+
"Accetta tutto",
|
673
|
+
# Arabic
|
674
|
+
"قبول الكل",
|
675
|
+
# Portuguese
|
676
|
+
"Aceitar tudo",
|
677
|
+
# Polish
|
678
|
+
"Akceptuj wszystko",
|
679
|
+
]
|
680
|
+
|
681
|
+
got_button: bool = False
|
682
|
+
try:
|
683
|
+
if await frame.locator("button.button__acceptAll").is_visible():
|
684
|
+
self.logger.info('Consent window found, clicking through.')
|
685
|
+
got_button = True
|
686
|
+
await frame.locator("button.button__acceptAll").click(timeout=2000)
|
687
|
+
for label in labels_to_click:
|
688
|
+
if await frame.get_by_label(label).is_visible():
|
689
|
+
got_button = True
|
690
|
+
self.logger.debug(f'Got button by label on frame: {label}')
|
691
|
+
await frame.get_by_label(label).click(timeout=2000)
|
692
|
+
break
|
693
|
+
if await frame.get_by_role("button", name=label).is_visible():
|
694
|
+
got_button = True
|
695
|
+
self.logger.debug(f'Got button by role on frame: {label}')
|
696
|
+
await frame.get_by_role("button", name=label).click(timeout=2000)
|
697
|
+
break
|
698
|
+
except Exception as e:
|
699
|
+
self.logger.info(f'Issue with frame consent: {e}')
|
700
|
+
return got_button
|
701
|
+
|
702
|
+
async def _move_time_forward(self, page: Page, time: int) -> None:
|
703
|
+
time = max(time, 7)
|
704
|
+
try:
|
705
|
+
async with timeout(3):
|
706
|
+
await page.clock.run_for(random.randint((time - 5) * 1000,
|
707
|
+
(time + 5) * 1000))
|
708
|
+
self.logger.debug(f'Moved time forward by ~{time}s.')
|
709
|
+
except (TimeoutError, asyncio.TimeoutError):
|
710
|
+
self.logger.warning('Unable to move time forward.')
|
711
|
+
|
649
712
|
async def capture_page(self, url: str, *, max_depth_capture_time: int,
|
650
713
|
referer: str | None=None,
|
651
714
|
page: Page | None=None, depth: int=0,
|
@@ -714,7 +777,7 @@ class Capture():
|
|
714
777
|
capturing_sub = False
|
715
778
|
try:
|
716
779
|
page = await self.context.new_page()
|
717
|
-
|
780
|
+
await page.clock.install()
|
718
781
|
except Error as e:
|
719
782
|
self.logger.warning(f'The context is in a broken state: {e}')
|
720
783
|
self.should_retry = True
|
@@ -839,6 +902,21 @@ class Capture():
|
|
839
902
|
except Exception as e:
|
840
903
|
self.logger.warning(f'Could not find body: {e}')
|
841
904
|
|
905
|
+
await self._wait_for_random_timeout(page, 5)
|
906
|
+
# triggering clicks on very generic frames is sometimes impossible, using button and common language.
|
907
|
+
self.logger.debug('Check other frames for button')
|
908
|
+
for frame in page.frames:
|
909
|
+
if await self.__frame_consent(frame):
|
910
|
+
await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
|
911
|
+
self.logger.debug('Done with frames.')
|
912
|
+
|
913
|
+
self.logger.debug('Check main frame for button')
|
914
|
+
if await self.__frame_consent(page.main_frame):
|
915
|
+
self.logger.debug('Got button on main frame')
|
916
|
+
await self._wait_for_random_timeout(page, 10) # Wait 10 sec after click
|
917
|
+
|
918
|
+
await self._move_time_forward(page, 10)
|
919
|
+
|
842
920
|
if parsed_url.fragment:
|
843
921
|
# We got a fragment, make sure we go to it and scroll only a little bit.
|
844
922
|
fragment = unquote(parsed_url.fragment)
|
@@ -902,17 +980,13 @@ class Capture():
|
|
902
980
|
z.writestr(f'{i}_{filename}', file_content)
|
903
981
|
to_return["downloaded_file"] = mem_zip.getvalue()
|
904
982
|
|
905
|
-
# fast forward 30s
|
906
|
-
|
907
|
-
# async with timeout(3):
|
908
|
-
# await page.clock.run_for("47")
|
909
|
-
# self.logger.debug('Moved time forward.')
|
910
|
-
# except (TimeoutError, asyncio.TimeoutError):
|
911
|
-
# self.logger.warning('Unable to move time forward.')
|
983
|
+
# fast forward ~30s
|
984
|
+
await self._move_time_forward(page, 30)
|
912
985
|
|
913
986
|
self.logger.debug('Done with instrumentation, waiting for network idle.')
|
914
987
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
|
915
988
|
await self._safe_wait(page)
|
989
|
+
|
916
990
|
self.logger.debug('Done with instrumentation, done with waiting.')
|
917
991
|
|
918
992
|
if content := await self._failsafe_get_content(page):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.25.10
|
3
|
+
Version: 1.25.12
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -22,16 +22,16 @@ Classifier: Topic :: Security
|
|
22
22
|
Provides-Extra: recaptcha
|
23
23
|
Requires-Dist: SpeechRecognition (>=3.10.4,<4.0.0) ; extra == "recaptcha"
|
24
24
|
Requires-Dist: aiohttp-socks (>=0.9,<0.10)
|
25
|
-
Requires-Dist: aiohttp[speedups] (>=3.10.
|
25
|
+
Requires-Dist: aiohttp[speedups] (>=3.10.3,<4.0.0)
|
26
26
|
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
27
27
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
|
28
28
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
29
|
-
Requires-Dist: playwright (>=1.
|
29
|
+
Requires-Dist: playwright (>=1.46.0,<2.0.0)
|
30
30
|
Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
31
|
-
Requires-Dist: puremagic (>=1.
|
31
|
+
Requires-Dist: puremagic (>=1.27,<2.0)
|
32
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
33
33
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
34
|
-
Requires-Dist: setuptools (>=72.
|
34
|
+
Requires-Dist: setuptools (>=72.2.0,<73.0.0)
|
35
35
|
Requires-Dist: tzdata (>=2024.1,<2025.0)
|
36
36
|
Requires-Dist: w3lib (>=2.2.1,<3.0.0)
|
37
37
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
@@ -1,9 +1,9 @@
|
|
1
1
|
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
-
playwrightcapture/capture.py,sha256=
|
2
|
+
playwrightcapture/capture.py,sha256=R8ayCKpD1Q78ewaQ8-4tuMP8XVE8YGOwhwVYZDs4a8g,75867
|
3
3
|
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
4
|
playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
|
5
5
|
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
playwrightcapture-1.25.10.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
-
playwrightcapture-1.25.
|
8
|
-
playwrightcapture-1.25.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
-
playwrightcapture-1.25.10.dist-info/RECORD,,
|
6
|
+
playwrightcapture-1.25.12.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
+
playwrightcapture-1.25.12.dist-info/METADATA,sha256=Kw9CVpRSN9vLL1XSf_ZGIKAzX11VY4MJuXrO0OVCmHw,3172
|
8
|
+
playwrightcapture-1.25.12.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
+
playwrightcapture-1.25.12.dist-info/RECORD,,
|
File without changes
|
File without changes
|