PlaywrightCapture 1.23.11__py3-none-any.whl → 1.23.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- playwrightcapture/capture.py +19 -12
- {playwrightcapture-1.23.11.dist-info → playwrightcapture-1.23.13.dist-info}/METADATA +1 -1
- {playwrightcapture-1.23.11.dist-info → playwrightcapture-1.23.13.dist-info}/RECORD +5 -5
- {playwrightcapture-1.23.11.dist-info → playwrightcapture-1.23.13.dist-info}/LICENSE +0 -0
- {playwrightcapture-1.23.11.dist-info → playwrightcapture-1.23.13.dist-info}/WHEEL +0 -0
playwrightcapture/capture.py
CHANGED
@@ -122,7 +122,7 @@ class Capture():
|
|
122
122
|
'password': proxy.get('password', '')}
|
123
123
|
|
124
124
|
self.should_retry: bool = False
|
125
|
-
self.__network_not_idle: int =
|
125
|
+
self.__network_not_idle: int = 2 # makes sure we do not wait for network idle the max amount of time the capture is allowed to take
|
126
126
|
self._cookies: list[SetCookieParam] = []
|
127
127
|
self._http_credentials: HttpCredentials = {}
|
128
128
|
self._geolocation: Geolocation = {}
|
@@ -461,7 +461,7 @@ class Capture():
|
|
461
461
|
await page.locator("#onetrust-accept-btn-handler").click()
|
462
462
|
|
463
463
|
await page.add_locator_handler(
|
464
|
-
page.locator('
|
464
|
+
page.locator('#onetrust-banner-sdk'),
|
465
465
|
handler
|
466
466
|
)
|
467
467
|
self.logger.info('OT handler added')
|
@@ -524,6 +524,7 @@ class Capture():
|
|
524
524
|
) -> CaptureResponse:
|
525
525
|
|
526
526
|
to_return: CaptureResponse = {}
|
527
|
+
got_favicons = False
|
527
528
|
|
528
529
|
# We don't need to be super strict on the lock, as it simply triggers a wait for network idle before stopping the capture
|
529
530
|
# but we still need it to be an integer in case we have more than one download triggered and one finished when the others haven't
|
@@ -551,8 +552,12 @@ class Capture():
|
|
551
552
|
|
552
553
|
async def store_request(request: Request) -> None:
|
553
554
|
# This method is called on each request, to store the body (if it is an image) in a dict indexed by URL
|
555
|
+
if got_favicons:
|
556
|
+
return
|
554
557
|
try:
|
555
558
|
if response := await request.response():
|
559
|
+
if got_favicons:
|
560
|
+
return
|
556
561
|
if response.ok:
|
557
562
|
try:
|
558
563
|
if body := await response.body():
|
@@ -585,7 +590,7 @@ class Capture():
|
|
585
590
|
await stealth_async(page)
|
586
591
|
page.set_default_timeout(self._capture_timeout * 1000)
|
587
592
|
# trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
|
588
|
-
page.on("
|
593
|
+
page.on("requestfinished", store_request)
|
589
594
|
|
590
595
|
try:
|
591
596
|
# Parse the URL. If there is a fragment, we need to scroll to it manually
|
@@ -645,8 +650,8 @@ class Capture():
|
|
645
650
|
# Same technique as: https://github.com/NikolaiT/uncaptcha3
|
646
651
|
if CAN_SOLVE_CAPTCHA:
|
647
652
|
try:
|
648
|
-
if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=
|
649
|
-
and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=
|
653
|
+
if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
|
654
|
+
and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
|
650
655
|
self.logger.info('Found a captcha')
|
651
656
|
await self._recaptcha_solver(page)
|
652
657
|
except PlaywrightTimeoutError as e:
|
@@ -675,7 +680,7 @@ class Capture():
|
|
675
680
|
# We got a fragment, make sure we go to it and scroll only a little bit.
|
676
681
|
fragment = unquote(parsed_url.fragment)
|
677
682
|
try:
|
678
|
-
await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=
|
683
|
+
await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
|
679
684
|
await self._safe_wait(page)
|
680
685
|
await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
|
681
686
|
self.logger.debug('Jumped to fragment.')
|
@@ -717,10 +722,12 @@ class Capture():
|
|
717
722
|
|
718
723
|
to_return['last_redirected_url'] = page.url
|
719
724
|
|
720
|
-
to_return['png'] = await self._failsafe_get_screenshot(page)
|
721
|
-
|
722
725
|
if 'html' in to_return and to_return['html'] is not None and with_favicon:
|
723
726
|
to_return['potential_favicons'] = self.get_favicons(page.url, to_return['html'])
|
727
|
+
got_favicons = True
|
728
|
+
|
729
|
+
await self._safe_wait(page)
|
730
|
+
to_return['png'] = await self._failsafe_get_screenshot(page)
|
724
731
|
|
725
732
|
if self.wait_for_download > 0:
|
726
733
|
self.logger.info('Waiting for download to finish...')
|
@@ -853,17 +860,17 @@ class Capture():
|
|
853
860
|
|
854
861
|
async def _failsafe_get_screenshot(self, page: Page) -> bytes:
|
855
862
|
try:
|
856
|
-
return await page.screenshot(full_page=True)
|
863
|
+
return await page.screenshot(full_page=True, timeout=5000)
|
857
864
|
except Error as e:
|
858
865
|
self.logger.info(f"Capturing a screenshot of the full page failed, trying to scale it down: {e}")
|
859
866
|
|
860
867
|
try:
|
861
|
-
return await page.screenshot(full_page=True, scale="css")
|
868
|
+
return await page.screenshot(full_page=True, scale="css", timeout=10000)
|
862
869
|
except Error as e:
|
863
870
|
self.logger.info(f"Capturing a screenshot of the full page failed, trying to get the current viewport only: {e}")
|
864
871
|
|
865
872
|
try:
|
866
|
-
return await page.screenshot()
|
873
|
+
return await page.screenshot(scale="css", animations='disabled', timeout=10000)
|
867
874
|
except Error as e:
|
868
875
|
self.logger.warning(f"Unable to get any screenshot: {e}")
|
869
876
|
raise e
|
@@ -871,7 +878,7 @@ class Capture():
|
|
871
878
|
async def _safe_wait(self, page: Page) -> None:
|
872
879
|
try:
|
873
880
|
# If we don't have networkidle relatively quickly, it's probably because we're playing a video.
|
874
|
-
await page.wait_for_load_state('networkidle', timeout=
|
881
|
+
await page.wait_for_load_state('networkidle', timeout=self._capture_timeout / self.__network_not_idle)
|
875
882
|
except PlaywrightTimeoutError:
|
876
883
|
# Network never idle, keep going
|
877
884
|
self.__network_not_idle += 1
|
@@ -1,9 +1,9 @@
|
|
1
1
|
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
-
playwrightcapture/capture.py,sha256=
|
2
|
+
playwrightcapture/capture.py,sha256=6fOxWTdJT7ViyTnm1cGcuHi2v5sWB-06Fsg2PlsNmX0,57699
|
3
3
|
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
4
|
playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
|
5
5
|
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
playwrightcapture-1.23.
|
7
|
-
playwrightcapture-1.23.
|
8
|
-
playwrightcapture-1.23.
|
9
|
-
playwrightcapture-1.23.
|
6
|
+
playwrightcapture-1.23.13.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
+
playwrightcapture-1.23.13.dist-info/METADATA,sha256=QK9T71_mPN1K-xKeOS8D5hWYR5DuA_plG1mJNtLmto8,3048
|
8
|
+
playwrightcapture-1.23.13.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
+
playwrightcapture-1.23.13.dist-info/RECORD,,
|
File without changes
|
File without changes
|