PlaywrightCapture 1.25.3__tar.gz → 1.25.5__tar.gz
This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- {playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/PKG-INFO +1 -1
- {playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/playwrightcapture/capture.py +55 -33
- {playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/pyproject.toml +1 -1
- {playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/LICENSE +0 -0
- {playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/README.md +0 -0
- {playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.25.3 → playwrightcapture-1.25.5}/playwrightcapture/py.typed +0 -0
@@ -165,10 +165,15 @@ class Capture():
|
|
165
165
|
if proxy:
|
166
166
|
if isinstance(proxy, str):
|
167
167
|
self.proxy = {'server': proxy}
|
168
|
-
|
168
|
+
elif isinstance(proxy, dict):
|
169
169
|
self.proxy = {'server': proxy['server'], 'bypass': proxy.get('bypass', ''),
|
170
170
|
'username': proxy.get('username', ''),
|
171
171
|
'password': proxy.get('password', '')}
|
172
|
+
elif isinstance(proxy, int):
|
173
|
+
# This is clearly a mistake, just ignoring it
|
174
|
+
self.logger.warning('Proxy is an integer, this is a mistake, ignoring it.')
|
175
|
+
else:
|
176
|
+
raise InvalidPlaywrightParameter(f'Invalid proxy parameter: "{proxy}" ({type(proxy)})')
|
172
177
|
|
173
178
|
self.should_retry: bool = False
|
174
179
|
self.__network_not_idle: int = 2 # makes sure we do not wait for network idle the max amount of time the capture is allowed to take
|
@@ -682,8 +687,7 @@ class Capture():
|
|
682
687
|
capturing_sub = False
|
683
688
|
try:
|
684
689
|
page = await self.context.new_page()
|
685
|
-
await page.clock.install()
|
686
|
-
page.on("dialog", lambda dialog: dialog.accept())
|
690
|
+
# await page.clock.install()
|
687
691
|
except Error as e:
|
688
692
|
self.logger.warning(f'The context is in a broken state: {e}')
|
689
693
|
self.should_retry = True
|
@@ -707,6 +711,7 @@ class Capture():
|
|
707
711
|
page.set_default_timeout((self._capture_timeout - 2) * 1000)
|
708
712
|
# trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
|
709
713
|
page.on("requestfinished", store_request)
|
714
|
+
page.on("dialog", lambda dialog: dialog.accept())
|
710
715
|
|
711
716
|
try:
|
712
717
|
# Parse the URL. If there is a fragment, we need to scroll to it manually
|
@@ -762,31 +767,31 @@ class Capture():
|
|
762
767
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
|
763
768
|
self.logger.debug('Start instrumentation.')
|
764
769
|
|
765
|
-
# ==== recaptcha
|
766
|
-
# Same technique as: https://github.com/NikolaiT/uncaptcha3
|
767
|
-
if CAN_SOLVE_CAPTCHA:
|
768
|
-
try:
|
769
|
-
if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
|
770
|
-
and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
|
771
|
-
self.logger.info('Found a captcha')
|
772
|
-
await self._recaptcha_solver(page)
|
773
|
-
except PlaywrightTimeoutError as e:
|
774
|
-
self.logger.info(f'Captcha on {url} is not ready: {e}')
|
775
|
-
except TargetClosedError as e:
|
776
|
-
self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
|
777
|
-
except Error as e:
|
778
|
-
self.logger.warning(f'Error while resolving captcha on {url}: {e}')
|
779
|
-
except Exception as e:
|
780
|
-
self.logger.exception(f'General error with captcha solving on {url}: {e}')
|
781
|
-
# ======
|
782
|
-
# NOTE: testing
|
783
|
-
# await self.__cloudflare_bypass_attempt(page)
|
784
|
-
self.logger.debug('Done with captcha.')
|
785
|
-
|
786
770
|
# check if we have anything on the page. If we don't, the page is not working properly.
|
787
771
|
if await self._failsafe_get_content(page):
|
788
772
|
self.logger.debug('Got rendered content')
|
789
773
|
|
774
|
+
# ==== recaptcha
|
775
|
+
# Same technique as: https://github.com/NikolaiT/uncaptcha3
|
776
|
+
if CAN_SOLVE_CAPTCHA:
|
777
|
+
try:
|
778
|
+
if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
|
779
|
+
and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
|
780
|
+
self.logger.info('Found a captcha')
|
781
|
+
await self._recaptcha_solver(page)
|
782
|
+
except PlaywrightTimeoutError as e:
|
783
|
+
self.logger.info(f'Captcha on {url} is not ready: {e}')
|
784
|
+
except TargetClosedError as e:
|
785
|
+
self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
|
786
|
+
except Error as e:
|
787
|
+
self.logger.warning(f'Error while resolving captcha on {url}: {e}')
|
788
|
+
except Exception as e:
|
789
|
+
self.logger.exception(f'General error with captcha solving on {url}: {e}')
|
790
|
+
# ======
|
791
|
+
# NOTE: testing
|
792
|
+
# await self.__cloudflare_bypass_attempt(page)
|
793
|
+
self.logger.debug('Done with captcha.')
|
794
|
+
|
790
795
|
# move mouse
|
791
796
|
await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
|
792
797
|
self.logger.debug('Moved mouse.')
|
@@ -812,7 +817,7 @@ class Capture():
|
|
812
817
|
try:
|
813
818
|
await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
|
814
819
|
await self._wait_for_random_timeout(page, 2)
|
815
|
-
async with timeout(
|
820
|
+
async with timeout(5):
|
816
821
|
await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
|
817
822
|
self.logger.debug('Jumped to fragment.')
|
818
823
|
except PlaywrightTimeoutError as e:
|
@@ -821,20 +826,24 @@ class Capture():
|
|
821
826
|
self.logger.warning(f'Target closed, unable to go to fragment "{fragment}": {e}')
|
822
827
|
except Error as e:
|
823
828
|
self.logger.exception(f'Unable to go to fragment "{fragment}": {e}')
|
824
|
-
except TimeoutError:
|
829
|
+
except (asyncio.TimeoutError, TimeoutError):
|
825
830
|
self.logger.debug('Unable to scroll due to timeout')
|
831
|
+
except (asyncio.CancelledError):
|
832
|
+
self.logger.debug('Unable to scroll due to timeout, call canceled')
|
826
833
|
else:
|
827
834
|
# scroll more
|
828
835
|
try:
|
829
836
|
# NOTE using page.mouse.wheel causes the instrumentation to fail, sometimes.
|
830
837
|
# 2024-07-08: Also, it sometimes gets stuck.
|
831
|
-
async with timeout(
|
838
|
+
async with timeout(5):
|
832
839
|
await page.mouse.wheel(delta_y=random.uniform(1500, 3000), delta_x=0)
|
833
840
|
self.logger.debug('Scrolled down.')
|
834
841
|
except Error as e:
|
835
842
|
self.logger.debug(f'Unable to scroll: {e}')
|
836
|
-
except TimeoutError:
|
843
|
+
except (TimeoutError, asyncio.TimeoutError):
|
837
844
|
self.logger.debug('Unable to scroll due to timeout')
|
845
|
+
except (asyncio.CancelledError):
|
846
|
+
self.logger.debug('Unable to scroll due to timeout, call canceled')
|
838
847
|
|
839
848
|
await self._wait_for_random_timeout(page, 3)
|
840
849
|
self.logger.debug('Keep going after moving on page.')
|
@@ -866,8 +875,12 @@ class Capture():
|
|
866
875
|
to_return["downloaded_file"] = mem_zip.getvalue()
|
867
876
|
|
868
877
|
# fast forward 30s
|
869
|
-
|
870
|
-
|
878
|
+
# try:
|
879
|
+
# async with timeout(3):
|
880
|
+
# await page.clock.run_for("47")
|
881
|
+
# self.logger.debug('Moved time forward.')
|
882
|
+
# except (TimeoutError, asyncio.TimeoutError):
|
883
|
+
# self.logger.warning('Unable to move time forward.')
|
871
884
|
|
872
885
|
self.logger.debug('Done with instrumentation, waiting for network idle.')
|
873
886
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
|
@@ -915,7 +928,7 @@ class Capture():
|
|
915
928
|
rendered_hostname_only=rendered_hostname_only,
|
916
929
|
max_depth_capture_time=max_capture_time)
|
917
930
|
to_return['children'].append(child_capture) # type: ignore[union-attr]
|
918
|
-
except (TimeoutError, asyncio.
|
931
|
+
except (TimeoutError, asyncio.TimeoutError):
|
919
932
|
self.logger.info(f'Timeout error, took more than {max_capture_time}s. Unable to capture {url}.')
|
920
933
|
consecutive_errors += 1
|
921
934
|
except Exception as e:
|
@@ -943,6 +956,9 @@ class Capture():
|
|
943
956
|
except PlaywrightTimeoutError as e:
|
944
957
|
to_return['error'] = f"The capture took too long - {e.message}"
|
945
958
|
self.should_retry = True
|
959
|
+
except (asyncio.TimeoutError, TimeoutError):
|
960
|
+
to_return['error'] = "Something in the capture took too long"
|
961
|
+
self.should_retry = True
|
946
962
|
except TargetClosedError as e:
|
947
963
|
to_return['error'] = f"The target was closed - {e}"
|
948
964
|
self.should_retry = True
|
@@ -1078,8 +1094,9 @@ class Capture():
|
|
1078
1094
|
tries = 3
|
1079
1095
|
while tries:
|
1080
1096
|
try:
|
1081
|
-
|
1082
|
-
|
1097
|
+
async with timeout(30):
|
1098
|
+
return await page.content()
|
1099
|
+
except (Error, TimeoutError):
|
1083
1100
|
self.logger.debug('Unable to get page content, trying again.')
|
1084
1101
|
tries -= 1
|
1085
1102
|
await self._wait_for_random_timeout(page, 1)
|
@@ -1225,6 +1242,11 @@ class Capture():
|
|
1225
1242
|
if ': ' in name:
|
1226
1243
|
_, name = name.split(': ', maxsplit=1)
|
1227
1244
|
exception._name = name.strip()
|
1245
|
+
else:
|
1246
|
+
# The format changed in Playwright 1.43.0: the name of the method that failed is now set before the exception itself.
|
1247
|
+
if ': ' in exception.message:
|
1248
|
+
_, name = exception.message.split(': ', maxsplit=1)
|
1249
|
+
exception._name = name.strip()
|
1228
1250
|
|
1229
1251
|
def _exception_is_network_error(self, exception: Error) -> bool:
|
1230
1252
|
if exception.name in [
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|