PlaywrightCapture 1.25.2__py3-none-any.whl → 1.25.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- playwrightcapture/capture.py +38 -27
- {playwrightcapture-1.25.2.dist-info → playwrightcapture-1.25.4.dist-info}/METADATA +2 -2
- {playwrightcapture-1.25.2.dist-info → playwrightcapture-1.25.4.dist-info}/RECORD +5 -5
- {playwrightcapture-1.25.2.dist-info → playwrightcapture-1.25.4.dist-info}/LICENSE +0 -0
- {playwrightcapture-1.25.2.dist-info → playwrightcapture-1.25.4.dist-info}/WHEEL +0 -0
playwrightcapture/capture.py
CHANGED
@@ -32,7 +32,7 @@ from playwright._impl._errors import TargetClosedError
|
|
32
32
|
from playwright.async_api import async_playwright, Frame, Error, Page, Download, Request
|
33
33
|
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
34
34
|
from playwright_stealth import stealth_async, StealthConfig # type: ignore[import-untyped]
|
35
|
-
from puremagic import PureError, from_string
|
35
|
+
from puremagic import PureError, from_string
|
36
36
|
from w3lib.html import strip_html5_whitespace
|
37
37
|
from w3lib.url import canonicalize_url, safe_url_string
|
38
38
|
|
@@ -682,7 +682,7 @@ class Capture():
|
|
682
682
|
capturing_sub = False
|
683
683
|
try:
|
684
684
|
page = await self.context.new_page()
|
685
|
-
await page.clock.install()
|
685
|
+
# await page.clock.install()
|
686
686
|
except Error as e:
|
687
687
|
self.logger.warning(f'The context is in a broken state: {e}')
|
688
688
|
self.should_retry = True
|
@@ -706,6 +706,7 @@ class Capture():
|
|
706
706
|
page.set_default_timeout((self._capture_timeout - 2) * 1000)
|
707
707
|
# trigger a callback on each request to store it in a dict indexed by URL to get it back from the favicon fetcher
|
708
708
|
page.on("requestfinished", store_request)
|
709
|
+
page.on("dialog", lambda dialog: dialog.accept())
|
709
710
|
|
710
711
|
try:
|
711
712
|
# Parse the URL. If there is a fragment, we need to scroll to it manually
|
@@ -761,31 +762,31 @@ class Capture():
|
|
761
762
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
|
762
763
|
self.logger.debug('Start instrumentation.')
|
763
764
|
|
764
|
-
# ==== recaptcha
|
765
|
-
# Same technique as: https://github.com/NikolaiT/uncaptcha3
|
766
|
-
if CAN_SOLVE_CAPTCHA:
|
767
|
-
try:
|
768
|
-
if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
|
769
|
-
and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
|
770
|
-
self.logger.info('Found a captcha')
|
771
|
-
await self._recaptcha_solver(page)
|
772
|
-
except PlaywrightTimeoutError as e:
|
773
|
-
self.logger.info(f'Captcha on {url} is not ready: {e}')
|
774
|
-
except TargetClosedError as e:
|
775
|
-
self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
|
776
|
-
except Error as e:
|
777
|
-
self.logger.warning(f'Error while resolving captcha on {url}: {e}')
|
778
|
-
except Exception as e:
|
779
|
-
self.logger.exception(f'General error with captcha solving on {url}: {e}')
|
780
|
-
# ======
|
781
|
-
# NOTE: testing
|
782
|
-
# await self.__cloudflare_bypass_attempt(page)
|
783
|
-
self.logger.debug('Done with captcha.')
|
784
|
-
|
785
765
|
# check if we have anything on the page. If we don't, the page is not working properly.
|
786
766
|
if await self._failsafe_get_content(page):
|
787
767
|
self.logger.debug('Got rendered content')
|
788
768
|
|
769
|
+
# ==== recaptcha
|
770
|
+
# Same technique as: https://github.com/NikolaiT/uncaptcha3
|
771
|
+
if CAN_SOLVE_CAPTCHA:
|
772
|
+
try:
|
773
|
+
if (await page.locator("//iframe[@title='reCAPTCHA']").first.is_visible(timeout=3000)
|
774
|
+
and await page.locator("//iframe[@title='reCAPTCHA']").first.is_enabled(timeout=2000)):
|
775
|
+
self.logger.info('Found a captcha')
|
776
|
+
await self._recaptcha_solver(page)
|
777
|
+
except PlaywrightTimeoutError as e:
|
778
|
+
self.logger.info(f'Captcha on {url} is not ready: {e}')
|
779
|
+
except TargetClosedError as e:
|
780
|
+
self.logger.warning(f'Target closed while resolving captcha on {url}: {e}')
|
781
|
+
except Error as e:
|
782
|
+
self.logger.warning(f'Error while resolving captcha on {url}: {e}')
|
783
|
+
except Exception as e:
|
784
|
+
self.logger.exception(f'General error with captcha solving on {url}: {e}')
|
785
|
+
# ======
|
786
|
+
# NOTE: testing
|
787
|
+
# await self.__cloudflare_bypass_attempt(page)
|
788
|
+
self.logger.debug('Done with captcha.')
|
789
|
+
|
789
790
|
# move mouse
|
790
791
|
await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
|
791
792
|
self.logger.debug('Moved mouse.')
|
@@ -865,8 +866,12 @@ class Capture():
|
|
865
866
|
to_return["downloaded_file"] = mem_zip.getvalue()
|
866
867
|
|
867
868
|
# fast forward 30s
|
868
|
-
|
869
|
-
|
869
|
+
# try:
|
870
|
+
# async with timeout(3):
|
871
|
+
# await page.clock.run_for("47")
|
872
|
+
# self.logger.debug('Moved time forward.')
|
873
|
+
# except TimeoutError:
|
874
|
+
# self.logger.warning('Unable to move time forward.')
|
870
875
|
|
871
876
|
self.logger.debug('Done with instrumentation, waiting for network idle.')
|
872
877
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
|
@@ -1077,8 +1082,9 @@ class Capture():
|
|
1077
1082
|
tries = 3
|
1078
1083
|
while tries:
|
1079
1084
|
try:
|
1080
|
-
|
1081
|
-
|
1085
|
+
async with timeout(30):
|
1086
|
+
return await page.content()
|
1087
|
+
except (Error, TimeoutError):
|
1082
1088
|
self.logger.debug('Unable to get page content, trying again.')
|
1083
1089
|
tries -= 1
|
1084
1090
|
await self._wait_for_random_timeout(page, 1)
|
@@ -1224,6 +1230,11 @@ class Capture():
|
|
1224
1230
|
if ': ' in name:
|
1225
1231
|
_, name = name.split(': ', maxsplit=1)
|
1226
1232
|
exception._name = name.strip()
|
1233
|
+
else:
|
1234
|
+
# The format changed in Playwright 1.43.0, the name of the method that failed is set before the exception itself.
|
1235
|
+
if ': ' in exception.message:
|
1236
|
+
_, name = exception.message.split(': ', maxsplit=1)
|
1237
|
+
exception._name = name.strip()
|
1227
1238
|
|
1228
1239
|
def _exception_is_network_error(self, exception: Error) -> bool:
|
1229
1240
|
if exception.name in [
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.25.2
|
3
|
+
Version: 1.25.4
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -28,7 +28,7 @@ Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
|
|
28
28
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
29
29
|
Requires-Dist: playwright (>=1.45.0,<2.0.0)
|
30
30
|
Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
31
|
-
Requires-Dist: puremagic (>=1.
|
31
|
+
Requires-Dist: puremagic (>=1.26,<2.0)
|
32
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
33
33
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
34
34
|
Requires-Dist: setuptools (>=70.3.0,<71.0.0)
|
@@ -1,9 +1,9 @@
|
|
1
1
|
playwrightcapture/__init__.py,sha256=F90Y8wYS13tDjgsfjuFrCfmzQfdnH44G-ovuilJfLEE,511
|
2
|
-
playwrightcapture/capture.py,sha256=
|
2
|
+
playwrightcapture/capture.py,sha256=ANNPmaTgAIDihdqRDXkuc4LBjZeqcA7EAQpr7zXEpww,70047
|
3
3
|
playwrightcapture/exceptions.py,sha256=LhGJQCGHzEu7Sx2Dfl28OFeDg1OmrwufFjAWXlxQnEA,366
|
4
4
|
playwrightcapture/helpers.py,sha256=SXQLEuxMs8-bcWykMiUVosHzzxBKuS-QC0gBV3OmKmo,1764
|
5
5
|
playwrightcapture/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
playwrightcapture-1.25.2.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
-
playwrightcapture-1.25.
|
8
|
-
playwrightcapture-1.25.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
-
playwrightcapture-1.25.2.dist-info/RECORD,,
|
6
|
+
playwrightcapture-1.25.4.dist-info/LICENSE,sha256=uwFc39fTLacBUG-XTuxX6IQKTKhg4z14gWOLt3ex4Ho,1775
|
7
|
+
playwrightcapture-1.25.4.dist-info/METADATA,sha256=gsWFbtAU24Ag1VpP65y6M3tOl5wTy2dWWVNf5AmOETU,3173
|
8
|
+
playwrightcapture-1.25.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
9
|
+
playwrightcapture-1.25.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|