PlaywrightCapture 1.23.14__tar.gz → 1.24.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.23.14 → playwrightcapture-1.24.0}/PKG-INFO +2 -2
- {playwrightcapture-1.23.14 → playwrightcapture-1.24.0}/playwrightcapture/capture.py +28 -6
- {playwrightcapture-1.23.14 → playwrightcapture-1.24.0}/pyproject.toml +2 -2
- {playwrightcapture-1.23.14 → playwrightcapture-1.24.0}/LICENSE +0 -0
- {playwrightcapture-1.23.14 → playwrightcapture-1.24.0}/README.md +0 -0
- {playwrightcapture-1.23.14 → playwrightcapture-1.24.0}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.23.14 → playwrightcapture-1.24.0}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.23.14 → playwrightcapture-1.24.0}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.23.14 → playwrightcapture-1.24.0}/playwrightcapture/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.23.14
|
3
|
+
Version: 1.24.0
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3.12
|
|
20
20
|
Classifier: Topic :: Internet
|
21
21
|
Classifier: Topic :: Security
|
22
22
|
Provides-Extra: recaptcha
|
23
|
-
Requires-Dist: SpeechRecognition (>=3.10.
|
23
|
+
Requires-Dist: SpeechRecognition (>=3.10.2,<4.0.0) ; extra == "recaptcha"
|
24
24
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
|
25
25
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
26
26
|
Requires-Dist: playwright (>=1.42.0,<2.0.0)
|
@@ -511,10 +511,16 @@ class Capture():
|
|
511
511
|
if await page.locator(".qc-cmp2-summary-buttons").locator("button").first.is_visible():
|
512
512
|
self.logger.info('Consent window found, clicking through.')
|
513
513
|
await page.locator(".qc-cmp2-summary-buttons").locator("button").locator("nth=-1").click(timeout=2000)
|
514
|
+
elif await page.locator("#popin_tc_privacy").locator("#popin_tc_privacy_button_2").is_visible():
|
515
|
+
self.logger.info('Consent window found, clicking through.')
|
516
|
+
await page.locator("#popin_tc_privacy").locator("#popin_tc_privacy_button_2").click(timeout=2000)
|
517
|
+
elif await page.get_by_test_id("uc-accept-all-button").is_visible():
|
518
|
+
self.logger.info('Consent window found, clicking through.')
|
519
|
+
await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
|
514
520
|
else:
|
515
521
|
self.logger.info('Consent window found, but no button to click through.')
|
516
522
|
await page.add_locator_handler(
|
517
|
-
page.get_by_role("dialog"),
|
523
|
+
page.get_by_role("dialog").last,
|
518
524
|
handler
|
519
525
|
)
|
520
526
|
self.logger.info('dialog handler added')
|
@@ -543,6 +549,18 @@ class Capture():
|
|
543
549
|
)
|
544
550
|
self.logger.info('Yahoo handler added')
|
545
551
|
|
552
|
+
async def __dialog_ppms_clickthrough(self, page: Page) -> None:
|
553
|
+
async def handler() -> None:
|
554
|
+
self.logger.info('######## piwik found, clicking through.')
|
555
|
+
if await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").is_visible():
|
556
|
+
await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").click(timeout=2000)
|
557
|
+
|
558
|
+
await page.add_locator_handler(
|
559
|
+
page.locator('#ppms_cm_popup_overlay'),
|
560
|
+
handler
|
561
|
+
)
|
562
|
+
self.logger.info('Yahoo handler added')
|
563
|
+
|
546
564
|
async def capture_page(self, url: str, *, max_depth_capture_time: int,
|
547
565
|
referer: str | None=None,
|
548
566
|
page: Page | None=None, depth: int=0,
|
@@ -614,6 +632,7 @@ class Capture():
|
|
614
632
|
await self.__dialog_cookiebot_clickthrough(page)
|
615
633
|
await self.__dialog_complianz_clickthrough(page)
|
616
634
|
await self.__dialog_yahoo_clickthrough(page)
|
635
|
+
await self.__dialog_ppms_clickthrough(page)
|
617
636
|
await self.__dialog_alert_dialog_clickthrough(page)
|
618
637
|
await self.__dialog_clickthrough(page)
|
619
638
|
|
@@ -672,10 +691,6 @@ class Capture():
|
|
672
691
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
|
673
692
|
self.logger.debug('Start instrumentation.')
|
674
693
|
|
675
|
-
if allow_tracking:
|
676
|
-
# This event is required trigger the add_locator_handler
|
677
|
-
await page.locator("body").click(button="right")
|
678
|
-
|
679
694
|
# ==== recaptcha
|
680
695
|
# Same technique as: https://github.com/NikolaiT/uncaptcha3
|
681
696
|
if CAN_SOLVE_CAPTCHA:
|
@@ -700,6 +715,12 @@ class Capture():
|
|
700
715
|
# check if we have anything on the page. If we don't, the page is not working properly.
|
701
716
|
if await self._failsafe_get_content(page):
|
702
717
|
self.logger.debug('Got rendered content')
|
718
|
+
if allow_tracking:
|
719
|
+
await self._wait_for_random_timeout(page, 2)
|
720
|
+
# This event is required trigger the add_locator_handler
|
721
|
+
if page.locator("body").is_visible():
|
722
|
+
await page.locator("body").click(button="right", timeout=2000)
|
723
|
+
|
703
724
|
# move mouse
|
704
725
|
await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
|
705
726
|
self.logger.debug('Moved mouse.')
|
@@ -905,6 +926,7 @@ class Capture():
|
|
905
926
|
raise e
|
906
927
|
|
907
928
|
async def _safe_wait(self, page: Page, force_max_wait_in_sec: int | None=None) -> None:
|
929
|
+
max_wait: float
|
908
930
|
try:
|
909
931
|
if force_max_wait_in_sec is not None:
|
910
932
|
max_wait = force_max_wait_in_sec
|
@@ -1095,7 +1117,7 @@ class Capture():
|
|
1095
1117
|
if timeout > 1000:
|
1096
1118
|
self.logger.warning(f'The waiting time is too long {timeout}, we expect seconds, not miliseconds.')
|
1097
1119
|
timeout = int(timeout / 1000)
|
1098
|
-
_wait_time = random.randrange(timeout * 1000 - 500, timeout * 1000 + 500)
|
1120
|
+
_wait_time = random.randrange(max(timeout * 1000 - 500, 500), max(timeout * 1000 + 500, 1000))
|
1099
1121
|
await page.wait_for_timeout(_wait_time)
|
1100
1122
|
|
1101
1123
|
def make_frame_tree(self, frame: Frame) -> dict[str, list[dict[str, Any]]]:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.23.14"
|
3
|
+
version = "1.24.0"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -25,7 +25,7 @@ beautifulsoup4 = {version= "^4.12.3", extras = ["lxml", "charset_normalizer"]}
|
|
25
25
|
w3lib = "^2.1.2"
|
26
26
|
requests = {extras = ["socks"], version = "^2.31.0"}
|
27
27
|
pydub = {version = "^0.25.1", optional = true}
|
28
|
-
SpeechRecognition = {version = "^3.10.
|
28
|
+
SpeechRecognition = {version = "^3.10.2", optional = true}
|
29
29
|
pytz = {"version" = "^2024.1", python = "<3.9"}
|
30
30
|
tzdata = "^2024.1"
|
31
31
|
playwright-stealth = "^1.0.6"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|