PlaywrightCapture 1.23.14__tar.gz → 1.24.1__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.23.14
3
+ Version: 1.24.1
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Topic :: Internet
21
21
  Classifier: Topic :: Security
22
22
  Provides-Extra: recaptcha
23
- Requires-Dist: SpeechRecognition (>=3.10.1,<4.0.0) ; extra == "recaptcha"
23
+ Requires-Dist: SpeechRecognition (>=3.10.2,<4.0.0) ; extra == "recaptcha"
24
24
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
25
25
  Requires-Dist: dateparser (>=1.2.0,<2.0.0)
26
26
  Requires-Dist: playwright (>=1.42.0,<2.0.0)
@@ -511,10 +511,16 @@ class Capture():
511
511
  if await page.locator(".qc-cmp2-summary-buttons").locator("button").first.is_visible():
512
512
  self.logger.info('Consent window found, clicking through.')
513
513
  await page.locator(".qc-cmp2-summary-buttons").locator("button").locator("nth=-1").click(timeout=2000)
514
+ elif await page.locator("#popin_tc_privacy").locator("#popin_tc_privacy_button_2").is_visible():
515
+ self.logger.info('Consent window found, clicking through.')
516
+ await page.locator("#popin_tc_privacy").locator("#popin_tc_privacy_button_2").click(timeout=2000)
517
+ elif await page.get_by_test_id("uc-accept-all-button").is_visible():
518
+ self.logger.info('Consent window found, clicking through.')
519
+ await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
514
520
  else:
515
521
  self.logger.info('Consent window found, but no button to click through.')
516
522
  await page.add_locator_handler(
517
- page.get_by_role("dialog"),
523
+ page.get_by_role("dialog").last,
518
524
  handler
519
525
  )
520
526
  self.logger.info('dialog handler added')
@@ -543,6 +549,18 @@ class Capture():
543
549
  )
544
550
  self.logger.info('Yahoo handler added')
545
551
 
552
+ async def __dialog_ppms_clickthrough(self, page: Page) -> None:
553
+ async def handler() -> None:
554
+ self.logger.info('######## piwik found, clicking through.')
555
+ if await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").is_visible():
556
+ await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").click(timeout=2000)
557
+
558
+ await page.add_locator_handler(
559
+ page.locator('#ppms_cm_popup_overlay'),
560
+ handler
561
+ )
562
+ self.logger.info('Yahoo handler added')
563
+
546
564
  async def capture_page(self, url: str, *, max_depth_capture_time: int,
547
565
  referer: str | None=None,
548
566
  page: Page | None=None, depth: int=0,
@@ -614,6 +632,7 @@ class Capture():
614
632
  await self.__dialog_cookiebot_clickthrough(page)
615
633
  await self.__dialog_complianz_clickthrough(page)
616
634
  await self.__dialog_yahoo_clickthrough(page)
635
+ await self.__dialog_ppms_clickthrough(page)
617
636
  await self.__dialog_alert_dialog_clickthrough(page)
618
637
  await self.__dialog_clickthrough(page)
619
638
 
@@ -672,10 +691,6 @@ class Capture():
672
691
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
673
692
  self.logger.debug('Start instrumentation.')
674
693
 
675
- if allow_tracking:
676
- # This event is required trigger the add_locator_handler
677
- await page.locator("body").click(button="right")
678
-
679
694
  # ==== recaptcha
680
695
  # Same technique as: https://github.com/NikolaiT/uncaptcha3
681
696
  if CAN_SOLVE_CAPTCHA:
@@ -700,6 +715,12 @@ class Capture():
700
715
  # check if we have anything on the page. If we don't, the page is not working properly.
701
716
  if await self._failsafe_get_content(page):
702
717
  self.logger.debug('Got rendered content')
718
+ if allow_tracking:
719
+ await self._wait_for_random_timeout(page, 2)
720
+ # This event is required trigger the add_locator_handler
721
+ if await page.locator("body").is_visible():
722
+ await page.locator("body").click(button="right", timeout=2000)
723
+
703
724
  # move mouse
704
725
  await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
705
726
  self.logger.debug('Moved mouse.')
@@ -905,6 +926,7 @@ class Capture():
905
926
  raise e
906
927
 
907
928
  async def _safe_wait(self, page: Page, force_max_wait_in_sec: int | None=None) -> None:
929
+ max_wait: float
908
930
  try:
909
931
  if force_max_wait_in_sec is not None:
910
932
  max_wait = force_max_wait_in_sec
@@ -1095,7 +1117,7 @@ class Capture():
1095
1117
  if timeout > 1000:
1096
1118
  self.logger.warning(f'The waiting time is too long {timeout}, we expect seconds, not miliseconds.')
1097
1119
  timeout = int(timeout / 1000)
1098
- _wait_time = random.randrange(timeout * 1000 - 500, timeout * 1000 + 500)
1120
+ _wait_time = random.randrange(max(timeout * 1000 - 500, 500), max(timeout * 1000 + 500, 1000))
1099
1121
  await page.wait_for_timeout(_wait_time)
1100
1122
 
1101
1123
  def make_frame_tree(self, frame: Frame) -> dict[str, list[dict[str, Any]]]:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.23.14"
3
+ version = "1.24.1"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -25,7 +25,7 @@ beautifulsoup4 = {version= "^4.12.3", extras = ["lxml", "charset_normalizer"]}
25
25
  w3lib = "^2.1.2"
26
26
  requests = {extras = ["socks"], version = "^2.31.0"}
27
27
  pydub = {version = "^0.25.1", optional = true}
28
- SpeechRecognition = {version = "^3.10.1", optional = true}
28
+ SpeechRecognition = {version = "^3.10.2", optional = true}
29
29
  pytz = {"version" = "^2024.1", python = "<3.9"}
30
30
  tzdata = "^2024.1"
31
31
  playwright-stealth = "^1.0.6"