PlaywrightCapture 1.23.13__tar.gz → 1.24.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PlaywrightCapture
3
- Version: 1.23.13
3
+ Version: 1.24.0
4
4
  Summary: A simple library to capture websites using playwright
5
5
  Home-page: https://github.com/Lookyloo/PlaywrightCapture
6
6
  License: BSD-3-Clause
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Topic :: Internet
21
21
  Classifier: Topic :: Security
22
22
  Provides-Extra: recaptcha
23
- Requires-Dist: SpeechRecognition (>=3.10.1,<4.0.0) ; extra == "recaptcha"
23
+ Requires-Dist: SpeechRecognition (>=3.10.2,<4.0.0) ; extra == "recaptcha"
24
24
  Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
25
25
  Requires-Dist: dateparser (>=1.2.0,<2.0.0)
26
26
  Requires-Dist: playwright (>=1.42.0,<2.0.0)
@@ -53,8 +53,8 @@ A very basic example:
53
53
  from playwrightcapture import Capture
54
54
 
55
55
  async with Capture() as capture:
56
- await capture.prepare_context()
57
- entries = await capture.capture_page(url)
56
+ await capture.initialize_context()
57
+ entries = await capture.capture_page(url, max_depth_capture_time=90)
58
58
  ```
59
59
 
60
60
  Entries is a dictionary that contains (if all goes well) the HAR, the screenshot, all the cookies of the session, the URL as it is in the browser at the end of the capture, and the full HTML page as rendered.
@@ -16,8 +16,8 @@ A very basic example:
16
16
  from playwrightcapture import Capture
17
17
 
18
18
  async with Capture() as capture:
19
- await capture.prepare_context()
20
- entries = await capture.capture_page(url)
19
+ await capture.initialize_context()
20
+ entries = await capture.capture_page(url, max_depth_capture_time=90)
21
21
  ```
22
22
 
23
23
  Entries is a dictionary that contains (if all goes well) the HAR, the screenshot, all the cookies of the session, the URL as it is in the browser at the end of the capture, and the full HTML page as rendered.
@@ -426,10 +426,10 @@ class Capture():
426
426
  while max_tries > 0:
427
427
  # cf_locator = page.frame_locator("iframe[title=\"Widget containing a Cloudflare security challenge\"]").get_by_label("Verify you are human")
428
428
  cf_locator = page.frame_locator("iframe[title=\"Widget containing a Cloudflare security challenge\"]").get_by_role("checkbox")
429
- await self._safe_wait(page)
429
+ await self._safe_wait(page, 5)
430
430
  await cf_locator.click(force=True, position={"x": random.uniform(1, 32), "y": random.uniform(1, 32)})
431
431
  self.logger.info('Cloudflare widget visible.')
432
- await self._safe_wait(page)
432
+ await self._safe_wait(page, 5)
433
433
  await self._wait_for_random_timeout(page, 2)
434
434
  spinner = page.locator('#challenge-spinner')
435
435
  while True:
@@ -449,7 +449,7 @@ class Capture():
449
449
  async def handler() -> None:
450
450
  self.logger.debug('Didomi dialog found, clicking through.')
451
451
  if await page.locator("#didomi-notice-agree-button").is_visible():
452
- await page.locator("#didomi-notice-agree-button").click()
452
+ await page.locator("#didomi-notice-agree-button").click(timeout=2000)
453
453
 
454
454
  await page.add_locator_handler(page.locator(".didomi-popup-view"), handler)
455
455
  self.logger.info('Didomi handler added')
@@ -458,7 +458,7 @@ class Capture():
458
458
  async def handler() -> None:
459
459
  self.logger.info('######## OT Dialog found, clicking through.')
460
460
  if await page.locator("#onetrust-accept-btn-handler").is_visible():
461
- await page.locator("#onetrust-accept-btn-handler").click()
461
+ await page.locator("#onetrust-accept-btn-handler").click(timeout=2000)
462
462
 
463
463
  await page.add_locator_handler(
464
464
  page.locator('#onetrust-banner-sdk'),
@@ -470,7 +470,7 @@ class Capture():
470
470
  async def handler() -> None:
471
471
  self.logger.info('######## HS Dialog found, clicking through.')
472
472
  if await page.locator("#hs-eu-confirmation-button").is_visible():
473
- await page.locator("#hs-eu-confirmation-button").click()
473
+ await page.locator("#hs-eu-confirmation-button").click(timeout=2000)
474
474
 
475
475
  await page.add_locator_handler(
476
476
  page.locator('#hs-eu-cookie-confirmation'),
@@ -482,7 +482,7 @@ class Capture():
482
482
  async def handler() -> None:
483
483
  self.logger.info('######## Cookiebot Dialog found, clicking through.')
484
484
  if await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").is_visible():
485
- await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").click()
485
+ await page.locator("#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll").click(timeout=2000)
486
486
 
487
487
  await page.add_locator_handler(
488
488
  page.locator('#CybotCookiebotDialogBody'),
@@ -494,20 +494,42 @@ class Capture():
494
494
  async def handler() -> None:
495
495
  if await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").is_visible():
496
496
  self.logger.info('Consent window found, clicking through.')
497
- await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").click()
497
+ await page.frame_locator("iframe[title=\"Consent window\"]").locator("button.button__acceptAll").click(timeout=2000)
498
+ elif await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").is_visible():
499
+ await page.locator('#onetrust-button-group').locator("#onetrust-accept-btn-handler").click(timeout=1000)
498
500
  else:
499
501
  self.logger.info('Consent window found, but no button to click through.')
502
+
500
503
  await page.add_locator_handler(
501
504
  page.get_by_role("alertdialog"),
502
505
  handler
503
506
  )
504
507
  self.logger.info('alert dialog handler added')
505
508
 
509
+ async def __dialog_clickthrough(self, page: Page) -> None:
510
+ async def handler() -> None:
511
+ if await page.locator(".qc-cmp2-summary-buttons").locator("button").first.is_visible():
512
+ self.logger.info('Consent window found, clicking through.')
513
+ await page.locator(".qc-cmp2-summary-buttons").locator("button").locator("nth=-1").click(timeout=2000)
514
+ elif await page.locator("#popin_tc_privacy").locator("#popin_tc_privacy_button_2").is_visible():
515
+ self.logger.info('Consent window found, clicking through.')
516
+ await page.locator("#popin_tc_privacy").locator("#popin_tc_privacy_button_2").click(timeout=2000)
517
+ elif await page.get_by_test_id("uc-accept-all-button").is_visible():
518
+ self.logger.info('Consent window found, clicking through.')
519
+ await page.get_by_test_id("uc-accept-all-button").click(timeout=2000)
520
+ else:
521
+ self.logger.info('Consent window found, but no button to click through.')
522
+ await page.add_locator_handler(
523
+ page.get_by_role("dialog").last,
524
+ handler
525
+ )
526
+ self.logger.info('dialog handler added')
527
+
506
528
  async def __dialog_complianz_clickthrough(self, page: Page) -> None:
507
529
  async def handler() -> None:
508
530
  self.logger.info('######## Complianz found, clicking through.')
509
531
  if await page.locator('.cmplz-show').locator("button.cmplz-accept").is_visible():
510
- await page.locator('.cmplz-show').locator("button.cmplz-accept").click()
532
+ await page.locator('.cmplz-show').locator("button.cmplz-accept").click(timeout=2000)
511
533
 
512
534
  await page.add_locator_handler(
513
535
  page.locator('.cmplz-show'),
@@ -515,6 +537,30 @@ class Capture():
515
537
  )
516
538
  self.logger.info('Complianz handler added')
517
539
 
540
+ async def __dialog_yahoo_clickthrough(self, page: Page) -> None:
541
+ async def handler() -> None:
542
+ self.logger.info('######## Yahoo found, clicking through.')
543
+ if await page.locator('.con-wizard').locator("button.accept-all").is_visible():
544
+ await page.locator('.con-wizard').locator("button.accept-all").click(timeout=2000)
545
+
546
+ await page.add_locator_handler(
547
+ page.locator('.con-wizard'),
548
+ handler
549
+ )
550
+ self.logger.info('Yahoo handler added')
551
+
552
+ async def __dialog_ppms_clickthrough(self, page: Page) -> None:
553
+ async def handler() -> None:
554
+ self.logger.info('######## piwik found, clicking through.')
555
+ if await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").is_visible():
556
+ await page.locator('.ppms_cm_popup_overlay').locator("button.ppms_cm_agree-to-all").click(timeout=2000)
557
+
558
+ await page.add_locator_handler(
559
+ page.locator('#ppms_cm_popup_overlay'),
560
+ handler
561
+ )
562
+ self.logger.info('Yahoo handler added')
563
+
518
564
  async def capture_page(self, url: str, *, max_depth_capture_time: int,
519
565
  referer: str | None=None,
520
566
  page: Page | None=None, depth: int=0,
@@ -552,13 +598,13 @@ class Capture():
552
598
 
553
599
  async def store_request(request: Request) -> None:
554
600
  # This method is called on each request, to store the body (if it is an image) in a dict indexed by URL
555
- if got_favicons:
601
+ if got_favicons or request.resource_type != 'image':
556
602
  return
557
603
  try:
558
604
  if response := await request.response():
559
605
  if got_favicons:
560
606
  return
561
- if response.ok:
607
+ if request.resource_type == 'image' and response.ok:
562
608
  try:
563
609
  if body := await response.body():
564
610
  try:
@@ -584,8 +630,11 @@ class Capture():
584
630
  await self.__dialog_onetrust_clickthrough(page)
585
631
  await self.__dialog_hubspot_clickthrough(page)
586
632
  await self.__dialog_cookiebot_clickthrough(page)
587
- await self.__dialog_alert_dialog_clickthrough(page)
588
633
  await self.__dialog_complianz_clickthrough(page)
634
+ await self.__dialog_yahoo_clickthrough(page)
635
+ await self.__dialog_ppms_clickthrough(page)
636
+ await self.__dialog_alert_dialog_clickthrough(page)
637
+ await self.__dialog_clickthrough(page)
589
638
 
590
639
  await stealth_async(page)
591
640
  page.set_default_timeout(self._capture_timeout * 1000)
@@ -642,10 +691,6 @@ class Capture():
642
691
  await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
643
692
  self.logger.debug('Start instrumentation.')
644
693
 
645
- if allow_tracking:
646
- # This event is required trigger the add_locator_handler
647
- await page.locator("body").click(button="right")
648
-
649
694
  # ==== recaptcha
650
695
  # Same technique as: https://github.com/NikolaiT/uncaptcha3
651
696
  if CAN_SOLVE_CAPTCHA:
@@ -670,10 +715,16 @@ class Capture():
670
715
  # check if we have anything on the page. If we don't, the page is not working properly.
671
716
  if await self._failsafe_get_content(page):
672
717
  self.logger.debug('Got rendered content')
718
+ if allow_tracking:
719
+ await self._wait_for_random_timeout(page, 2)
720
+ # This event is required trigger the add_locator_handler
721
+ if page.locator("body").is_visible():
722
+ await page.locator("body").click(button="right", timeout=2000)
723
+
673
724
  # move mouse
674
725
  await page.mouse.move(x=random.uniform(300, 800), y=random.uniform(200, 500))
675
726
  self.logger.debug('Moved mouse.')
676
- await self._safe_wait(page)
727
+ await self._wait_for_random_timeout(page, 2)
677
728
  self.logger.debug('Keep going after moving mouse.')
678
729
 
679
730
  if parsed_url.fragment:
@@ -681,7 +732,7 @@ class Capture():
681
732
  fragment = unquote(parsed_url.fragment)
682
733
  try:
683
734
  await page.locator(f'id={fragment}').first.scroll_into_view_if_needed(timeout=3000)
684
- await self._safe_wait(page)
735
+ await self._wait_for_random_timeout(page, 2)
685
736
  await page.mouse.wheel(delta_y=random.uniform(150, 300), delta_x=0)
686
737
  self.logger.debug('Jumped to fragment.')
687
738
  except PlaywrightTimeoutError as e:
@@ -699,39 +750,21 @@ class Capture():
699
750
  except Error as e:
700
751
  self.logger.debug(f'Unable to scroll: {e}')
701
752
 
702
- await self._safe_wait(page)
753
+ await self._wait_for_random_timeout(page, 3)
703
754
  self.logger.debug('Keep going after moving on page.')
704
755
 
705
756
  try:
706
757
  await page.keyboard.press('PageUp')
707
758
  self.logger.debug('PageUp on keyboard')
708
- await self._safe_wait(page)
759
+ await self._wait_for_random_timeout(page, 3)
709
760
  await page.keyboard.press('PageDown')
710
761
  self.logger.debug('PageDown on keyboard')
711
762
  except Error as e:
712
763
  self.logger.debug(f'Unable to use keyboard: {e}')
713
764
 
714
- self.logger.debug('Done with instrumentation, waiting for network idle.')
715
- await self._safe_wait(page)
716
- await self._wait_for_random_timeout(page, 5) # Wait 5 sec after network idle
717
- await self._safe_wait(page)
718
- self.logger.debug('Done with instrumentation, done with waiting.')
719
-
720
- if content := await self._failsafe_get_content(page):
721
- to_return['html'] = content
722
-
723
- to_return['last_redirected_url'] = page.url
724
-
725
- if 'html' in to_return and to_return['html'] is not None and with_favicon:
726
- to_return['potential_favicons'] = self.get_favicons(page.url, to_return['html'])
727
- got_favicons = True
728
-
729
- await self._safe_wait(page)
730
- to_return['png'] = await self._failsafe_get_screenshot(page)
731
-
732
765
  if self.wait_for_download > 0:
733
766
  self.logger.info('Waiting for download to finish...')
734
- await self._safe_wait(page)
767
+ await self._safe_wait(page, 20)
735
768
 
736
769
  if multiple_downloads:
737
770
  if len(multiple_downloads) == 1:
@@ -746,6 +779,22 @@ class Capture():
746
779
  filename, file_content = f_details
747
780
  z.writestr(f'{i}_{filename}', file_content)
748
781
  to_return["downloaded_file"] = mem_zip.getvalue()
782
+
783
+ self.logger.debug('Done with instrumentation, waiting for network idle.')
784
+ await self._wait_for_random_timeout(page, 5) # Wait 5 sec after instrumentation
785
+ await self._safe_wait(page)
786
+ self.logger.debug('Done with instrumentation, done with waiting.')
787
+
788
+ if content := await self._failsafe_get_content(page):
789
+ to_return['html'] = content
790
+
791
+ if 'html' in to_return and to_return['html'] is not None and with_favicon:
792
+ to_return['potential_favicons'] = self.get_favicons(page.url, to_return['html'])
793
+ got_favicons = True
794
+
795
+ to_return['last_redirected_url'] = page.url
796
+ to_return['png'] = await self._failsafe_get_screenshot(page)
797
+
749
798
  self._already_captured.add(url)
750
799
  if depth > 0 and to_return.get('html') and to_return['html']:
751
800
  if child_urls := self._get_links_from_rendered_page(page.url, to_return['html'], rendered_hostname_only):
@@ -859,29 +908,38 @@ class Capture():
859
908
  return to_return
860
909
 
861
910
  async def _failsafe_get_screenshot(self, page: Page) -> bytes:
911
+ self.logger.debug("Capturing a screenshot of the full page.")
862
912
  try:
863
913
  return await page.screenshot(full_page=True, timeout=5000)
864
914
  except Error as e:
865
915
  self.logger.info(f"Capturing a screenshot of the full page failed, trying to scale it down: {e}")
866
916
 
867
917
  try:
868
- return await page.screenshot(full_page=True, scale="css", timeout=10000)
918
+ return await page.screenshot(full_page=True, scale="css", timeout=5000)
869
919
  except Error as e:
870
920
  self.logger.info(f"Capturing a screenshot of the full page failed, trying to get the current viewport only: {e}")
871
921
 
872
922
  try:
873
- return await page.screenshot(scale="css", animations='disabled', timeout=10000)
923
+ return await page.screenshot(scale="css", animations='disabled', caret='initial', timeout=5000)
874
924
  except Error as e:
875
925
  self.logger.warning(f"Unable to get any screenshot: {e}")
876
926
  raise e
877
927
 
878
- async def _safe_wait(self, page: Page) -> None:
928
+ async def _safe_wait(self, page: Page, force_max_wait_in_sec: int | None=None) -> None:
929
+ max_wait: float
879
930
  try:
931
+ if force_max_wait_in_sec is not None:
932
+ max_wait = force_max_wait_in_sec
933
+ else:
934
+ max_wait = self._capture_timeout / self.__network_not_idle
935
+ max_wait *= 1000
936
+ self.logger.debug(f'Waiting for network idle, max wait: {max_wait}s')
880
937
  # If we don't have networkidle relatively quick, it's probably because we're playing a video.
881
- await page.wait_for_load_state('networkidle', timeout=self._capture_timeout / self.__network_not_idle)
938
+ await page.wait_for_load_state('networkidle', timeout=max_wait)
882
939
  except PlaywrightTimeoutError:
883
940
  # Network never idle, keep going
884
941
  self.__network_not_idle += 1
942
+ self.logger.debug(f'Timed out - Waiting for network idle, max wait: {max_wait}s')
885
943
 
886
944
  async def _failsafe_get_content(self, page: Page) -> str | None:
887
945
  ''' The page might be changing for all kind of reason (generally a JS timeout).
@@ -894,7 +952,7 @@ class Capture():
894
952
  self.logger.debug('Unable to get page content, trying again.')
895
953
  tries -= 1
896
954
  await self._wait_for_random_timeout(page, 1)
897
- await self._safe_wait(page)
955
+ await self._safe_wait(page, 5)
898
956
  except Exception as e:
899
957
  self.logger.warning(f'The Playwright Page is in a broken state: {e}.')
900
958
  break
@@ -1004,7 +1062,7 @@ class Capture():
1004
1062
  text = recognizer.recognize_google(audio)
1005
1063
  await main_frame.get_by_role("textbox", name="Enter what you hear").fill(text)
1006
1064
  await main_frame.get_by_role("button", name="Verify").click()
1007
- await self._safe_wait(page)
1065
+ await self._safe_wait(page, 5)
1008
1066
  await self._wait_for_random_timeout(page, random.randint(3, 6))
1009
1067
  try:
1010
1068
  if await recaptcha_init_frame.locator("//span[@id='recaptcha-anchor']").first.is_checked(timeout=5000):
@@ -1059,7 +1117,7 @@ class Capture():
1059
1117
  if timeout > 1000:
1060
1118
  self.logger.warning(f'The waiting time is too long {timeout}, we expect seconds, not miliseconds.')
1061
1119
  timeout = int(timeout / 1000)
1062
- _wait_time = random.randrange(timeout * 1000 - 500, timeout * 1000 + 500)
1120
+ _wait_time = random.randrange(max(timeout * 1000 - 500, 500), max(timeout * 1000 + 500, 1000))
1063
1121
  await page.wait_for_timeout(_wait_time)
1064
1122
 
1065
1123
  def make_frame_tree(self, frame: Frame) -> dict[str, list[dict[str, Any]]]:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "PlaywrightCapture"
3
- version = "1.23.13"
3
+ version = "1.24.0"
4
4
  description = "A simple library to capture websites using playwright"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -25,7 +25,7 @@ beautifulsoup4 = {version= "^4.12.3", extras = ["lxml", "charset_normalizer"]}
25
25
  w3lib = "^2.1.2"
26
26
  requests = {extras = ["socks"], version = "^2.31.0"}
27
27
  pydub = {version = "^0.25.1", optional = true}
28
- SpeechRecognition = {version = "^3.10.1", optional = true}
28
+ SpeechRecognition = {version = "^3.10.2", optional = true}
29
29
  pytz = {"version" = "^2024.1", python = "<3.9"}
30
30
  tzdata = "^2024.1"
31
31
  playwright-stealth = "^1.0.6"