PlaywrightCapture 1.24.9__tar.gz → 1.24.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {playwrightcapture-1.24.9 → playwrightcapture-1.24.11}/PKG-INFO +3 -3
- {playwrightcapture-1.24.9 → playwrightcapture-1.24.11}/playwrightcapture/capture.py +40 -16
- {playwrightcapture-1.24.9 → playwrightcapture-1.24.11}/pyproject.toml +4 -4
- {playwrightcapture-1.24.9 → playwrightcapture-1.24.11}/LICENSE +0 -0
- {playwrightcapture-1.24.9 → playwrightcapture-1.24.11}/README.md +0 -0
- {playwrightcapture-1.24.9 → playwrightcapture-1.24.11}/playwrightcapture/__init__.py +0 -0
- {playwrightcapture-1.24.9 → playwrightcapture-1.24.11}/playwrightcapture/exceptions.py +0 -0
- {playwrightcapture-1.24.9 → playwrightcapture-1.24.11}/playwrightcapture/helpers.py +0 -0
- {playwrightcapture-1.24.9 → playwrightcapture-1.24.11}/playwrightcapture/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: PlaywrightCapture
|
3
|
-
Version: 1.24.9
|
3
|
+
Version: 1.24.11
|
4
4
|
Summary: A simple library to capture websites using playwright
|
5
5
|
Home-page: https://github.com/Lookyloo/PlaywrightCapture
|
6
6
|
License: BSD-3-Clause
|
@@ -26,12 +26,12 @@ Requires-Dist: aiohttp[speedups] (>=3.9.5,<4.0.0)
|
|
26
26
|
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
27
27
|
Requires-Dist: beautifulsoup4[charset-normalizer,lxml] (>=4.12.3,<5.0.0)
|
28
28
|
Requires-Dist: dateparser (>=1.2.0,<2.0.0)
|
29
|
-
Requires-Dist: playwright (>=1.
|
29
|
+
Requires-Dist: playwright (>=1.44.0,<2.0.0)
|
30
30
|
Requires-Dist: playwright-stealth (>=1.0.6,<2.0.0)
|
31
31
|
Requires-Dist: puremagic (>=1.23,<2.0)
|
32
32
|
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "recaptcha"
|
33
33
|
Requires-Dist: pytz (>=2024.1,<2025.0) ; python_version < "3.9"
|
34
|
-
Requires-Dist: setuptools (>=
|
34
|
+
Requires-Dist: setuptools (>=70.0.0,<71.0.0)
|
35
35
|
Requires-Dist: tzdata (>=2024.1,<2025.0)
|
36
36
|
Requires-Dist: w3lib (>=2.1.2,<3.0.0)
|
37
37
|
Project-URL: Repository, https://github.com/Lookyloo/PlaywrightCapture
|
@@ -474,7 +474,7 @@ class Capture():
|
|
474
474
|
if await page.locator("#didomi-notice-agree-button").is_visible():
|
475
475
|
await page.locator("#didomi-notice-agree-button").click(timeout=2000)
|
476
476
|
|
477
|
-
await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler)
|
477
|
+
await page.add_locator_handler(page.locator(".didomi-popup-view").last, handler, times=1, no_wait_after=True)
|
478
478
|
self.logger.info('Didomi handler added')
|
479
479
|
|
480
480
|
async def __dialog_onetrust_clickthrough(self, page: Page) -> None:
|
@@ -484,7 +484,8 @@ class Capture():
|
|
484
484
|
|
485
485
|
await page.add_locator_handler(
|
486
486
|
page.locator('#onetrust-banner-sdk').last,
|
487
|
-
handler
|
487
|
+
handler,
|
488
|
+
times=1, no_wait_after=True
|
488
489
|
)
|
489
490
|
self.logger.info('OT handler added')
|
490
491
|
|
@@ -495,7 +496,8 @@ class Capture():
|
|
495
496
|
|
496
497
|
await page.add_locator_handler(
|
497
498
|
page.locator('#hs-eu-cookie-confirmation').last,
|
498
|
-
handler
|
499
|
+
handler,
|
500
|
+
times=1, no_wait_after=True
|
499
501
|
)
|
500
502
|
self.logger.info('HS handler added')
|
501
503
|
|
@@ -506,7 +508,8 @@ class Capture():
|
|
506
508
|
|
507
509
|
await page.add_locator_handler(
|
508
510
|
page.locator('#CybotCookiebotDialogBody'),
|
509
|
-
handler
|
511
|
+
handler,
|
512
|
+
times=1, no_wait_after=True
|
510
513
|
)
|
511
514
|
self.logger.info('Cookiebot handler added')
|
512
515
|
|
@@ -522,7 +525,8 @@ class Capture():
|
|
522
525
|
|
523
526
|
await page.add_locator_handler(
|
524
527
|
page.get_by_role("alertdialog").last,
|
525
|
-
handler
|
528
|
+
handler,
|
529
|
+
times=1, no_wait_after=True
|
526
530
|
)
|
527
531
|
self.logger.info('alert dialog handler added')
|
528
532
|
|
@@ -541,18 +545,20 @@ class Capture():
|
|
541
545
|
self.logger.info('Consent window found, but no button to click through.')
|
542
546
|
await page.add_locator_handler(
|
543
547
|
page.get_by_role("dialog").last,
|
544
|
-
handler
|
548
|
+
handler,
|
549
|
+
times=1, no_wait_after=True
|
545
550
|
)
|
546
551
|
self.logger.info('dialog handler added')
|
547
552
|
|
548
553
|
async def __dialog_complianz_clickthrough(self, page: Page) -> None:
|
549
554
|
async def handler() -> None:
|
550
|
-
if await page.locator('.cmplz-show').locator("button.cmplz-accept").is_visible():
|
551
|
-
await page.locator('.cmplz-show').locator("button.cmplz-accept").click(timeout=2000)
|
555
|
+
if await page.locator('.cmplz-show').first.locator("button.cmplz-accept").is_visible():
|
556
|
+
await page.locator('.cmplz-show').first.locator("button.cmplz-accept").click(timeout=2000)
|
552
557
|
|
553
558
|
await page.add_locator_handler(
|
554
|
-
page.locator('.cmplz-show'),
|
555
|
-
handler
|
559
|
+
page.locator('.cmplz-show').first,
|
560
|
+
handler,
|
561
|
+
times=1, no_wait_after=True
|
556
562
|
)
|
557
563
|
self.logger.info('Complianz handler added')
|
558
564
|
|
@@ -563,7 +569,8 @@ class Capture():
|
|
563
569
|
|
564
570
|
await page.add_locator_handler(
|
565
571
|
page.locator('.con-wizard'),
|
566
|
-
handler
|
572
|
+
handler,
|
573
|
+
times=1, no_wait_after=True
|
567
574
|
)
|
568
575
|
self.logger.info('Yahoo handler added')
|
569
576
|
|
@@ -574,7 +581,8 @@ class Capture():
|
|
574
581
|
|
575
582
|
await page.add_locator_handler(
|
576
583
|
page.locator('#ppms_cm_popup_overlay'),
|
577
|
-
handler
|
584
|
+
handler,
|
585
|
+
times=1, no_wait_after=True
|
578
586
|
)
|
579
587
|
self.logger.info('Yahoo handler added')
|
580
588
|
|
@@ -644,7 +652,13 @@ class Capture():
|
|
644
652
|
capturing_sub = True
|
645
653
|
else:
|
646
654
|
capturing_sub = False
|
647
|
-
|
655
|
+
try:
|
656
|
+
page = await self.context.new_page()
|
657
|
+
except Error as e:
|
658
|
+
self.logger.warning(f'The context is in a broken state: {e}')
|
659
|
+
self.should_retry = True
|
660
|
+
return to_return
|
661
|
+
|
648
662
|
if allow_tracking:
|
649
663
|
# Add authorization clickthroughs
|
650
664
|
await self.__dialog_didomi_clickthrough(page)
|
@@ -704,8 +718,13 @@ class Capture():
|
|
704
718
|
else:
|
705
719
|
raise initial_error
|
706
720
|
else:
|
707
|
-
|
708
|
-
|
721
|
+
try:
|
722
|
+
await page.bring_to_front()
|
723
|
+
self.logger.debug('Page moved to front.')
|
724
|
+
except Error as e:
|
725
|
+
self.should_retry = True
|
726
|
+
self.logger.warning('Page in a broken state.')
|
727
|
+
raise e
|
709
728
|
|
710
729
|
# page instrumentation
|
711
730
|
await self._wait_for_random_timeout(page, 5) # Wait 5 sec after document loaded
|
@@ -902,6 +921,7 @@ class Capture():
|
|
902
921
|
'Navigation failed because page was closed!',
|
903
922
|
'Target page, context or browser has been closed',
|
904
923
|
'Protocol error (Page.bringToFront): Not attached to an active page',
|
924
|
+
'Peer failed to perform TLS handshake: A packet with illegal or unsupported version was received.',
|
905
925
|
'Peer failed to perform TLS handshake: The TLS connection was non-properly terminated.',
|
906
926
|
'Peer failed to perform TLS handshake: Error sending data: Connection reset by peer',
|
907
927
|
'Peer failed to perform TLS handshake: Error receiving data: Connection reset by peer',
|
@@ -909,7 +929,10 @@ class Capture():
|
|
909
929
|
'Peer sent fatal TLS alert: Internal error',
|
910
930
|
'Load cannot follow more than 20 redirections',
|
911
931
|
'Page crashed',
|
912
|
-
'Error receiving data: Connection reset by peer'
|
932
|
+
'Error receiving data: Connection reset by peer',
|
933
|
+
'Internal SOCKSv5 proxy server error.',
|
934
|
+
'Host unreachable through SOCKSv5 server.',
|
935
|
+
'HTTP/2 Error: NO_ERROR']:
|
913
936
|
# Other errors, let's give it another shot
|
914
937
|
self.logger.info(f'Issue with {url} (retrying): {e.message}')
|
915
938
|
self.should_retry = True
|
@@ -1155,6 +1178,7 @@ class Capture():
|
|
1155
1178
|
'NS_ERROR_ABORT',
|
1156
1179
|
'NS_ERROR_CONNECTION_REFUSED',
|
1157
1180
|
'NS_ERROR_NET_INTERRUPT',
|
1181
|
+
'NS_ERROR_NET_PARTIAL_TRANSFER',
|
1158
1182
|
'NS_ERROR_NET_RESET',
|
1159
1183
|
'NS_ERROR_NET_TIMEOUT',
|
1160
1184
|
'NS_ERROR_REDIRECT_LOOP',
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "PlaywrightCapture"
|
3
|
-
version = "1.24.9"
|
3
|
+
version = "1.24.11"
|
4
4
|
description = "A simple library to capture websites using playwright"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -19,7 +19,7 @@ classifiers=[
|
|
19
19
|
|
20
20
|
[tool.poetry.dependencies]
|
21
21
|
python = "^3.8"
|
22
|
-
playwright = "^1.
|
22
|
+
playwright = "^1.44.0"
|
23
23
|
dateparser = "^1.2.0"
|
24
24
|
beautifulsoup4 = {version= "^4.12.3", extras = ["lxml", "charset_normalizer"]}
|
25
25
|
w3lib = "^2.1.2"
|
@@ -28,7 +28,7 @@ SpeechRecognition = {version = "^3.10.4", optional = true}
|
|
28
28
|
pytz = {"version" = "^2024.1", python = "<3.9"}
|
29
29
|
tzdata = "^2024.1"
|
30
30
|
playwright-stealth = "^1.0.6"
|
31
|
-
setuptools = "^
|
31
|
+
setuptools = "^70.0.0"
|
32
32
|
puremagic = "^1.23"
|
33
33
|
async-timeout = {version = "^4.0.3", python = "<3.11"}
|
34
34
|
aiohttp = {extras = ["speedups"], version = "^3.9.5"}
|
@@ -42,7 +42,7 @@ optional = true
|
|
42
42
|
|
43
43
|
[tool.poetry.group.dev.dependencies]
|
44
44
|
types-beautifulsoup4 = "^4.12.0.20240511"
|
45
|
-
pytest = "^8.2.0"
|
45
|
+
pytest = "^8.2.1"
|
46
46
|
mypy = "^1.10.0"
|
47
47
|
types-dateparser = "^1.2.0.20240420"
|
48
48
|
types-pytz = "^2024.1.0.20240417"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|