lacuscore 1.9.3__tar.gz → 1.9.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lacuscore-1.9.3 → lacuscore-1.9.5}/PKG-INFO +3 -3
- {lacuscore-1.9.3 → lacuscore-1.9.5}/lacuscore/lacuscore.py +32 -19
- {lacuscore-1.9.3 → lacuscore-1.9.5}/pyproject.toml +6 -6
- {lacuscore-1.9.3 → lacuscore-1.9.5}/LICENSE +0 -0
- {lacuscore-1.9.3 → lacuscore-1.9.5}/README.md +0 -0
- {lacuscore-1.9.3 → lacuscore-1.9.5}/lacuscore/__init__.py +0 -0
- {lacuscore-1.9.3 → lacuscore-1.9.5}/lacuscore/helpers.py +0 -0
- {lacuscore-1.9.3 → lacuscore-1.9.5}/lacuscore/lacus_monitoring.py +0 -0
- {lacuscore-1.9.3 → lacuscore-1.9.5}/lacuscore/py.typed +0 -0
- {lacuscore-1.9.3 → lacuscore-1.9.5}/lacuscore/task_logger.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: lacuscore
|
3
|
-
Version: 1.9.3
|
3
|
+
Version: 1.9.5
|
4
4
|
Summary: Core of Lacus, usable as a module
|
5
5
|
Home-page: https://github.com/ail-project/LacusCore
|
6
6
|
License: BSD-3-Clause
|
@@ -28,9 +28,9 @@ Requires-Dist: Sphinx (>=7.2,<8.0) ; (python_version >= "3.9") and (extra == "do
|
|
28
28
|
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
29
29
|
Requires-Dist: defang (>=0.5.3,<0.6.0)
|
30
30
|
Requires-Dist: dnspython (>=2.6.1,<3.0.0)
|
31
|
-
Requires-Dist: playwrightcapture[recaptcha] (>=1.24.
|
31
|
+
Requires-Dist: playwrightcapture[recaptcha] (>=1.24.11,<2.0.0)
|
32
32
|
Requires-Dist: redis[hiredis] (>=5.0.4,<6.0.0)
|
33
|
-
Requires-Dist: requests (>=2.
|
33
|
+
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
34
34
|
Requires-Dist: ua-parser (>=0.18.0,<0.19.0)
|
35
35
|
Project-URL: Documentation, https://lacuscore.readthedocs.io/en/latest/
|
36
36
|
Project-URL: Repository, https://github.com/ail-project/LacusCore
|
@@ -43,9 +43,19 @@ from .helpers import (
|
|
43
43
|
|
44
44
|
if sys.version_info < (3, 11):
|
45
45
|
from async_timeout import timeout
|
46
|
+
|
47
|
+
def timeout_expired(timeout_cm, logger, error_message: str) -> None: # type: ignore[no-untyped-def]
|
48
|
+
if timeout_cm.expired:
|
49
|
+
logger.warning(f'Timeout expired: {error_message}')
|
50
|
+
|
46
51
|
else:
|
47
52
|
from asyncio import timeout
|
48
53
|
|
54
|
+
def timeout_expired(timeout_cm, logger, error_message: str) -> None: # type: ignore[no-untyped-def]
|
55
|
+
if timeout_cm.expired():
|
56
|
+
logger.warning(f'Timeout expired: {error_message}')
|
57
|
+
|
58
|
+
|
49
59
|
BROWSER = Literal['chromium', 'firefox', 'webkit']
|
50
60
|
|
51
61
|
|
@@ -523,15 +533,12 @@ class LacusCore():
|
|
523
533
|
browser_engine = 'webkit'
|
524
534
|
try:
|
525
535
|
logger.debug(f'Capturing {url}')
|
526
|
-
# NOTE: starting with python 3.11, we can use asyncio.timeout
|
527
|
-
# async with asyncio.timeout(self.max_capture_time):
|
528
|
-
general_timeout = to_capture.get('general_timeout_in_sec')
|
529
536
|
stats_pipeline.sadd(f'stats:{today}:captures', url)
|
530
537
|
async with Capture(
|
531
538
|
browser=browser_engine,
|
532
539
|
device_name=to_capture.get('device_name'),
|
533
540
|
proxy=proxy,
|
534
|
-
general_timeout_in_sec=general_timeout,
|
541
|
+
general_timeout_in_sec=to_capture.get('general_timeout_in_sec'),
|
535
542
|
loglevel=self.master_logger.getEffectiveLevel(),
|
536
543
|
uuid=uuid) as capture:
|
537
544
|
# required by Mypy: https://github.com/python/mypy/issues/3004
|
@@ -544,20 +551,30 @@ class LacusCore():
|
|
544
551
|
capture.timezone_id = to_capture.get('timezone_id') # type: ignore[assignment]
|
545
552
|
capture.locale = to_capture.get('locale') # type: ignore[assignment]
|
546
553
|
capture.color_scheme = to_capture.get('color_scheme') # type: ignore[assignment]
|
554
|
+
|
555
|
+
# make sure the initialization doesn't take too long
|
556
|
+
init_timeout = max(self.max_capture_time / 10, 5)
|
547
557
|
try:
|
548
|
-
async with timeout(
|
558
|
+
async with timeout(init_timeout) as initialize_timeout:
|
549
559
|
await capture.initialize_context()
|
550
560
|
except (TimeoutError, asyncio.exceptions.TimeoutError):
|
551
|
-
logger
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
+
timeout_expired(initialize_timeout, logger, 'Initializing took too long.')
|
562
|
+
logger.warning(f'Initializing the context for {url} took longer than the allowed initialization timeout ({init_timeout}s)')
|
563
|
+
raise RetryCapture(f'Initializing the context for {url} took longer than the allowed initialization timeout ({init_timeout}s)')
|
564
|
+
|
565
|
+
try:
|
566
|
+
async with timeout(self.max_capture_time) as capture_timeout:
|
567
|
+
playwright_result = await capture.capture_page(
|
568
|
+
url, referer=to_capture.get('referer'),
|
569
|
+
depth=to_capture.get('depth', 0),
|
570
|
+
rendered_hostname_only=to_capture.get('rendered_hostname_only', True),
|
571
|
+
with_favicon=to_capture.get('with_favicon', False),
|
572
|
+
allow_tracking=to_capture.get('allow_tracking', False),
|
573
|
+
max_depth_capture_time=self.max_capture_time)
|
574
|
+
except (TimeoutError, asyncio.exceptions.TimeoutError):
|
575
|
+
timeout_expired(capture_timeout, logger, 'Capture took too long.')
|
576
|
+
logger.warning(f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)')
|
577
|
+
raise RetryCapture(f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)')
|
561
578
|
result = cast(CaptureResponse, playwright_result)
|
562
579
|
if 'error' in result and 'error_name' in result:
|
563
580
|
# generate stats
|
@@ -575,10 +592,6 @@ class LacusCore():
|
|
575
592
|
# The capture can be canceled if it has been running for way too long.
|
576
593
|
# We can give it another shot.
|
577
594
|
raise RetryCapture(f'The capture of {url} has been cancelled.')
|
578
|
-
except (TimeoutError, asyncio.exceptions.TimeoutError):
|
579
|
-
logger.warning(f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)')
|
580
|
-
result = {'error': f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)'}
|
581
|
-
raise CaptureError(f'The capture of {url} took longer than the allowed max capture time ({self.max_capture_time}s)')
|
582
595
|
except Exception as e:
|
583
596
|
logger.exception(f'Something went poorly {url} - {e}')
|
584
597
|
result = {'error': f'Something went poorly {url} - {e}'}
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "lacuscore"
|
3
|
-
version = "1.9.3"
|
3
|
+
version = "1.9.5"
|
4
4
|
description = "Core of Lacus, usable as a module"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -29,12 +29,12 @@ classifiers = [
|
|
29
29
|
|
30
30
|
[tool.poetry.dependencies]
|
31
31
|
python = "^3.8"
|
32
|
-
requests = "^2.
|
32
|
+
requests = "^2.32.3"
|
33
33
|
Sphinx = [
|
34
34
|
{version = "<7.2", python = "<3.9", optional = true},
|
35
35
|
{version = "^7.2", python = ">=3.9", optional = true}
|
36
36
|
]
|
37
|
-
playwrightcapture = {extras = ["recaptcha"], version = "^1.24.
|
37
|
+
playwrightcapture = {extras = ["recaptcha"], version = "^1.24.11"}
|
38
38
|
defang = "^0.5.3"
|
39
39
|
ua-parser = "^0.18.0"
|
40
40
|
redis = {version = "^5.0.4", extras = ["hiredis"]}
|
@@ -47,14 +47,14 @@ docs = ["Sphinx"]
|
|
47
47
|
[tool.poetry.group.dev.dependencies]
|
48
48
|
types-redis = {version = "^4.6.0.20240425"}
|
49
49
|
mypy = "^1.10.0"
|
50
|
-
types-requests = "^2.
|
51
|
-
types-beautifulsoup4 = "^4.12.0.
|
50
|
+
types-requests = "^2.32.0.20240523"
|
51
|
+
types-beautifulsoup4 = "^4.12.0.20240511"
|
52
52
|
ipython = [
|
53
53
|
{version = "<8.13.0", python = "<3.9"},
|
54
54
|
{version = "^8.18.0", python = ">=3.9"},
|
55
55
|
{version = "^8.19.0", python = ">=3.10"}
|
56
56
|
]
|
57
|
-
pytest = "^8.2.
|
57
|
+
pytest = "^8.2.1"
|
58
58
|
|
59
59
|
[build-system]
|
60
60
|
requires = ["poetry_core"]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|