lacuscore 1.9.2__tar.gz → 1.9.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lacuscore-1.9.2 → lacuscore-1.9.3}/PKG-INFO +3 -3
- {lacuscore-1.9.2 → lacuscore-1.9.3}/lacuscore/__init__.py +2 -1
- lacuscore-1.9.3/lacuscore/helpers.py +102 -0
- {lacuscore-1.9.2 → lacuscore-1.9.3}/lacuscore/lacuscore.py +16 -100
- lacuscore-1.9.3/lacuscore/task_logger.py +59 -0
- {lacuscore-1.9.2 → lacuscore-1.9.3}/pyproject.toml +6 -6
- {lacuscore-1.9.2 → lacuscore-1.9.3}/LICENSE +0 -0
- {lacuscore-1.9.2 → lacuscore-1.9.3}/README.md +0 -0
- {lacuscore-1.9.2 → lacuscore-1.9.3}/lacuscore/lacus_monitoring.py +0 -0
- {lacuscore-1.9.2 → lacuscore-1.9.3}/lacuscore/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: lacuscore
|
3
|
-
Version: 1.9.2
|
3
|
+
Version: 1.9.3
|
4
4
|
Summary: Core of Lacus, usable as a module
|
5
5
|
Home-page: https://github.com/ail-project/LacusCore
|
6
6
|
License: BSD-3-Clause
|
@@ -28,8 +28,8 @@ Requires-Dist: Sphinx (>=7.2,<8.0) ; (python_version >= "3.9") and (extra == "do
|
|
28
28
|
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
29
29
|
Requires-Dist: defang (>=0.5.3,<0.6.0)
|
30
30
|
Requires-Dist: dnspython (>=2.6.1,<3.0.0)
|
31
|
-
Requires-Dist: playwrightcapture[recaptcha] (>=1.24.
|
32
|
-
Requires-Dist: redis[hiredis] (>=5.0.
|
31
|
+
Requires-Dist: playwrightcapture[recaptcha] (>=1.24.6,<2.0.0)
|
32
|
+
Requires-Dist: redis[hiredis] (>=5.0.4,<6.0.0)
|
33
33
|
Requires-Dist: requests (>=2.31.0,<3.0.0)
|
34
34
|
Requires-Dist: ua-parser (>=0.18.0,<0.19.0)
|
35
35
|
Project-URL: Documentation, https://lacuscore.readthedocs.io/en/latest/
|
@@ -1,4 +1,5 @@
|
|
1
|
-
from .lacuscore import LacusCore
|
1
|
+
from .lacuscore import LacusCore
|
2
|
+
from .helpers import CaptureStatus, CaptureResponse, CaptureResponseJson, CaptureSettings # noqa
|
2
3
|
from .lacus_monitoring import LacusCoreMonitoring # noqa
|
3
4
|
|
4
5
|
__all__ = [
|
@@ -0,0 +1,102 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
from enum import IntEnum, unique
|
6
|
+
from logging import LoggerAdapter
|
7
|
+
from typing import MutableMapping, Any, TypedDict
|
8
|
+
|
9
|
+
from playwrightcapture.capture import CaptureResponse as PlaywrightCaptureResponse
|
10
|
+
|
11
|
+
|
12
|
+
class LacusCoreException(Exception):
|
13
|
+
pass
|
14
|
+
|
15
|
+
|
16
|
+
class CaptureError(LacusCoreException):
|
17
|
+
pass
|
18
|
+
|
19
|
+
|
20
|
+
class RetryCapture(LacusCoreException):
|
21
|
+
pass
|
22
|
+
|
23
|
+
|
24
|
+
class CaptureSettingsError(LacusCoreException):
|
25
|
+
pass
|
26
|
+
|
27
|
+
|
28
|
+
class LacusCoreLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
29
|
+
"""
|
30
|
+
Prepend log entry with the UUID of the capture
|
31
|
+
"""
|
32
|
+
def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> tuple[str, MutableMapping[str, Any]]:
|
33
|
+
if self.extra:
|
34
|
+
return '[{}] {}'.format(self.extra['uuid'], msg), kwargs
|
35
|
+
return msg, kwargs
|
36
|
+
|
37
|
+
|
38
|
+
@unique
|
39
|
+
class CaptureStatus(IntEnum):
|
40
|
+
'''The status of the capture'''
|
41
|
+
UNKNOWN = -1
|
42
|
+
QUEUED = 0
|
43
|
+
DONE = 1
|
44
|
+
ONGOING = 2
|
45
|
+
|
46
|
+
|
47
|
+
class CaptureResponse(PlaywrightCaptureResponse, TypedDict, total=False):
|
48
|
+
'''A capture made by Lacus. With the base64 encoded image and downloaded file decoded to bytes.'''
|
49
|
+
|
50
|
+
# Need to make sure the type is what's expected down the line
|
51
|
+
children: list[CaptureResponse] | None # type: ignore[misc]
|
52
|
+
|
53
|
+
status: int
|
54
|
+
runtime: float | None
|
55
|
+
|
56
|
+
|
57
|
+
class CaptureResponseJson(TypedDict, total=False):
|
58
|
+
'''A capture made by Lacus. With the base64 encoded image and downloaded file *not* decoded.'''
|
59
|
+
|
60
|
+
status: int
|
61
|
+
last_redirected_url: str | None
|
62
|
+
har: dict[str, Any] | None
|
63
|
+
cookies: list[dict[str, str]] | None
|
64
|
+
error: str | None
|
65
|
+
html: str | None
|
66
|
+
png: str | None
|
67
|
+
downloaded_filename: str | None
|
68
|
+
downloaded_file: str | None
|
69
|
+
children: list[CaptureResponseJson] | None
|
70
|
+
runtime: float | None
|
71
|
+
potential_favicons: list[str] | None
|
72
|
+
|
73
|
+
|
74
|
+
class CaptureSettings(TypedDict, total=False):
|
75
|
+
'''The capture settings that can be passed to Lacus.'''
|
76
|
+
|
77
|
+
url: str | None
|
78
|
+
document_name: str | None
|
79
|
+
document: str | None
|
80
|
+
browser: str | None
|
81
|
+
device_name: str | None
|
82
|
+
user_agent: str | None
|
83
|
+
proxy: str | dict[str, str] | None
|
84
|
+
general_timeout_in_sec: int | None
|
85
|
+
cookies: list[dict[str, Any]] | None
|
86
|
+
headers: str | dict[str, str] | None
|
87
|
+
http_credentials: dict[str, str] | None
|
88
|
+
geolocation: dict[str, float] | None
|
89
|
+
timezone_id: str | None
|
90
|
+
locale: str | None
|
91
|
+
color_scheme: str | None
|
92
|
+
viewport: dict[str, int] | None
|
93
|
+
referer: str | None
|
94
|
+
with_favicon: bool
|
95
|
+
allow_tracking: bool
|
96
|
+
force: bool
|
97
|
+
recapture_interval: int
|
98
|
+
priority: int
|
99
|
+
uuid: str | None
|
100
|
+
|
101
|
+
depth: int
|
102
|
+
rendered_hostname_only: bool # Note: only used if depth is > 0
|
@@ -18,12 +18,10 @@ import zlib
|
|
18
18
|
from asyncio import Task
|
19
19
|
from base64 import b64decode, b64encode
|
20
20
|
from datetime import date, timedelta
|
21
|
-
from enum import IntEnum, unique
|
22
21
|
from ipaddress import ip_address, IPv4Address, IPv6Address
|
23
|
-
from logging import LoggerAdapter
|
24
22
|
from pathlib import Path
|
25
23
|
from tempfile import NamedTemporaryFile
|
26
|
-
from typing import Literal, Any,
|
24
|
+
from typing import Literal, Any, overload, cast, Iterator
|
27
25
|
from uuid import uuid4
|
28
26
|
from urllib.parse import urlsplit
|
29
27
|
|
@@ -33,12 +31,16 @@ from dns.exception import Timeout as DNSTimeout
|
|
33
31
|
|
34
32
|
from defang import refang # type: ignore[import-untyped]
|
35
33
|
from playwrightcapture import Capture, PlaywrightCaptureException
|
36
|
-
from playwrightcapture.capture import CaptureResponse as PlaywrightCaptureResponse
|
37
34
|
from redis import Redis
|
38
35
|
from redis.exceptions import ConnectionError as RedisConnectionError
|
39
36
|
from redis.exceptions import DataError
|
40
37
|
from ua_parser import user_agent_parser # type: ignore[import-untyped]
|
41
38
|
|
39
|
+
from . import task_logger
|
40
|
+
from .helpers import (
|
41
|
+
LacusCoreLogAdapter, CaptureError, RetryCapture, CaptureSettingsError,
|
42
|
+
CaptureStatus, CaptureResponse, CaptureResponseJson, CaptureSettings)
|
43
|
+
|
42
44
|
if sys.version_info < (3, 11):
|
43
45
|
from async_timeout import timeout
|
44
46
|
else:
|
@@ -65,95 +67,6 @@ def _secure_filename(filename: str) -> str:
|
|
65
67
|
return filename
|
66
68
|
|
67
69
|
|
68
|
-
class LacusCoreException(Exception):
|
69
|
-
pass
|
70
|
-
|
71
|
-
|
72
|
-
class CaptureError(LacusCoreException):
|
73
|
-
pass
|
74
|
-
|
75
|
-
|
76
|
-
class RetryCapture(LacusCoreException):
|
77
|
-
pass
|
78
|
-
|
79
|
-
|
80
|
-
@unique
|
81
|
-
class CaptureStatus(IntEnum):
|
82
|
-
'''The status of the capture'''
|
83
|
-
UNKNOWN = -1
|
84
|
-
QUEUED = 0
|
85
|
-
DONE = 1
|
86
|
-
ONGOING = 2
|
87
|
-
|
88
|
-
|
89
|
-
class CaptureResponse(PlaywrightCaptureResponse, TypedDict, total=False):
|
90
|
-
'''A capture made by Lacus. With the base64 encoded image and downloaded file decoded to bytes.'''
|
91
|
-
|
92
|
-
# Need to make sure the type is what's expected down the line
|
93
|
-
children: list[CaptureResponse] | None # type: ignore[misc]
|
94
|
-
|
95
|
-
status: int
|
96
|
-
runtime: float | None
|
97
|
-
|
98
|
-
|
99
|
-
class CaptureResponseJson(TypedDict, total=False):
|
100
|
-
'''A capture made by Lacus. With the base64 encoded image and downloaded file *not* decoded.'''
|
101
|
-
|
102
|
-
status: int
|
103
|
-
last_redirected_url: str | None
|
104
|
-
har: dict[str, Any] | None
|
105
|
-
cookies: list[dict[str, str]] | None
|
106
|
-
error: str | None
|
107
|
-
html: str | None
|
108
|
-
png: str | None
|
109
|
-
downloaded_filename: str | None
|
110
|
-
downloaded_file: str | None
|
111
|
-
children: list[CaptureResponseJson] | None
|
112
|
-
runtime: float | None
|
113
|
-
potential_favicons: list[str] | None
|
114
|
-
|
115
|
-
|
116
|
-
class CaptureSettings(TypedDict, total=False):
|
117
|
-
'''The capture settings that can be passed to Lacus.'''
|
118
|
-
|
119
|
-
url: str | None
|
120
|
-
document_name: str | None
|
121
|
-
document: str | None
|
122
|
-
browser: str | None
|
123
|
-
device_name: str | None
|
124
|
-
user_agent: str | None
|
125
|
-
proxy: str | dict[str, str] | None
|
126
|
-
general_timeout_in_sec: int | None
|
127
|
-
cookies: list[dict[str, Any]] | None
|
128
|
-
headers: str | dict[str, str] | None
|
129
|
-
http_credentials: dict[str, str] | None
|
130
|
-
geolocation: dict[str, float] | None
|
131
|
-
timezone_id: str | None
|
132
|
-
locale: str | None
|
133
|
-
color_scheme: str | None
|
134
|
-
viewport: dict[str, int] | None
|
135
|
-
referer: str | None
|
136
|
-
with_favicon: bool
|
137
|
-
allow_tracking: bool
|
138
|
-
force: bool
|
139
|
-
recapture_interval: int
|
140
|
-
priority: int
|
141
|
-
uuid: str | None
|
142
|
-
|
143
|
-
depth: int
|
144
|
-
rendered_hostname_only: bool # Note: only used if depth is > 0
|
145
|
-
|
146
|
-
|
147
|
-
class LacusCoreLogAdapter(LoggerAdapter): # type: ignore[type-arg]
|
148
|
-
"""
|
149
|
-
Prepend log entry with the UUID of the capture
|
150
|
-
"""
|
151
|
-
def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> tuple[str, MutableMapping[str, Any]]:
|
152
|
-
if self.extra:
|
153
|
-
return '[{}] {}'.format(self.extra['uuid'], msg), kwargs
|
154
|
-
return msg, kwargs
|
155
|
-
|
156
|
-
|
157
70
|
class LacusCore():
|
158
71
|
"""Capture URLs or web enabled documents using PlaywrightCapture.
|
159
72
|
|
@@ -368,7 +281,7 @@ class LacusCore():
|
|
368
281
|
p.execute()
|
369
282
|
except DataError:
|
370
283
|
self.master_logger.exception(f'Unable to enqueue: {to_enqueue}')
|
371
|
-
raise
|
284
|
+
raise CaptureSettingsError(f'Unable to enqueue: {to_enqueue}')
|
372
285
|
return perma_uuid
|
373
286
|
|
374
287
|
def _encode_response(self, capture: CaptureResponse) -> CaptureResponseJson:
|
@@ -453,7 +366,10 @@ class LacusCore():
|
|
453
366
|
max_consume -= 1
|
454
367
|
uuid: str = value[0][0].decode()
|
455
368
|
priority: int = int(value[0][1])
|
456
|
-
|
369
|
+
logger = LacusCoreLogAdapter(self.master_logger, {'uuid': uuid})
|
370
|
+
yield task_logger.create_task(self._capture(uuid, priority), name=uuid,
|
371
|
+
logger=logger,
|
372
|
+
message='Capture raised an uncaught exception')
|
457
373
|
|
458
374
|
async def _capture(self, uuid: str, priority: int) -> None:
|
459
375
|
"""Trigger a specific capture
|
@@ -508,11 +424,11 @@ class LacusCore():
|
|
508
424
|
elif k == 'document':
|
509
425
|
document_as_bytes = b64decode(v)
|
510
426
|
else:
|
511
|
-
raise
|
512
|
-
except
|
427
|
+
raise CaptureSettingsError(f'Unexpected setting: {k}: {v}')
|
428
|
+
except CaptureSettingsError as e:
|
513
429
|
raise e
|
514
430
|
except Exception as e:
|
515
|
-
raise
|
431
|
+
raise CaptureSettingsError(f'Error while preparing settings: {e}')
|
516
432
|
|
517
433
|
if not to_capture:
|
518
434
|
all_entries = self.redis.hgetall(f'lacus:capture_settings:{uuid}')
|
@@ -523,7 +439,7 @@ class LacusCore():
|
|
523
439
|
# we do not have a URL yet.
|
524
440
|
name = to_capture.pop('document_name', None)
|
525
441
|
if not name:
|
526
|
-
raise
|
442
|
+
raise CaptureSettingsError('No document name provided, settings are invalid')
|
527
443
|
if not Path(name).suffix:
|
528
444
|
# The browser will simply display the file as text if there is no extension.
|
529
445
|
# Just add HTML as a fallback, as it will be the most comon one.
|
@@ -688,7 +604,7 @@ class LacusCore():
|
|
688
604
|
else:
|
689
605
|
current_retry = int(_current_retry.decode())
|
690
606
|
if current_retry > 0:
|
691
|
-
logger.debug(f'Retrying {url} for the {self.max_retries-current_retry+1}th time.')
|
607
|
+
logger.debug(f'Retrying {url} for the {self.max_retries - current_retry + 1}th time.')
|
692
608
|
self.redis.decr(f'lacus:capture_retry:{uuid}')
|
693
609
|
retry = True
|
694
610
|
else:
|
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
from typing import Any, Coroutine, Optional, TypeVar, Tuple
|
6
|
+
|
7
|
+
import asyncio
|
8
|
+
import functools
|
9
|
+
import logging
|
10
|
+
|
11
|
+
from .helpers import LacusCoreLogAdapter
|
12
|
+
|
13
|
+
T = TypeVar('T')
|
14
|
+
|
15
|
+
# Code from https://quantlane.com/blog/ensure-asyncio-task-exceptions-get-logged/
|
16
|
+
|
17
|
+
|
18
|
+
def create_task(
|
19
|
+
coroutine: Coroutine[Any, Any, T],
|
20
|
+
*,
|
21
|
+
name: str,
|
22
|
+
logger: 'LacusCoreLogAdapter',
|
23
|
+
message: str,
|
24
|
+
message_args: Tuple[Any, ...] = (),
|
25
|
+
loop: Optional[asyncio.AbstractEventLoop] = None,
|
26
|
+
|
27
|
+
) -> 'asyncio.Task[T]': # This type annotation has to be quoted for Python < 3.9, see https://www.python.org/dev/peps/pep-0585/
|
28
|
+
'''
|
29
|
+
This helper function wraps a ``loop.create_task(coroutine())`` call and ensures there is
|
30
|
+
an exception handler added to the resulting task. If the task raises an exception it is logged
|
31
|
+
using the provided ``logger``, with additional context provided by ``message`` and optionally
|
32
|
+
``message_args``.
|
33
|
+
'''
|
34
|
+
if loop is None:
|
35
|
+
loop = asyncio.get_running_loop()
|
36
|
+
task = loop.create_task(coroutine, name=name)
|
37
|
+
task.add_done_callback(
|
38
|
+
functools.partial(_handle_task_result, logger=logger, message=message, message_args=message_args)
|
39
|
+
)
|
40
|
+
return task
|
41
|
+
|
42
|
+
|
43
|
+
def _handle_task_result(
|
44
|
+
task: asyncio.Task[Any],
|
45
|
+
*,
|
46
|
+
logger: logging.Logger,
|
47
|
+
message: str,
|
48
|
+
message_args: Tuple[Any, ...] = (),
|
49
|
+
) -> None:
|
50
|
+
try:
|
51
|
+
task.result()
|
52
|
+
except asyncio.CancelledError:
|
53
|
+
pass # Task cancellation should not be logged as an error.
|
54
|
+
except asyncio.TimeoutError:
|
55
|
+
pass # Timeout is also fine
|
56
|
+
# Ad the pylint ignore: we want to handle all exceptions here so that the result of the task
|
57
|
+
# is properly logged. There is no point re-raising the exception in this callback.
|
58
|
+
except Exception: # pylint: disable=broad-except
|
59
|
+
logger.exception(message, *message_args)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "lacuscore"
|
3
|
-
version = "1.9.2"
|
3
|
+
version = "1.9.3"
|
4
4
|
description = "Core of Lacus, usable as a module"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -34,10 +34,10 @@ Sphinx = [
|
|
34
34
|
{version = "<7.2", python = "<3.9", optional = true},
|
35
35
|
{version = "^7.2", python = ">=3.9", optional = true}
|
36
36
|
]
|
37
|
-
playwrightcapture = {extras = ["recaptcha"], version = "^1.24.
|
37
|
+
playwrightcapture = {extras = ["recaptcha"], version = "^1.24.6"}
|
38
38
|
defang = "^0.5.3"
|
39
39
|
ua-parser = "^0.18.0"
|
40
|
-
redis = {version = "^5.0.
|
40
|
+
redis = {version = "^5.0.4", extras = ["hiredis"]}
|
41
41
|
dnspython = "^2.6.1"
|
42
42
|
async-timeout = {version = "^4.0.3", python = "<3.11"}
|
43
43
|
|
@@ -45,8 +45,8 @@ async-timeout = {version = "^4.0.3", python = "<3.11"}
|
|
45
45
|
docs = ["Sphinx"]
|
46
46
|
|
47
47
|
[tool.poetry.group.dev.dependencies]
|
48
|
-
types-redis = {version = "^4.6.0.
|
49
|
-
mypy = "^1.
|
48
|
+
types-redis = {version = "^4.6.0.20240425"}
|
49
|
+
mypy = "^1.10.0"
|
50
50
|
types-requests = "^2.31.0.20240406"
|
51
51
|
types-beautifulsoup4 = "^4.12.0.20240229"
|
52
52
|
ipython = [
|
@@ -54,7 +54,7 @@ ipython = [
|
|
54
54
|
{version = "^8.18.0", python = ">=3.9"},
|
55
55
|
{version = "^8.19.0", python = ">=3.10"}
|
56
56
|
]
|
57
|
-
pytest = "^8.
|
57
|
+
pytest = "^8.2.0"
|
58
58
|
|
59
59
|
[build-system]
|
60
60
|
requires = ["poetry_core"]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|