lacuscore 1.11.3__tar.gz → 1.12.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lacuscore-1.11.3 → lacuscore-1.12.0}/PKG-INFO +4 -7
- {lacuscore-1.11.3 → lacuscore-1.12.0}/lacuscore/helpers.py +3 -1
- {lacuscore-1.11.3 → lacuscore-1.12.0}/lacuscore/lacuscore.py +30 -19
- {lacuscore-1.11.3 → lacuscore-1.12.0}/lacuscore/task_logger.py +8 -7
- {lacuscore-1.11.3 → lacuscore-1.12.0}/pyproject.toml +4 -10
- {lacuscore-1.11.3 → lacuscore-1.12.0}/LICENSE +0 -0
- {lacuscore-1.11.3 → lacuscore-1.12.0}/README.md +0 -0
- {lacuscore-1.11.3 → lacuscore-1.12.0}/lacuscore/__init__.py +0 -0
- {lacuscore-1.11.3 → lacuscore-1.12.0}/lacuscore/lacus_monitoring.py +0 -0
- {lacuscore-1.11.3 → lacuscore-1.12.0}/lacuscore/py.typed +0 -0
@@ -1,12 +1,12 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: lacuscore
|
3
|
-
Version: 1.
|
3
|
+
Version: 1.12.0
|
4
4
|
Summary: Core of Lacus, usable as a module
|
5
5
|
Home-page: https://github.com/ail-project/LacusCore
|
6
6
|
License: BSD-3-Clause
|
7
7
|
Author: Raphaël Vinot
|
8
8
|
Author-email: raphael.vinot@circl.lu
|
9
|
-
Requires-Python: >=3.
|
9
|
+
Requires-Python: >=3.9,<4.0
|
10
10
|
Classifier: Development Status :: 5 - Production/Stable
|
11
11
|
Classifier: Environment :: Console
|
12
12
|
Classifier: Intended Audience :: Information Technology
|
@@ -15,7 +15,6 @@ Classifier: Intended Audience :: Telecommunications Industry
|
|
15
15
|
Classifier: License :: OSI Approved :: BSD License
|
16
16
|
Classifier: Operating System :: POSIX :: Linux
|
17
17
|
Classifier: Programming Language :: Python :: 3
|
18
|
-
Classifier: Programming Language :: Python :: 3.8
|
19
18
|
Classifier: Programming Language :: Python :: 3.9
|
20
19
|
Classifier: Programming Language :: Python :: 3.10
|
21
20
|
Classifier: Programming Language :: Python :: 3.11
|
@@ -24,15 +23,13 @@ Classifier: Programming Language :: Python :: 3.13
|
|
24
23
|
Classifier: Topic :: Internet
|
25
24
|
Classifier: Topic :: Security
|
26
25
|
Provides-Extra: docs
|
27
|
-
Requires-Dist: Sphinx (<7.2) ; (python_version < "3.9") and (extra == "docs")
|
28
26
|
Requires-Dist: Sphinx (>=7.2,<8.0) ; (python_version >= "3.9" and python_version < "3.10") and (extra == "docs")
|
29
27
|
Requires-Dist: Sphinx (>=8,<9) ; (python_version >= "3.10") and (extra == "docs")
|
30
28
|
Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
|
31
29
|
Requires-Dist: defang (>=0.5.3,<0.6.0)
|
32
|
-
Requires-Dist: dnspython (
|
33
|
-
Requires-Dist: dnspython (>=2.7,<3.0) ; python_version >= "3.9"
|
30
|
+
Requires-Dist: dnspython (>=2.7.0,<3.0.0) ; python_version >= "3.9"
|
34
31
|
Requires-Dist: eval-type-backport (>=0.2.0,<0.3.0) ; python_version < "3.10"
|
35
|
-
Requires-Dist: playwrightcapture[recaptcha] (>=1.
|
32
|
+
Requires-Dist: playwrightcapture[recaptcha] (>=1.27.0,<2.0.0)
|
36
33
|
Requires-Dist: pydantic (>=2.9.2,<3.0.0)
|
37
34
|
Requires-Dist: redis[hiredis] (>=5.2.0,<6.0.0)
|
38
35
|
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
@@ -6,7 +6,8 @@ import json
|
|
6
6
|
|
7
7
|
from enum import IntEnum, unique
|
8
8
|
from logging import LoggerAdapter
|
9
|
-
from typing import
|
9
|
+
from typing import Any, TypedDict, Literal
|
10
|
+
from collections.abc import MutableMapping, Mapping
|
10
11
|
|
11
12
|
from defang import refang # type: ignore[import-untyped]
|
12
13
|
from pydantic import BaseModel, field_validator, model_validator, ValidationError
|
@@ -107,6 +108,7 @@ class CaptureSettings(BaseModel):
|
|
107
108
|
force: bool = False
|
108
109
|
recapture_interval: int = 300
|
109
110
|
priority: int = 0
|
111
|
+
max_retries: int | None = None
|
110
112
|
uuid: str | None = None
|
111
113
|
|
112
114
|
depth: int = 0
|
@@ -19,7 +19,8 @@ from base64 import b64decode, b64encode
|
|
19
19
|
from datetime import date, timedelta
|
20
20
|
from ipaddress import ip_address, IPv4Address, IPv6Address
|
21
21
|
from tempfile import NamedTemporaryFile
|
22
|
-
from typing import Literal, Any, overload, cast
|
22
|
+
from typing import Literal, Any, overload, cast
|
23
|
+
from collections.abc import Iterator
|
23
24
|
from uuid import uuid4
|
24
25
|
from urllib.parse import urlsplit
|
25
26
|
|
@@ -80,7 +81,7 @@ class LacusCore():
|
|
80
81
|
"""Capture URLs or web enabled documents using PlaywrightCapture.
|
81
82
|
|
82
83
|
:param redis_connector: Pre-configured connector to a redis instance.
|
83
|
-
:param
|
84
|
+
:param max_capture_time: If the capture takes more than that time, break (in seconds)
|
84
85
|
:param tor_proxy: URL to a SOCKS 5 tor proxy. If you have tor installed, this is the default: socks5://127.0.0.1:9050.
|
85
86
|
:param only_global_lookups: Discard captures that point to non-public IPs.
|
86
87
|
:param max_retries: How many times should we re-try a capture if it failed.
|
@@ -137,6 +138,7 @@ class LacusCore():
|
|
137
138
|
rendered_hostname_only: bool=True,
|
138
139
|
with_favicon: bool=False,
|
139
140
|
allow_tracking: bool=False,
|
141
|
+
max_retries: int | None=None,
|
140
142
|
force: bool=False,
|
141
143
|
recapture_interval: int=300,
|
142
144
|
priority: int=0,
|
@@ -166,6 +168,7 @@ class LacusCore():
|
|
166
168
|
rendered_hostname_only: bool=True,
|
167
169
|
with_favicon: bool=False,
|
168
170
|
allow_tracking: bool=False,
|
171
|
+
max_retries: int | None=None,
|
169
172
|
force: bool=False,
|
170
173
|
recapture_interval: int=300,
|
171
174
|
priority: int=0,
|
@@ -197,6 +200,8 @@ class LacusCore():
|
|
197
200
|
:param rendered_hostname_only: If depth > 0: only capture URLs with the same hostname as the rendered page
|
198
201
|
:param with_favicon: If True, PlaywrightCapture will attempt to get the potential favicons for the rendered URL. It is a dirty trick, see this issue for details: https://github.com/Lookyloo/PlaywrightCapture/issues/45
|
199
202
|
:param allow_tracking: If True, PlaywrightCapture will attempt to click through the cookie banners. It is totally dependent on the framework used on the website.
|
203
|
+
:param max_retries: The maximum amount of retries for this capture
|
204
|
+
|
200
205
|
:param force: Force recapture, even if the same one was already done within the recapture_interval
|
201
206
|
:param recapture_interval: The time the enqueued settings are kept in memory to avoid duplicates
|
202
207
|
:param priority: The priority of the capture
|
@@ -215,7 +220,7 @@ class LacusCore():
|
|
215
220
|
'timezone_id': timezone_id, 'locale': locale,
|
216
221
|
'color_scheme': color_scheme, 'java_script_enabled': java_script_enabled,
|
217
222
|
'viewport': viewport, 'referer': referer, 'with_favicon': with_favicon,
|
218
|
-
'allow_tracking': allow_tracking}
|
223
|
+
'allow_tracking': allow_tracking, 'max_retries': max_retries}
|
219
224
|
|
220
225
|
try:
|
221
226
|
to_enqueue = CaptureSettings(**settings)
|
@@ -469,6 +474,8 @@ class LacusCore():
|
|
469
474
|
cookie['path'] = '/'
|
470
475
|
cookies.append(cookie)
|
471
476
|
|
477
|
+
# Use the lower of the max_retries provided in the settings and the one the class was initialized with; if the settings provide none, use the class value
|
478
|
+
max_retries = min([to_capture.max_retries, self.max_retries]) if to_capture.max_retries is not None else self.max_retries
|
472
479
|
try:
|
473
480
|
logger.debug(f'Capturing {url}')
|
474
481
|
stats_pipeline.sadd(f'stats:{today}:captures', url)
|
@@ -544,25 +551,29 @@ class LacusCore():
|
|
544
551
|
# this is a retry that worked
|
545
552
|
stats_pipeline.sadd(f'stats:{today}:retry_success', url)
|
546
553
|
except RetryCapture:
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
# No retry yet
|
551
|
-
logger.debug(f'Retrying {url} for the first time.')
|
552
|
-
retry = True
|
553
|
-
self.redis.setex(f'lacus:capture_retry:{uuid}',
|
554
|
-
self.max_capture_time * (self.max_retries + 10),
|
555
|
-
self.max_retries)
|
554
|
+
if max_retries == 0:
|
555
|
+
error_msg = result['error'] if result.get('error') else 'Unknown error'
|
556
|
+
logger.info(f'Retries disabled for {url}: {error_msg}')
|
556
557
|
else:
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
558
|
+
# Check if we already re-tried this capture
|
559
|
+
_current_retry = self.redis.get(f'lacus:capture_retry:{uuid}')
|
560
|
+
if _current_retry is None:
|
561
|
+
# No retry yet
|
562
|
+
logger.debug(f'Retrying {url} for the first time.')
|
561
563
|
retry = True
|
564
|
+
self.redis.setex(f'lacus:capture_retry:{uuid}',
|
565
|
+
self.max_capture_time * (max_retries + 10),
|
566
|
+
max_retries - 1)
|
562
567
|
else:
|
563
|
-
|
564
|
-
|
565
|
-
|
568
|
+
current_retry = int(_current_retry.decode())
|
569
|
+
if current_retry > 0:
|
570
|
+
logger.debug(f'Retrying {url} for the {max_retries - current_retry + 1} time.')
|
571
|
+
self.redis.decr(f'lacus:capture_retry:{uuid}')
|
572
|
+
retry = True
|
573
|
+
else:
|
574
|
+
error_msg = result['error'] if result.get('error') else 'Unknown error'
|
575
|
+
logger.info(f'Retried too many times {url}: {error_msg}')
|
576
|
+
stats_pipeline.sadd(f'stats:{today}:retry_failed', url)
|
566
577
|
except CaptureError:
|
567
578
|
if not result:
|
568
579
|
result = {'error': "No result key, shouldn't happen"}
|
@@ -2,7 +2,8 @@
|
|
2
2
|
|
3
3
|
from __future__ import annotations
|
4
4
|
|
5
|
-
from typing import Any,
|
5
|
+
from typing import Any, TypeVar
|
6
|
+
from collections.abc import Coroutine
|
6
7
|
|
7
8
|
import asyncio
|
8
9
|
import functools
|
@@ -18,12 +19,12 @@ def create_task(
|
|
18
19
|
coroutine: Coroutine[Any, Any, T],
|
19
20
|
*,
|
20
21
|
name: str,
|
21
|
-
logger:
|
22
|
+
logger: LacusCoreLogAdapter,
|
22
23
|
message: str,
|
23
|
-
message_args:
|
24
|
-
loop:
|
24
|
+
message_args: tuple[Any, ...] = (),
|
25
|
+
loop: asyncio.AbstractEventLoop | None = None,
|
25
26
|
|
26
|
-
) ->
|
27
|
+
) -> asyncio.Task[T]: # This type annotation has to be quoted for Python < 3.9, see https://www.python.org/dev/peps/pep-0585/
|
27
28
|
'''
|
28
29
|
This helper function wraps a ``loop.create_task(coroutine())`` call and ensures there is
|
29
30
|
an exception handler added to the resulting task. If the task raises an exception it is logged
|
@@ -42,9 +43,9 @@ def create_task(
|
|
42
43
|
def _handle_task_result(
|
43
44
|
task: asyncio.Task[Any],
|
44
45
|
*,
|
45
|
-
logger:
|
46
|
+
logger: LacusCoreLogAdapter,
|
46
47
|
message: str,
|
47
|
-
message_args:
|
48
|
+
message_args: tuple[Any, ...] = (),
|
48
49
|
) -> None:
|
49
50
|
try:
|
50
51
|
task.result()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "lacuscore"
|
3
|
-
version = "1.
|
3
|
+
version = "1.12.0"
|
4
4
|
description = "Core of Lacus, usable as a module"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -18,7 +18,6 @@ classifiers = [
|
|
18
18
|
'Intended Audience :: Telecommunications Industry',
|
19
19
|
'Intended Audience :: Information Technology',
|
20
20
|
'Programming Language :: Python :: 3',
|
21
|
-
'Programming Language :: Python :: 3.8',
|
22
21
|
'Programming Language :: Python :: 3.9',
|
23
22
|
'Programming Language :: Python :: 3.10',
|
24
23
|
'Programming Language :: Python :: 3.11',
|
@@ -29,21 +28,17 @@ classifiers = [
|
|
29
28
|
]
|
30
29
|
|
31
30
|
[tool.poetry.dependencies]
|
32
|
-
python = "^3.
|
31
|
+
python = "^3.9"
|
33
32
|
requests = "^2.32.3"
|
34
33
|
Sphinx = [
|
35
|
-
{version = "<7.2", python = "<3.9", optional = true},
|
36
34
|
{version = "^7.2", python = ">=3.9,<3.10", optional = true},
|
37
35
|
{version = "^8", python = ">=3.10", optional = true}
|
38
36
|
]
|
39
|
-
playwrightcapture = {extras = ["recaptcha"], version = "^1.
|
37
|
+
playwrightcapture = {extras = ["recaptcha"], version = "^1.27.0"}
|
40
38
|
defang = "^0.5.3"
|
41
39
|
ua-parser = "^0.18.0"
|
42
40
|
redis = {version = "^5.2.0", extras = ["hiredis"]}
|
43
|
-
dnspython =
|
44
|
-
{version = "<2.7", python = "<3.9"},
|
45
|
-
{version = "^2.7", python = ">=3.9"}
|
46
|
-
]
|
41
|
+
dnspython = {version = "^2.7.0", python = ">=3.9"}
|
47
42
|
async-timeout = {version = "^4.0.3", python = "<3.11"}
|
48
43
|
pydantic = "^2.9.2"
|
49
44
|
eval-type-backport = {version = "^0.2.0", python = "<3.10"}
|
@@ -57,7 +52,6 @@ types-redis = {version = "^4.6.0.20241004"}
|
|
57
52
|
types-requests = "^2.32.0.20241016"
|
58
53
|
types-beautifulsoup4 = "^4.12.0.20241020"
|
59
54
|
ipython = [
|
60
|
-
{version = "<8.13.0", python = "<3.9"},
|
61
55
|
{version = "^8.18.0", python = ">=3.9"},
|
62
56
|
{version = "^8.19.0", python = ">=3.10"}
|
63
57
|
]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|