lacuscore 1.17.2__py3-none-any.whl → 1.17.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lacuscore/helpers.py +2 -0
- lacuscore/lacuscore.py +48 -7
- {lacuscore-1.17.2.dist-info → lacuscore-1.17.4.dist-info}/METADATA +4 -3
- lacuscore-1.17.4.dist-info/RECORD +10 -0
- lacuscore-1.17.2.dist-info/RECORD +0 -10
- {lacuscore-1.17.2.dist-info → lacuscore-1.17.4.dist-info}/LICENSE +0 -0
- {lacuscore-1.17.2.dist-info → lacuscore-1.17.4.dist-info}/WHEEL +0 -0
lacuscore/helpers.py
CHANGED
@@ -88,6 +88,7 @@ class CaptureResponseJson(TypedDict, total=False):
|
|
88
88
|
downloaded_filename: str | None
|
89
89
|
downloaded_file: str | None
|
90
90
|
children: list[CaptureResponseJson] | None
|
91
|
+
trusted_timestamps: dict[str, str] | None
|
91
92
|
runtime: float | None
|
92
93
|
potential_favicons: list[str] | None
|
93
94
|
|
@@ -122,6 +123,7 @@ class CaptureSettings(BaseModel):
|
|
122
123
|
allow_tracking: bool = False
|
123
124
|
headless: bool = True
|
124
125
|
init_script: str | None = None
|
126
|
+
with_trusted_timestamps: bool = False
|
125
127
|
force: bool = False
|
126
128
|
recapture_interval: int = 300
|
127
129
|
priority: int = 0
|
lacuscore/lacuscore.py
CHANGED
@@ -9,6 +9,7 @@ import os
|
|
9
9
|
import pickle
|
10
10
|
import random
|
11
11
|
import re
|
12
|
+
import socket
|
12
13
|
import sys
|
13
14
|
import time
|
14
15
|
import unicodedata
|
@@ -30,7 +31,7 @@ from dns.asyncresolver import Resolver
|
|
30
31
|
from dns.exception import DNSException
|
31
32
|
from dns.exception import Timeout as DNSTimeout
|
32
33
|
|
33
|
-
from playwrightcapture import Capture, PlaywrightCaptureException, InvalidPlaywrightParameter
|
34
|
+
from playwrightcapture import Capture, PlaywrightCaptureException, InvalidPlaywrightParameter, TrustedTimestampSettings
|
34
35
|
from pydantic import ValidationError
|
35
36
|
from redis import Redis
|
36
37
|
from redis.exceptions import ConnectionError as RedisConnectionError
|
@@ -38,6 +39,7 @@ from redis.exceptions import DataError
|
|
38
39
|
|
39
40
|
from . import task_logger
|
40
41
|
from .helpers import (
|
42
|
+
LacusCoreException,
|
41
43
|
LacusCoreLogAdapter, CaptureError, RetryCapture, CaptureSettingsError,
|
42
44
|
CaptureStatus, CaptureResponse, CaptureResponseJson, CaptureSettings)
|
43
45
|
|
@@ -80,6 +82,19 @@ def _secure_filename(filename: str) -> str:
|
|
80
82
|
return filename
|
81
83
|
|
82
84
|
|
85
|
+
def _check_proxy_port_open(proxy: dict[str, str] | str) -> bool:
    """Check whether the TCP port of a pre-defined proxy accepts connections.

    :param proxy: Either the proxy URL itself, or a dict whose ``server`` key holds the proxy URL.
    :return: True if a TCP connection to the proxy host/port could be opened
             (3 seconds timeout per resolved address), False if it is refused,
             times out, or the hostname cannot be resolved.
    :raises LacusCoreException: If the proxy URL cannot be parsed, or lacks a hostname or a port.
    """
    to_check = proxy['server'] if isinstance(proxy, dict) else proxy
    try:
        splitted_proxy_url = urlsplit(to_check)
        hostname = splitted_proxy_url.hostname
        # Accessing .port raises ValueError on a non-numeric or out-of-range port.
        port = splitted_proxy_url.port
    except ValueError as e:
        raise LacusCoreException(f'Invalid pre-defined proxy (needs hostname and port): {to_check}') from e
    if not hostname or not port:
        # Report only the server URL rather than the whole proxy mapping.
        # NOTE(review): presumably the mapping can carry credentials - confirm against callers.
        raise LacusCoreException(f'Invalid pre-defined proxy (needs hostname and port): {to_check}')
    try:
        # create_connection resolves the hostname and tries every returned
        # address (IPv4 and IPv6), unlike a socket hard-wired to AF_INET.
        with socket.create_connection((hostname, port), timeout=3):
            return True
    except OSError:
        # Covers refused connections, timeouts and name resolution failures:
        # in all these cases the proxy is unusable.
        return False
|
96
|
+
|
97
|
+
|
83
98
|
class LacusCore():
|
84
99
|
"""Capture URLs or web enabled documents using PlaywrightCapture.
|
85
100
|
|
@@ -95,11 +110,12 @@ class LacusCore():
|
|
95
110
|
def __init__(self, redis_connector: Redis[bytes], /, *,
|
96
111
|
max_capture_time: int=3600,
|
97
112
|
expire_results: int=36000,
|
98
|
-
tor_proxy: str | None=None,
|
99
|
-
i2p_proxy: str | None=None,
|
113
|
+
tor_proxy: dict[str, str] | str | None=None,
|
114
|
+
i2p_proxy: dict[str, str] | str | None=None,
|
100
115
|
only_global_lookups: bool=True,
|
101
116
|
max_retries: int=3,
|
102
117
|
headed_allowed: bool=False,
|
118
|
+
tt_settings: TrustedTimestampSettings | None=None,
|
103
119
|
loglevel: str | int='INFO') -> None:
|
104
120
|
self.master_logger = logging.getLogger(f'{self.__class__.__name__}')
|
105
121
|
self.master_logger.setLevel(loglevel)
|
@@ -107,8 +123,12 @@ class LacusCore():
|
|
107
123
|
self.redis = redis_connector
|
108
124
|
self.max_capture_time = max_capture_time
|
109
125
|
self.expire_results = expire_results
|
126
|
+
|
110
127
|
self.tor_proxy = tor_proxy
|
111
128
|
self.i2p_proxy = i2p_proxy
|
129
|
+
|
130
|
+
self.tt_settings = tt_settings
|
131
|
+
|
112
132
|
self.only_global_lookups = only_global_lookups
|
113
133
|
self.max_retries = max_retries
|
114
134
|
self.headed_allowed = headed_allowed
|
@@ -150,6 +170,7 @@ class LacusCore():
|
|
150
170
|
rendered_hostname_only: bool=True,
|
151
171
|
with_screenshot: bool=True,
|
152
172
|
with_favicon: bool=False,
|
173
|
+
with_trusted_timestamps: bool=False,
|
153
174
|
allow_tracking: bool=False,
|
154
175
|
headless: bool=True,
|
155
176
|
max_retries: int | None=None,
|
@@ -185,6 +206,7 @@ class LacusCore():
|
|
185
206
|
rendered_hostname_only: bool=True,
|
186
207
|
with_screenshot: bool=True,
|
187
208
|
with_favicon: bool=False,
|
209
|
+
with_trusted_timestamps: bool=False,
|
188
210
|
allow_tracking: bool=False,
|
189
211
|
headless: bool=True,
|
190
212
|
max_retries: int | None=None,
|
@@ -222,6 +244,7 @@ class LacusCore():
|
|
222
244
|
:param rendered_hostname_only: If depth > 0: only capture URLs with the same hostname as the rendered page
|
223
245
|
:param with_screenshot: If False, PlaywrightCapture won't take a screenshot of the rendered URL
|
224
246
|
:param with_favicon: If True, PlaywrightCapture will attempt to get the potential favicons for the rendered URL. It is a dirty trick, see this issue for details: https://github.com/Lookyloo/PlaywrightCapture/issues/45
|
247
|
+
:param with_trusted_timestamps: If True, PlaywrightCapture will trigger calls to a remote timestamp service. For that to work, this class must have been initialized with tt_settings. See RFC3161 for details: https://www.rfc-editor.org/rfc/rfc3161
|
225
248
|
:param allow_tracking: If True, PlaywrightCapture will attempt to click through the cookie banners. It is totally dependent on the framework used on the website.
|
226
249
|
:param headless: Whether to run the browser in headless mode. WARNING: requires to run in a graphical environment.
|
227
250
|
:param max_retries: The maximum amount of retries for this capture
|
@@ -246,6 +269,7 @@ class LacusCore():
|
|
246
269
|
'color_scheme': color_scheme, 'java_script_enabled': java_script_enabled,
|
247
270
|
'viewport': viewport, 'referer': referer,
|
248
271
|
'with_screenshot': with_screenshot, 'with_favicon': with_favicon,
|
272
|
+
'with_trusted_timestamps': with_trusted_timestamps,
|
249
273
|
'allow_tracking': allow_tracking,
|
250
274
|
# Quietly force it to true if headed is not allowed.
|
251
275
|
'headless': headless if self.headed_allowed else True,
|
@@ -277,6 +301,10 @@ class LacusCore():
|
|
277
301
|
else:
|
278
302
|
perma_uuid = str(uuid4())
|
279
303
|
|
304
|
+
if to_enqueue.with_trusted_timestamps and not self.tt_settings:
|
305
|
+
self.master_logger.warning('Cannot trigger trusted timestamp, the remote timestamper service settings are missing.')
|
306
|
+
to_enqueue.with_trusted_timestamps = False
|
307
|
+
|
280
308
|
p = self.redis.pipeline()
|
281
309
|
p.set(f'lacus:query_hash:{hash_query}', perma_uuid, nx=True, ex=recapture_interval)
|
282
310
|
p.hset(f'lacus:capture_settings:{perma_uuid}', mapping=to_enqueue.redis_dump())
|
@@ -453,18 +481,26 @@ class LacusCore():
|
|
453
481
|
proxy = to_capture.proxy
|
454
482
|
if self.tor_proxy:
|
455
483
|
# check if onion or forced
|
456
|
-
if (proxy == 'force_tor' # if the proxy is set to "force_tor", we use the pre-configured tor proxy, regardless the URL.
|
484
|
+
if (proxy == 'force_tor' # if the proxy is set to "force_tor", we use the pre-configured tor proxy, regardless the URL, legacy feature.
|
457
485
|
or (not proxy # if the TLD is "onion", we use the pre-configured tor proxy
|
458
486
|
and splitted_url.netloc
|
459
487
|
and splitted_url.hostname
|
460
488
|
and splitted_url.hostname.split('.')[-1] == 'onion')):
|
489
|
+
if not _check_proxy_port_open(self.tor_proxy):
|
490
|
+
logger.critical(f'Unable to connect to the default tor proxy: {self.tor_proxy}')
|
491
|
+
raise CaptureError('The selected tor proxy is unreachable, unable to run the capture.')
|
461
492
|
proxy = self.tor_proxy
|
462
|
-
|
493
|
+
logger.info('Using the default tor proxy.')
|
494
|
+
if self.i2p_proxy:
|
463
495
|
if (not proxy # if the TLD is "i2p", we use the pre-configured I2P proxy
|
464
496
|
and splitted_url.netloc
|
465
497
|
and splitted_url.hostname
|
466
498
|
and splitted_url.hostname.split('.')[-1] == 'i2p'):
|
499
|
+
if not _check_proxy_port_open(self.i2p_proxy):
|
500
|
+
logger.critical(f'Unable to connect to the default tor proxy: {self.i2p_proxy}')
|
501
|
+
raise CaptureError('The selected I2P proxy is unreachable, unable to run the capture.')
|
467
502
|
proxy = self.i2p_proxy
|
503
|
+
logger.info('Using the default I2P proxy.')
|
468
504
|
|
469
505
|
if self.only_global_lookups and not proxy and splitted_url.scheme not in ['data', 'file']:
|
470
506
|
# not relevant if we also have a proxy, or the thing to capture is a data URI or a file on disk
|
@@ -543,6 +579,7 @@ class LacusCore():
|
|
543
579
|
loglevel=self.master_logger.getEffectiveLevel(),
|
544
580
|
headless=to_capture.headless,
|
545
581
|
init_script=to_capture.init_script,
|
582
|
+
tt_settings=self.tt_settings,
|
546
583
|
uuid=uuid) as capture:
|
547
584
|
# required by Mypy: https://github.com/python/mypy/issues/3004
|
548
585
|
capture.headers = to_capture.headers
|
@@ -576,6 +613,7 @@ class LacusCore():
|
|
576
613
|
with_screenshot=to_capture.with_screenshot,
|
577
614
|
with_favicon=to_capture.with_favicon,
|
578
615
|
allow_tracking=to_capture.allow_tracking,
|
616
|
+
with_trusted_timestamps=to_capture.with_trusted_timestamps,
|
579
617
|
max_depth_capture_time=self.max_capture_time)
|
580
618
|
except (TimeoutError, asyncio.exceptions.TimeoutError):
|
581
619
|
timeout_expired(capture_timeout, logger, 'Capture took too long.')
|
@@ -732,6 +770,8 @@ class LacusCore():
|
|
732
770
|
if results.get('html') and results['html'] is not None:
|
733
771
|
# Need to avoid unicode encode errors, and surrogates are not allowed
|
734
772
|
hash_to_set['html'] = results['html'].encode('utf-8', 'surrogateescape')
|
773
|
+
if results.get('trusted_timestamps'):
|
774
|
+
hash_to_set['trusted_timestamps'] = pickle.dumps(results['trusted_timestamps'])
|
735
775
|
if 'children' in results and results['children'] is not None:
|
736
776
|
padding_length = len(str(len(results['children'])))
|
737
777
|
children = set()
|
@@ -746,11 +786,10 @@ class LacusCore():
|
|
746
786
|
hash_to_set['children'] = pickle.dumps(children)
|
747
787
|
|
748
788
|
for key in results.keys():
|
749
|
-
if key in ['har', 'cookies', 'storage', 'potential_favicons', 'html', 'children'] or not results.get(key):
|
789
|
+
if key in ['har', 'cookies', 'storage', 'trusted_timestamps', 'potential_favicons', 'html', 'children'] or not results.get(key):
|
750
790
|
continue
|
751
791
|
# these entries can be stored directly
|
752
792
|
hash_to_set[key] = results[key] # type: ignore[literal-required]
|
753
|
-
|
754
793
|
if hash_to_set:
|
755
794
|
pipeline.hset(root_key, mapping=hash_to_set) # type: ignore[arg-type]
|
756
795
|
# Make sure the key expires
|
@@ -775,6 +814,8 @@ class LacusCore():
|
|
775
814
|
to_return['storage'] = pickle.loads(value)
|
776
815
|
elif key == b'potential_favicons':
|
777
816
|
to_return['potential_favicons'] = pickle.loads(value)
|
817
|
+
elif key == b'trusted_timestamps':
|
818
|
+
to_return['trusted_timestamps'] = pickle.loads(value)
|
778
819
|
elif key == b'children':
|
779
820
|
to_return['children'] = []
|
780
821
|
for child_root_key in sorted(pickle.loads(value)):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: lacuscore
|
3
|
-
Version: 1.17.2
|
3
|
+
Version: 1.17.4
|
4
4
|
Summary: Core of Lacus, usable as a module
|
5
5
|
License: BSD-3-Clause
|
6
6
|
Author: Raphaël Vinot
|
@@ -24,9 +24,10 @@ Provides-Extra: docs
|
|
24
24
|
Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs")
|
25
25
|
Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
|
26
26
|
Requires-Dist: defang (>=0.5.3)
|
27
|
-
Requires-Dist: dnspython (
|
27
|
+
Requires-Dist: dnspython (<2.8) ; python_version < "3.10"
|
28
|
+
Requires-Dist: dnspython (>=2.8.0) ; python_version >= "3.10"
|
28
29
|
Requires-Dist: eval-type-backport (>=0.2.2) ; python_version < "3.10"
|
29
|
-
Requires-Dist: playwrightcapture[recaptcha] (>=1.32.
|
30
|
+
Requires-Dist: playwrightcapture[recaptcha] (>=1.32.2)
|
30
31
|
Requires-Dist: pydantic (>=2.11.7)
|
31
32
|
Requires-Dist: redis[hiredis] (>=5.3.0,<6.0.0)
|
32
33
|
Requires-Dist: requests (>=2.32.5)
|
@@ -0,0 +1,10 @@
|
|
1
|
+
lacuscore/__init__.py,sha256=aLBshQPT9IBDKn5qWrX9A_exqtLFPyLsQiPWdfpAFjA,537
|
2
|
+
lacuscore/helpers.py,sha256=E_pgZ6vgtOkb_NCIjgy1c041CSy00arL739hPHVxlS8,14184
|
3
|
+
lacuscore/lacus_monitoring.py,sha256=r6IaYuh6sMq43eOWdZx0fU8p4PWVZlqSD6nr6yOaTUU,2713
|
4
|
+
lacuscore/lacuscore.py,sha256=jrFGBFSYTWRquZsZx7KQAfDc8KTT_nfu7-c6mnrh-pk,48275
|
5
|
+
lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
lacuscore/task_logger.py,sha256=2wDotU6r6vn-aKO8nZNdxSuisSj11LlcxuvW60qPL0Y,1909
|
7
|
+
lacuscore-1.17.4.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
|
8
|
+
lacuscore-1.17.4.dist-info/METADATA,sha256=LCRvwvOmSz65zlMO1iWUjpJ5lGnPVhGm8uRQ1juGBm4,2739
|
9
|
+
lacuscore-1.17.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
10
|
+
lacuscore-1.17.4.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
lacuscore/__init__.py,sha256=aLBshQPT9IBDKn5qWrX9A_exqtLFPyLsQiPWdfpAFjA,537
|
2
|
-
lacuscore/helpers.py,sha256=dTt-FM7SnwEgIeGTCAvREwa1K224iNb16mmaCwcY_eg,14096
|
3
|
-
lacuscore/lacus_monitoring.py,sha256=r6IaYuh6sMq43eOWdZx0fU8p4PWVZlqSD6nr6yOaTUU,2713
|
4
|
-
lacuscore/lacuscore.py,sha256=fU8rojUgdxHlLjMwqDcZetwMBPVEVAS341g4xcusK4g,45682
|
5
|
-
lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
lacuscore/task_logger.py,sha256=2wDotU6r6vn-aKO8nZNdxSuisSj11LlcxuvW60qPL0Y,1909
|
7
|
-
lacuscore-1.17.2.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
|
8
|
-
lacuscore-1.17.2.dist-info/METADATA,sha256=t83EragMDEZzYsfX53l1bGbiiItE8hUTVM6nm7ESc70,2654
|
9
|
-
lacuscore-1.17.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
10
|
-
lacuscore-1.17.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|