lacuscore 1.17.2__py3-none-any.whl → 1.17.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lacuscore/helpers.py CHANGED
@@ -88,6 +88,7 @@ class CaptureResponseJson(TypedDict, total=False):
88
88
  downloaded_filename: str | None
89
89
  downloaded_file: str | None
90
90
  children: list[CaptureResponseJson] | None
91
+ trusted_timestamps: dict[str, str] | None
91
92
  runtime: float | None
92
93
  potential_favicons: list[str] | None
93
94
 
@@ -122,6 +123,7 @@ class CaptureSettings(BaseModel):
122
123
  allow_tracking: bool = False
123
124
  headless: bool = True
124
125
  init_script: str | None = None
126
+ with_trusted_timestamps: bool = False
125
127
  force: bool = False
126
128
  recapture_interval: int = 300
127
129
  priority: int = 0
lacuscore/lacuscore.py CHANGED
@@ -9,6 +9,7 @@ import os
9
9
  import pickle
10
10
  import random
11
11
  import re
12
+ import socket
12
13
  import sys
13
14
  import time
14
15
  import unicodedata
@@ -30,7 +31,7 @@ from dns.asyncresolver import Resolver
30
31
  from dns.exception import DNSException
31
32
  from dns.exception import Timeout as DNSTimeout
32
33
 
33
- from playwrightcapture import Capture, PlaywrightCaptureException, InvalidPlaywrightParameter
34
+ from playwrightcapture import Capture, PlaywrightCaptureException, InvalidPlaywrightParameter, TrustedTimestampSettings
34
35
  from pydantic import ValidationError
35
36
  from redis import Redis
36
37
  from redis.exceptions import ConnectionError as RedisConnectionError
@@ -38,6 +39,7 @@ from redis.exceptions import DataError
38
39
 
39
40
  from . import task_logger
40
41
  from .helpers import (
42
+ LacusCoreException,
41
43
  LacusCoreLogAdapter, CaptureError, RetryCapture, CaptureSettingsError,
42
44
  CaptureStatus, CaptureResponse, CaptureResponseJson, CaptureSettings)
43
45
 
@@ -80,6 +82,19 @@ def _secure_filename(filename: str) -> str:
80
82
  return filename
81
83
 
82
84
 
85
def _check_proxy_port_open(proxy: dict[str, str] | str) -> bool:
    """Check whether the TCP port of a pre-defined proxy accepts connections.

    :param proxy: Either the proxy URL itself, or a dict holding that URL under the 'server' key.
    :return: True if a TCP connection to the proxy host and port succeeds within 3 seconds, False otherwise.
    :raises LacusCoreException: If the proxy URL has no hostname, or no usable port.
    """
    to_check = proxy['server'] if isinstance(proxy, dict) else proxy
    splitted_proxy_url = urlsplit(to_check)
    hostname = splitted_proxy_url.hostname
    try:
        # Accessing .port raises ValueError when the port is not a valid number.
        port = splitted_proxy_url.port
    except ValueError as e:
        raise LacusCoreException(f'Invalid pre-defined proxy (needs hostname and port): {to_check}') from e
    if not hostname or not port:
        # Only the server URL is interpolated: the dict form of the proxy may carry extra keys
        # that do not belong in an error message.
        raise LacusCoreException(f'Invalid pre-defined proxy (needs hostname and port): {to_check}')
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(3)
        try:
            return s.connect_ex((hostname, port)) == 0
        except OSError:
            # connect_ex only turns errors of the C-level connect() into a return code,
            # a failed hostname lookup still raises: treat it as "not reachable".
            return False
96
+
97
+
83
98
  class LacusCore():
84
99
  """Capture URLs or web enabled documents using PlaywrightCapture.
85
100
 
@@ -95,11 +110,12 @@ class LacusCore():
95
110
  def __init__(self, redis_connector: Redis[bytes], /, *,
96
111
  max_capture_time: int=3600,
97
112
  expire_results: int=36000,
98
- tor_proxy: str | None=None,
99
- i2p_proxy: str | None=None,
113
+ tor_proxy: dict[str, str] | str | None=None,
114
+ i2p_proxy: dict[str, str] | str | None=None,
100
115
  only_global_lookups: bool=True,
101
116
  max_retries: int=3,
102
117
  headed_allowed: bool=False,
118
+ tt_settings: TrustedTimestampSettings | None=None,
103
119
  loglevel: str | int='INFO') -> None:
104
120
  self.master_logger = logging.getLogger(f'{self.__class__.__name__}')
105
121
  self.master_logger.setLevel(loglevel)
@@ -107,8 +123,12 @@ class LacusCore():
107
123
  self.redis = redis_connector
108
124
  self.max_capture_time = max_capture_time
109
125
  self.expire_results = expire_results
126
+
110
127
  self.tor_proxy = tor_proxy
111
128
  self.i2p_proxy = i2p_proxy
129
+
130
+ self.tt_settings = tt_settings
131
+
112
132
  self.only_global_lookups = only_global_lookups
113
133
  self.max_retries = max_retries
114
134
  self.headed_allowed = headed_allowed
@@ -150,6 +170,7 @@ class LacusCore():
150
170
  rendered_hostname_only: bool=True,
151
171
  with_screenshot: bool=True,
152
172
  with_favicon: bool=False,
173
+ with_trusted_timestamps: bool=False,
153
174
  allow_tracking: bool=False,
154
175
  headless: bool=True,
155
176
  max_retries: int | None=None,
@@ -185,6 +206,7 @@ class LacusCore():
185
206
  rendered_hostname_only: bool=True,
186
207
  with_screenshot: bool=True,
187
208
  with_favicon: bool=False,
209
+ with_trusted_timestamps: bool=False,
188
210
  allow_tracking: bool=False,
189
211
  headless: bool=True,
190
212
  max_retries: int | None=None,
@@ -222,6 +244,7 @@ class LacusCore():
222
244
  :param rendered_hostname_only: If depth > 0: only capture URLs with the same hostname as the rendered page
223
245
  :param with_screenshot: If False, PlaywrightCapture won't take a screenshot of the rendered URL
224
246
  :param with_favicon: If True, PlaywrightCapture will attempt to get the potential favicons for the rendered URL. It is a dirty trick, see this issue for details: https://github.com/Lookyloo/PlaywrightCapture/issues/45
247
+ :param with_trusted_timestamps: If True, PlaywrightCapture will trigger calls to a remote timestamp service. For that to work, this class must have been initialized with tt_settings. See RFC3161 for details: https://www.rfc-editor.org/rfc/rfc3161
225
248
  :param allow_tracking: If True, PlaywrightCapture will attempt to click through the cookie banners. It is totally dependent on the framework used on the website.
226
249
  :param headless: Whether to run the browser in headless mode. WARNING: requires to run in a graphical environment.
227
250
  :param max_retries: The maximum amount of retries for this capture
@@ -246,6 +269,7 @@ class LacusCore():
246
269
  'color_scheme': color_scheme, 'java_script_enabled': java_script_enabled,
247
270
  'viewport': viewport, 'referer': referer,
248
271
  'with_screenshot': with_screenshot, 'with_favicon': with_favicon,
272
+ 'with_trusted_timestamps': with_trusted_timestamps,
249
273
  'allow_tracking': allow_tracking,
250
274
  # Quietly force it to true if headed is not allowed.
251
275
  'headless': headless if self.headed_allowed else True,
@@ -277,6 +301,10 @@ class LacusCore():
277
301
  else:
278
302
  perma_uuid = str(uuid4())
279
303
 
304
+ if to_enqueue.with_trusted_timestamps and not self.tt_settings:
305
+ self.master_logger.warning('Cannot trigger trusted timestamp, the remote timestamper service settings are missing.')
306
+ to_enqueue.with_trusted_timestamps = False
307
+
280
308
  p = self.redis.pipeline()
281
309
  p.set(f'lacus:query_hash:{hash_query}', perma_uuid, nx=True, ex=recapture_interval)
282
310
  p.hset(f'lacus:capture_settings:{perma_uuid}', mapping=to_enqueue.redis_dump())
@@ -453,18 +481,26 @@ class LacusCore():
453
481
  proxy = to_capture.proxy
454
482
  if self.tor_proxy:
455
483
  # check if onion or forced
456
- if (proxy == 'force_tor' # if the proxy is set to "force_tor", we use the pre-configured tor proxy, regardless the URL.
484
+ if (proxy == 'force_tor' # if the proxy is set to "force_tor", we use the pre-configured tor proxy regardless of the URL (legacy feature).
457
485
  or (not proxy # if the TLD is "onion", we use the pre-configured tor proxy
458
486
  and splitted_url.netloc
459
487
  and splitted_url.hostname
460
488
  and splitted_url.hostname.split('.')[-1] == 'onion')):
489
+ if not _check_proxy_port_open(self.tor_proxy):
490
+ logger.critical(f'Unable to connect to the default tor proxy: {self.tor_proxy}')
491
+ raise CaptureError('The selected tor proxy is unreachable, unable to run the capture.')
461
492
  proxy = self.tor_proxy
462
- elif self.i2p_proxy:
493
+ logger.info('Using the default tor proxy.')
494
+ if self.i2p_proxy:
463
495
  if (not proxy # if the TLD is "i2p", we use the pre-configured I2P proxy
464
496
  and splitted_url.netloc
465
497
  and splitted_url.hostname
466
498
  and splitted_url.hostname.split('.')[-1] == 'i2p'):
499
+ if not _check_proxy_port_open(self.i2p_proxy):
500
+ logger.critical(f'Unable to connect to the default tor proxy: {self.i2p_proxy}')
501
+ raise CaptureError('The selected I2P proxy is unreachable, unable to run the capture.')
467
502
  proxy = self.i2p_proxy
503
+ logger.info('Using the default I2P proxy.')
468
504
 
469
505
  if self.only_global_lookups and not proxy and splitted_url.scheme not in ['data', 'file']:
470
506
  # not relevant if we also have a proxy, or the thing to capture is a data URI or a file on disk
@@ -543,6 +579,7 @@ class LacusCore():
543
579
  loglevel=self.master_logger.getEffectiveLevel(),
544
580
  headless=to_capture.headless,
545
581
  init_script=to_capture.init_script,
582
+ tt_settings=self.tt_settings,
546
583
  uuid=uuid) as capture:
547
584
  # required by Mypy: https://github.com/python/mypy/issues/3004
548
585
  capture.headers = to_capture.headers
@@ -576,6 +613,7 @@ class LacusCore():
576
613
  with_screenshot=to_capture.with_screenshot,
577
614
  with_favicon=to_capture.with_favicon,
578
615
  allow_tracking=to_capture.allow_tracking,
616
+ with_trusted_timestamps=to_capture.with_trusted_timestamps,
579
617
  max_depth_capture_time=self.max_capture_time)
580
618
  except (TimeoutError, asyncio.exceptions.TimeoutError):
581
619
  timeout_expired(capture_timeout, logger, 'Capture took too long.')
@@ -732,6 +770,8 @@ class LacusCore():
732
770
  if results.get('html') and results['html'] is not None:
733
771
  # Need to avoid unicode encore errors, and surrogates are not allowed
734
772
  hash_to_set['html'] = results['html'].encode('utf-8', 'surrogateescape')
773
+ if results.get('trusted_timestamps'):
774
+ hash_to_set['trusted_timestamps'] = pickle.dumps(results['trusted_timestamps'])
735
775
  if 'children' in results and results['children'] is not None:
736
776
  padding_length = len(str(len(results['children'])))
737
777
  children = set()
@@ -746,11 +786,10 @@ class LacusCore():
746
786
  hash_to_set['children'] = pickle.dumps(children)
747
787
 
748
788
  for key in results.keys():
749
- if key in ['har', 'cookies', 'storage', 'potential_favicons', 'html', 'children'] or not results.get(key):
789
+ if key in ['har', 'cookies', 'storage', 'trusted_timestamps', 'potential_favicons', 'html', 'children'] or not results.get(key):
750
790
  continue
751
791
  # these entries can be stored directly
752
792
  hash_to_set[key] = results[key] # type: ignore[literal-required]
753
-
754
793
  if hash_to_set:
755
794
  pipeline.hset(root_key, mapping=hash_to_set) # type: ignore[arg-type]
756
795
  # Make sure the key expires
@@ -775,6 +814,8 @@ class LacusCore():
775
814
  to_return['storage'] = pickle.loads(value)
776
815
  elif key == b'potential_favicons':
777
816
  to_return['potential_favicons'] = pickle.loads(value)
817
+ elif key == b'trusted_timestamps':
818
+ to_return['trusted_timestamps'] = pickle.loads(value)
778
819
  elif key == b'children':
779
820
  to_return['children'] = []
780
821
  for child_root_key in sorted(pickle.loads(value)):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: lacuscore
3
- Version: 1.17.2
3
+ Version: 1.17.4
4
4
  Summary: Core of Lacus, usable as a module
5
5
  License: BSD-3-Clause
6
6
  Author: Raphaël Vinot
@@ -24,9 +24,10 @@ Provides-Extra: docs
24
24
  Requires-Dist: Sphinx (>=8.2.3) ; (python_version >= "3.11") and (extra == "docs")
25
25
  Requires-Dist: async-timeout (>=5.0.1) ; python_version < "3.11"
26
26
  Requires-Dist: defang (>=0.5.3)
27
- Requires-Dist: dnspython (>=2.7.0)
27
+ Requires-Dist: dnspython (<2.8) ; python_version < "3.10"
28
+ Requires-Dist: dnspython (>=2.8.0) ; python_version >= "3.10"
28
29
  Requires-Dist: eval-type-backport (>=0.2.2) ; python_version < "3.10"
29
- Requires-Dist: playwrightcapture[recaptcha] (>=1.32.1)
30
+ Requires-Dist: playwrightcapture[recaptcha] (>=1.32.2)
30
31
  Requires-Dist: pydantic (>=2.11.7)
31
32
  Requires-Dist: redis[hiredis] (>=5.3.0,<6.0.0)
32
33
  Requires-Dist: requests (>=2.32.5)
@@ -0,0 +1,10 @@
1
+ lacuscore/__init__.py,sha256=aLBshQPT9IBDKn5qWrX9A_exqtLFPyLsQiPWdfpAFjA,537
2
+ lacuscore/helpers.py,sha256=E_pgZ6vgtOkb_NCIjgy1c041CSy00arL739hPHVxlS8,14184
3
+ lacuscore/lacus_monitoring.py,sha256=r6IaYuh6sMq43eOWdZx0fU8p4PWVZlqSD6nr6yOaTUU,2713
4
+ lacuscore/lacuscore.py,sha256=jrFGBFSYTWRquZsZx7KQAfDc8KTT_nfu7-c6mnrh-pk,48275
5
+ lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ lacuscore/task_logger.py,sha256=2wDotU6r6vn-aKO8nZNdxSuisSj11LlcxuvW60qPL0Y,1909
7
+ lacuscore-1.17.4.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
8
+ lacuscore-1.17.4.dist-info/METADATA,sha256=LCRvwvOmSz65zlMO1iWUjpJ5lGnPVhGm8uRQ1juGBm4,2739
9
+ lacuscore-1.17.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
+ lacuscore-1.17.4.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- lacuscore/__init__.py,sha256=aLBshQPT9IBDKn5qWrX9A_exqtLFPyLsQiPWdfpAFjA,537
2
- lacuscore/helpers.py,sha256=dTt-FM7SnwEgIeGTCAvREwa1K224iNb16mmaCwcY_eg,14096
3
- lacuscore/lacus_monitoring.py,sha256=r6IaYuh6sMq43eOWdZx0fU8p4PWVZlqSD6nr6yOaTUU,2713
4
- lacuscore/lacuscore.py,sha256=fU8rojUgdxHlLjMwqDcZetwMBPVEVAS341g4xcusK4g,45682
5
- lacuscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- lacuscore/task_logger.py,sha256=2wDotU6r6vn-aKO8nZNdxSuisSj11LlcxuvW60qPL0Y,1909
7
- lacuscore-1.17.2.dist-info/LICENSE,sha256=4C4hLYrIkUD96Ggk-y_Go1Qf7PBZrEm9PSeTGe2nd4s,1516
8
- lacuscore-1.17.2.dist-info/METADATA,sha256=t83EragMDEZzYsfX53l1bGbiiItE8hUTVM6nm7ESc70,2654
9
- lacuscore-1.17.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
10
- lacuscore-1.17.2.dist-info/RECORD,,