lacuscore 1.11.3__tar.gz → 1.12.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lacuscore
3
- Version: 1.11.3
3
+ Version: 1.12.1
4
4
  Summary: Core of Lacus, usable as a module
5
5
  Home-page: https://github.com/ail-project/LacusCore
6
6
  License: BSD-3-Clause
7
7
  Author: Raphaël Vinot
8
8
  Author-email: raphael.vinot@circl.lu
9
- Requires-Python: >=3.8,<4.0
9
+ Requires-Python: >=3.9,<4.0
10
10
  Classifier: Development Status :: 5 - Production/Stable
11
11
  Classifier: Environment :: Console
12
12
  Classifier: Intended Audience :: Information Technology
@@ -15,7 +15,6 @@ Classifier: Intended Audience :: Telecommunications Industry
15
15
  Classifier: License :: OSI Approved :: BSD License
16
16
  Classifier: Operating System :: POSIX :: Linux
17
17
  Classifier: Programming Language :: Python :: 3
18
- Classifier: Programming Language :: Python :: 3.8
19
18
  Classifier: Programming Language :: Python :: 3.9
20
19
  Classifier: Programming Language :: Python :: 3.10
21
20
  Classifier: Programming Language :: Python :: 3.11
@@ -24,15 +23,13 @@ Classifier: Programming Language :: Python :: 3.13
24
23
  Classifier: Topic :: Internet
25
24
  Classifier: Topic :: Security
26
25
  Provides-Extra: docs
27
- Requires-Dist: Sphinx (<7.2) ; (python_version < "3.9") and (extra == "docs")
28
26
  Requires-Dist: Sphinx (>=7.2,<8.0) ; (python_version >= "3.9" and python_version < "3.10") and (extra == "docs")
29
27
  Requires-Dist: Sphinx (>=8,<9) ; (python_version >= "3.10") and (extra == "docs")
30
28
  Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
31
29
  Requires-Dist: defang (>=0.5.3,<0.6.0)
32
- Requires-Dist: dnspython (<2.7) ; python_version < "3.9"
33
- Requires-Dist: dnspython (>=2.7,<3.0) ; python_version >= "3.9"
30
+ Requires-Dist: dnspython (>=2.7.0,<3.0.0) ; python_version >= "3.9"
34
31
  Requires-Dist: eval-type-backport (>=0.2.0,<0.3.0) ; python_version < "3.10"
35
- Requires-Dist: playwrightcapture[recaptcha] (>=1.26.3,<2.0.0)
32
+ Requires-Dist: playwrightcapture[recaptcha] (>=1.27.0,<2.0.0)
36
33
  Requires-Dist: pydantic (>=2.9.2,<3.0.0)
37
34
  Requires-Dist: redis[hiredis] (>=5.2.0,<6.0.0)
38
35
  Requires-Dist: requests (>=2.32.3,<3.0.0)
@@ -6,7 +6,8 @@ import json
6
6
 
7
7
  from enum import IntEnum, unique
8
8
  from logging import LoggerAdapter
9
- from typing import MutableMapping, Any, TypedDict, Mapping, Literal
9
+ from typing import Any, TypedDict, Literal
10
+ from collections.abc import MutableMapping, Mapping
10
11
 
11
12
  from defang import refang # type: ignore[import-untyped]
12
13
  from pydantic import BaseModel, field_validator, model_validator, ValidationError
@@ -107,6 +108,7 @@ class CaptureSettings(BaseModel):
107
108
  force: bool = False
108
109
  recapture_interval: int = 300
109
110
  priority: int = 0
111
+ max_retries: int | None = None
110
112
  uuid: str | None = None
111
113
 
112
114
  depth: int = 0
@@ -19,7 +19,8 @@ from base64 import b64decode, b64encode
19
19
  from datetime import date, timedelta
20
20
  from ipaddress import ip_address, IPv4Address, IPv6Address
21
21
  from tempfile import NamedTemporaryFile
22
- from typing import Literal, Any, overload, cast, Iterator
22
+ from typing import Literal, Any, overload, cast
23
+ from collections.abc import Iterator
23
24
  from uuid import uuid4
24
25
  from urllib.parse import urlsplit
25
26
 
@@ -80,7 +81,7 @@ class LacusCore():
80
81
  """Capture URLs or web enabled documents using PlaywrightCapture.
81
82
 
82
83
  :param redis_connector: Pre-configured connector to a redis instance.
83
- :param max_capture time: If the capture takes more than that time, break (in seconds)
84
+ :param max_capture_time: If the capture takes more than that time, break (in seconds)
84
85
  :param tor_proxy: URL to a SOCKS 5 tor proxy. If you have tor installed, this is the default: socks5://127.0.0.1:9050.
85
86
  :param only_global_lookups: Discard captures that point to non-public IPs.
86
87
  :param max_retries: How many times should we re-try a capture if it failed.
@@ -137,6 +138,7 @@ class LacusCore():
137
138
  rendered_hostname_only: bool=True,
138
139
  with_favicon: bool=False,
139
140
  allow_tracking: bool=False,
141
+ max_retries: int | None=None,
140
142
  force: bool=False,
141
143
  recapture_interval: int=300,
142
144
  priority: int=0,
@@ -166,6 +168,7 @@ class LacusCore():
166
168
  rendered_hostname_only: bool=True,
167
169
  with_favicon: bool=False,
168
170
  allow_tracking: bool=False,
171
+ max_retries: int | None=None,
169
172
  force: bool=False,
170
173
  recapture_interval: int=300,
171
174
  priority: int=0,
@@ -197,6 +200,8 @@ class LacusCore():
197
200
  :param rendered_hostname_only: If depth > 0: only capture URLs with the same hostname as the rendered page
198
201
  :param with_favicon: If True, PlaywrightCapture will attempt to get the potential favicons for the rendered URL. It is a dirty trick, see this issue for details: https://github.com/Lookyloo/PlaywrightCapture/issues/45
199
202
  :param allow_tracking: If True, PlaywrightCapture will attempt to click through the cookie banners. It is totally dependent on the framework used on the website.
203
+ :param max_retries: The maximum number of retries for this capture
204
+
200
205
  :param force: Force recapture, even if the same one was already done within the recapture_interval
201
206
  :param recapture_interval: The time the enqueued settings are kept in memory to avoid duplicates
202
207
  :param priority: The priority of the capture
@@ -215,7 +220,7 @@ class LacusCore():
215
220
  'timezone_id': timezone_id, 'locale': locale,
216
221
  'color_scheme': color_scheme, 'java_script_enabled': java_script_enabled,
217
222
  'viewport': viewport, 'referer': referer, 'with_favicon': with_favicon,
218
- 'allow_tracking': allow_tracking}
223
+ 'allow_tracking': allow_tracking, 'max_retries': max_retries}
219
224
 
220
225
  try:
221
226
  to_enqueue = CaptureSettings(**settings)
@@ -371,6 +376,10 @@ class LacusCore():
371
376
  logger.warning(f'Settings invalid: {e}')
372
377
  raise CaptureSettingsError('Invalid settings', e)
373
378
 
379
+ # If the class is initialized with max_retries below the one provided in the settings, we use the lowest value
380
+ # NOTE: make sure the variable is initialized *before* we raise any RetryCapture
381
+ max_retries = min([to_capture.max_retries, self.max_retries]) if to_capture.max_retries is not None else self.max_retries
382
+
374
383
  if to_capture.document:
375
384
  # we do not have a URL yet.
376
385
  document_as_bytes = b64decode(to_capture.document)
@@ -544,25 +553,29 @@ class LacusCore():
544
553
  # this is a retry that worked
545
554
  stats_pipeline.sadd(f'stats:{today}:retry_success', url)
546
555
  except RetryCapture:
547
- # Check if we already re-tried this capture
548
- _current_retry = self.redis.get(f'lacus:capture_retry:{uuid}')
549
- if _current_retry is None:
550
- # No retry yet
551
- logger.debug(f'Retrying {url} for the first time.')
552
- retry = True
553
- self.redis.setex(f'lacus:capture_retry:{uuid}',
554
- self.max_capture_time * (self.max_retries + 10),
555
- self.max_retries)
556
+ if max_retries == 0:
557
+ error_msg = result['error'] if result.get('error') else 'Unknown error'
558
+ logger.info(f'Retries disabled for {url}: {error_msg}')
556
559
  else:
557
- current_retry = int(_current_retry.decode())
558
- if current_retry > 0:
559
- logger.debug(f'Retrying {url} for the {self.max_retries - current_retry + 1}th time.')
560
- self.redis.decr(f'lacus:capture_retry:{uuid}')
560
+ # Check if we already re-tried this capture
561
+ _current_retry = self.redis.get(f'lacus:capture_retry:{uuid}')
562
+ if _current_retry is None:
563
+ # No retry yet
564
+ logger.debug(f'Retrying {url} for the first time.')
561
565
  retry = True
566
+ self.redis.setex(f'lacus:capture_retry:{uuid}',
567
+ self.max_capture_time * (max_retries + 10),
568
+ max_retries - 1)
562
569
  else:
563
- error_msg = result['error'] if result.get('error') else 'Unknown error'
564
- logger.info(f'Retried too many times {url}: {error_msg}')
565
- stats_pipeline.sadd(f'stats:{today}:retry_failed', url)
570
+ current_retry = int(_current_retry.decode())
571
+ if current_retry > 0:
572
+ logger.debug(f'Retrying {url} for the {max_retries - current_retry + 1} time.')
573
+ self.redis.decr(f'lacus:capture_retry:{uuid}')
574
+ retry = True
575
+ else:
576
+ error_msg = result['error'] if result.get('error') else 'Unknown error'
577
+ logger.info(f'Retried too many times {url}: {error_msg}')
578
+ stats_pipeline.sadd(f'stats:{today}:retry_failed', url)
566
579
  except CaptureError:
567
580
  if not result:
568
581
  result = {'error': "No result key, shouldn't happen"}
@@ -2,7 +2,8 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import Any, Coroutine, Optional, TypeVar, Tuple
5
+ from typing import Any, TypeVar
6
+ from collections.abc import Coroutine
6
7
 
7
8
  import asyncio
8
9
  import functools
@@ -18,12 +19,12 @@ def create_task(
18
19
  coroutine: Coroutine[Any, Any, T],
19
20
  *,
20
21
  name: str,
21
- logger: 'LacusCoreLogAdapter',
22
+ logger: LacusCoreLogAdapter,
22
23
  message: str,
23
- message_args: Tuple[Any, ...] = (),
24
- loop: Optional[asyncio.AbstractEventLoop] = None,
24
+ message_args: tuple[Any, ...] = (),
25
+ loop: asyncio.AbstractEventLoop | None = None,
25
26
 
26
- ) -> 'asyncio.Task[T]': # This type annotation has to be quoted for Python < 3.9, see https://www.python.org/dev/peps/pep-0585/
27
+ ) -> asyncio.Task[T]: # Quoting this annotation was only required for Python < 3.9 (no longer supported), see https://www.python.org/dev/peps/pep-0585/
27
28
  '''
28
29
  This helper function wraps a ``loop.create_task(coroutine())`` call and ensures there is
29
30
  an exception handler added to the resulting task. If the task raises an exception it is logged
@@ -42,9 +43,9 @@ def create_task(
42
43
  def _handle_task_result(
43
44
  task: asyncio.Task[Any],
44
45
  *,
45
- logger: 'LacusCoreLogAdapter',
46
+ logger: LacusCoreLogAdapter,
46
47
  message: str,
47
- message_args: Tuple[Any, ...] = (),
48
+ message_args: tuple[Any, ...] = (),
48
49
  ) -> None:
49
50
  try:
50
51
  task.result()
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "lacuscore"
3
- version = "1.11.3"
3
+ version = "1.12.1"
4
4
  description = "Core of Lacus, usable as a module"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -18,7 +18,6 @@ classifiers = [
18
18
  'Intended Audience :: Telecommunications Industry',
19
19
  'Intended Audience :: Information Technology',
20
20
  'Programming Language :: Python :: 3',
21
- 'Programming Language :: Python :: 3.8',
22
21
  'Programming Language :: Python :: 3.9',
23
22
  'Programming Language :: Python :: 3.10',
24
23
  'Programming Language :: Python :: 3.11',
@@ -29,21 +28,17 @@ classifiers = [
29
28
  ]
30
29
 
31
30
  [tool.poetry.dependencies]
32
- python = "^3.8"
31
+ python = "^3.9"
33
32
  requests = "^2.32.3"
34
33
  Sphinx = [
35
- {version = "<7.2", python = "<3.9", optional = true},
36
34
  {version = "^7.2", python = ">=3.9,<3.10", optional = true},
37
35
  {version = "^8", python = ">=3.10", optional = true}
38
36
  ]
39
- playwrightcapture = {extras = ["recaptcha"], version = "^1.26.3"}
37
+ playwrightcapture = {extras = ["recaptcha"], version = "^1.27.0"}
40
38
  defang = "^0.5.3"
41
39
  ua-parser = "^0.18.0"
42
40
  redis = {version = "^5.2.0", extras = ["hiredis"]}
43
- dnspython = [
44
- {version = "<2.7", python = "<3.9"},
45
- {version = "^2.7", python = ">=3.9"}
46
- ]
41
+ dnspython = {version = "^2.7.0", python = ">=3.9"}
47
42
  async-timeout = {version = "^4.0.3", python = "<3.11"}
48
43
  pydantic = "^2.9.2"
49
44
  eval-type-backport = {version = "^0.2.0", python = "<3.10"}
@@ -57,7 +52,6 @@ types-redis = {version = "^4.6.0.20241004"}
57
52
  types-requests = "^2.32.0.20241016"
58
53
  types-beautifulsoup4 = "^4.12.0.20241020"
59
54
  ipython = [
60
- {version = "<8.13.0", python = "<3.9"},
61
55
  {version = "^8.18.0", python = ">=3.9"},
62
56
  {version = "^8.19.0", python = ">=3.10"}
63
57
  ]
File without changes
File without changes