lacuscore 1.11.3__tar.gz → 1.12.0__tar.gz

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lacuscore
3
- Version: 1.11.3
3
+ Version: 1.12.0
4
4
  Summary: Core of Lacus, usable as a module
5
5
  Home-page: https://github.com/ail-project/LacusCore
6
6
  License: BSD-3-Clause
7
7
  Author: Raphaël Vinot
8
8
  Author-email: raphael.vinot@circl.lu
9
- Requires-Python: >=3.8,<4.0
9
+ Requires-Python: >=3.9,<4.0
10
10
  Classifier: Development Status :: 5 - Production/Stable
11
11
  Classifier: Environment :: Console
12
12
  Classifier: Intended Audience :: Information Technology
@@ -15,7 +15,6 @@ Classifier: Intended Audience :: Telecommunications Industry
15
15
  Classifier: License :: OSI Approved :: BSD License
16
16
  Classifier: Operating System :: POSIX :: Linux
17
17
  Classifier: Programming Language :: Python :: 3
18
- Classifier: Programming Language :: Python :: 3.8
19
18
  Classifier: Programming Language :: Python :: 3.9
20
19
  Classifier: Programming Language :: Python :: 3.10
21
20
  Classifier: Programming Language :: Python :: 3.11
@@ -24,15 +23,13 @@ Classifier: Programming Language :: Python :: 3.13
24
23
  Classifier: Topic :: Internet
25
24
  Classifier: Topic :: Security
26
25
  Provides-Extra: docs
27
- Requires-Dist: Sphinx (<7.2) ; (python_version < "3.9") and (extra == "docs")
28
26
  Requires-Dist: Sphinx (>=7.2,<8.0) ; (python_version >= "3.9" and python_version < "3.10") and (extra == "docs")
29
27
  Requires-Dist: Sphinx (>=8,<9) ; (python_version >= "3.10") and (extra == "docs")
30
28
  Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
31
29
  Requires-Dist: defang (>=0.5.3,<0.6.0)
32
- Requires-Dist: dnspython (<2.7) ; python_version < "3.9"
33
- Requires-Dist: dnspython (>=2.7,<3.0) ; python_version >= "3.9"
30
+ Requires-Dist: dnspython (>=2.7.0,<3.0.0) ; python_version >= "3.9"
34
31
  Requires-Dist: eval-type-backport (>=0.2.0,<0.3.0) ; python_version < "3.10"
35
- Requires-Dist: playwrightcapture[recaptcha] (>=1.26.3,<2.0.0)
32
+ Requires-Dist: playwrightcapture[recaptcha] (>=1.27.0,<2.0.0)
36
33
  Requires-Dist: pydantic (>=2.9.2,<3.0.0)
37
34
  Requires-Dist: redis[hiredis] (>=5.2.0,<6.0.0)
38
35
  Requires-Dist: requests (>=2.32.3,<3.0.0)
@@ -6,7 +6,8 @@ import json
6
6
 
7
7
  from enum import IntEnum, unique
8
8
  from logging import LoggerAdapter
9
- from typing import MutableMapping, Any, TypedDict, Mapping, Literal
9
+ from typing import Any, TypedDict, Literal
10
+ from collections.abc import MutableMapping, Mapping
10
11
 
11
12
  from defang import refang # type: ignore[import-untyped]
12
13
  from pydantic import BaseModel, field_validator, model_validator, ValidationError
@@ -107,6 +108,7 @@ class CaptureSettings(BaseModel):
107
108
  force: bool = False
108
109
  recapture_interval: int = 300
109
110
  priority: int = 0
111
+ max_retries: int | None = None
110
112
  uuid: str | None = None
111
113
 
112
114
  depth: int = 0
@@ -19,7 +19,8 @@ from base64 import b64decode, b64encode
19
19
  from datetime import date, timedelta
20
20
  from ipaddress import ip_address, IPv4Address, IPv6Address
21
21
  from tempfile import NamedTemporaryFile
22
- from typing import Literal, Any, overload, cast, Iterator
22
+ from typing import Literal, Any, overload, cast
23
+ from collections.abc import Iterator
23
24
  from uuid import uuid4
24
25
  from urllib.parse import urlsplit
25
26
 
@@ -80,7 +81,7 @@ class LacusCore():
80
81
  """Capture URLs or web enabled documents using PlaywrightCapture.
81
82
 
82
83
  :param redis_connector: Pre-configured connector to a redis instance.
83
- :param max_capture time: If the capture takes more than that time, break (in seconds)
84
+ :param max_capture_time: If the capture takes more than that time, break (in seconds)
84
85
  :param tor_proxy: URL to a SOCKS 5 tor proxy. If you have tor installed, this is the default: socks5://127.0.0.1:9050.
85
86
  :param only_global_lookups: Discard captures that point to non-public IPs.
86
87
  :param max_retries: How many times should we re-try a capture if it failed.
@@ -137,6 +138,7 @@ class LacusCore():
137
138
  rendered_hostname_only: bool=True,
138
139
  with_favicon: bool=False,
139
140
  allow_tracking: bool=False,
141
+ max_retries: int | None=None,
140
142
  force: bool=False,
141
143
  recapture_interval: int=300,
142
144
  priority: int=0,
@@ -166,6 +168,7 @@ class LacusCore():
166
168
  rendered_hostname_only: bool=True,
167
169
  with_favicon: bool=False,
168
170
  allow_tracking: bool=False,
171
+ max_retries: int | None=None,
169
172
  force: bool=False,
170
173
  recapture_interval: int=300,
171
174
  priority: int=0,
@@ -197,6 +200,8 @@ class LacusCore():
197
200
  :param rendered_hostname_only: If depth > 0: only capture URLs with the same hostname as the rendered page
198
201
  :param with_favicon: If True, PlaywrightCapture will attempt to get the potential favicons for the rendered URL. It is a dirty trick, see this issue for details: https://github.com/Lookyloo/PlaywrightCapture/issues/45
199
202
  :param allow_tracking: If True, PlaywrightCapture will attempt to click through the cookie banners. It is totally dependent on the framework used on the website.
203
+ :param max_retries: The maximum amount of retries for this capture
204
+
200
205
  :param force: Force recapture, even if the same one was already done within the recapture_interval
201
206
  :param recapture_interval: The time the enqueued settings are kept in memory to avoid duplicates
202
207
  :param priority: The priority of the capture
@@ -215,7 +220,7 @@ class LacusCore():
215
220
  'timezone_id': timezone_id, 'locale': locale,
216
221
  'color_scheme': color_scheme, 'java_script_enabled': java_script_enabled,
217
222
  'viewport': viewport, 'referer': referer, 'with_favicon': with_favicon,
218
- 'allow_tracking': allow_tracking}
223
+ 'allow_tracking': allow_tracking, 'max_retries': max_retries}
219
224
 
220
225
  try:
221
226
  to_enqueue = CaptureSettings(**settings)
@@ -469,6 +474,8 @@ class LacusCore():
469
474
  cookie['path'] = '/'
470
475
  cookies.append(cookie)
471
476
 
477
+ # Use the lower of the max_retries provided in the settings and the one the class was initialized with; if the settings provide none, use the class value
478
+ max_retries = min([to_capture.max_retries, self.max_retries]) if to_capture.max_retries is not None else self.max_retries
472
479
  try:
473
480
  logger.debug(f'Capturing {url}')
474
481
  stats_pipeline.sadd(f'stats:{today}:captures', url)
@@ -544,25 +551,29 @@ class LacusCore():
544
551
  # this is a retry that worked
545
552
  stats_pipeline.sadd(f'stats:{today}:retry_success', url)
546
553
  except RetryCapture:
547
- # Check if we already re-tried this capture
548
- _current_retry = self.redis.get(f'lacus:capture_retry:{uuid}')
549
- if _current_retry is None:
550
- # No retry yet
551
- logger.debug(f'Retrying {url} for the first time.')
552
- retry = True
553
- self.redis.setex(f'lacus:capture_retry:{uuid}',
554
- self.max_capture_time * (self.max_retries + 10),
555
- self.max_retries)
554
+ if max_retries == 0:
555
+ error_msg = result['error'] if result.get('error') else 'Unknown error'
556
+ logger.info(f'Retries disabled for {url}: {error_msg}')
556
557
  else:
557
- current_retry = int(_current_retry.decode())
558
- if current_retry > 0:
559
- logger.debug(f'Retrying {url} for the {self.max_retries - current_retry + 1}th time.')
560
- self.redis.decr(f'lacus:capture_retry:{uuid}')
558
+ # Check if we already re-tried this capture
559
+ _current_retry = self.redis.get(f'lacus:capture_retry:{uuid}')
560
+ if _current_retry is None:
561
+ # No retry yet
562
+ logger.debug(f'Retrying {url} for the first time.')
561
563
  retry = True
564
+ self.redis.setex(f'lacus:capture_retry:{uuid}',
565
+ self.max_capture_time * (max_retries + 10),
566
+ max_retries - 1)
562
567
  else:
563
- error_msg = result['error'] if result.get('error') else 'Unknown error'
564
- logger.info(f'Retried too many times {url}: {error_msg}')
565
- stats_pipeline.sadd(f'stats:{today}:retry_failed', url)
568
+ current_retry = int(_current_retry.decode())
569
+ if current_retry > 0:
570
+ logger.debug(f'Retrying {url} for the {max_retries - current_retry + 1} time.')
571
+ self.redis.decr(f'lacus:capture_retry:{uuid}')
572
+ retry = True
573
+ else:
574
+ error_msg = result['error'] if result.get('error') else 'Unknown error'
575
+ logger.info(f'Retried too many times {url}: {error_msg}')
576
+ stats_pipeline.sadd(f'stats:{today}:retry_failed', url)
566
577
  except CaptureError:
567
578
  if not result:
568
579
  result = {'error': "No result key, shouldn't happen"}
@@ -2,7 +2,8 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import Any, Coroutine, Optional, TypeVar, Tuple
5
+ from typing import Any, TypeVar
6
+ from collections.abc import Coroutine
6
7
 
7
8
  import asyncio
8
9
  import functools
@@ -18,12 +19,12 @@ def create_task(
18
19
  coroutine: Coroutine[Any, Any, T],
19
20
  *,
20
21
  name: str,
21
- logger: 'LacusCoreLogAdapter',
22
+ logger: LacusCoreLogAdapter,
22
23
  message: str,
23
- message_args: Tuple[Any, ...] = (),
24
- loop: Optional[asyncio.AbstractEventLoop] = None,
24
+ message_args: tuple[Any, ...] = (),
25
+ loop: asyncio.AbstractEventLoop | None = None,
25
26
 
26
- ) -> 'asyncio.Task[T]': # This type annotation has to be quoted for Python < 3.9, see https://www.python.org/dev/peps/pep-0585/
27
+ ) -> asyncio.Task[T]: # No quoting needed here: `from __future__ import annotations` defers evaluation (quoting was only required for Python < 3.9, see https://www.python.org/dev/peps/pep-0585/)
27
28
  '''
28
29
  This helper function wraps a ``loop.create_task(coroutine())`` call and ensures there is
29
30
  an exception handler added to the resulting task. If the task raises an exception it is logged
@@ -42,9 +43,9 @@ def create_task(
42
43
  def _handle_task_result(
43
44
  task: asyncio.Task[Any],
44
45
  *,
45
- logger: 'LacusCoreLogAdapter',
46
+ logger: LacusCoreLogAdapter,
46
47
  message: str,
47
- message_args: Tuple[Any, ...] = (),
48
+ message_args: tuple[Any, ...] = (),
48
49
  ) -> None:
49
50
  try:
50
51
  task.result()
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "lacuscore"
3
- version = "1.11.3"
3
+ version = "1.12.0"
4
4
  description = "Core of Lacus, usable as a module"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -18,7 +18,6 @@ classifiers = [
18
18
  'Intended Audience :: Telecommunications Industry',
19
19
  'Intended Audience :: Information Technology',
20
20
  'Programming Language :: Python :: 3',
21
- 'Programming Language :: Python :: 3.8',
22
21
  'Programming Language :: Python :: 3.9',
23
22
  'Programming Language :: Python :: 3.10',
24
23
  'Programming Language :: Python :: 3.11',
@@ -29,21 +28,17 @@ classifiers = [
29
28
  ]
30
29
 
31
30
  [tool.poetry.dependencies]
32
- python = "^3.8"
31
+ python = "^3.9"
33
32
  requests = "^2.32.3"
34
33
  Sphinx = [
35
- {version = "<7.2", python = "<3.9", optional = true},
36
34
  {version = "^7.2", python = ">=3.9,<3.10", optional = true},
37
35
  {version = "^8", python = ">=3.10", optional = true}
38
36
  ]
39
- playwrightcapture = {extras = ["recaptcha"], version = "^1.26.3"}
37
+ playwrightcapture = {extras = ["recaptcha"], version = "^1.27.0"}
40
38
  defang = "^0.5.3"
41
39
  ua-parser = "^0.18.0"
42
40
  redis = {version = "^5.2.0", extras = ["hiredis"]}
43
- dnspython = [
44
- {version = "<2.7", python = "<3.9"},
45
- {version = "^2.7", python = ">=3.9"}
46
- ]
41
+ dnspython = {version = "^2.7.0", python = ">=3.9"}
47
42
  async-timeout = {version = "^4.0.3", python = "<3.11"}
48
43
  pydantic = "^2.9.2"
49
44
  eval-type-backport = {version = "^0.2.0", python = "<3.10"}
@@ -57,7 +52,6 @@ types-redis = {version = "^4.6.0.20241004"}
57
52
  types-requests = "^2.32.0.20241016"
58
53
  types-beautifulsoup4 = "^4.12.0.20241020"
59
54
  ipython = [
60
- {version = "<8.13.0", python = "<3.9"},
61
55
  {version = "^8.18.0", python = ">=3.9"},
62
56
  {version = "^8.19.0", python = ">=3.10"}
63
57
  ]
File without changes
File without changes