lacuscore 1.9.2__tar.gz → 1.9.3__tar.gz

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lacuscore
3
- Version: 1.9.2
3
+ Version: 1.9.3
4
4
  Summary: Core of Lacus, usable as a module
5
5
  Home-page: https://github.com/ail-project/LacusCore
6
6
  License: BSD-3-Clause
@@ -28,8 +28,8 @@ Requires-Dist: Sphinx (>=7.2,<8.0) ; (python_version >= "3.9") and (extra == "do
28
28
  Requires-Dist: async-timeout (>=4.0.3,<5.0.0) ; python_version < "3.11"
29
29
  Requires-Dist: defang (>=0.5.3,<0.6.0)
30
30
  Requires-Dist: dnspython (>=2.6.1,<3.0.0)
31
- Requires-Dist: playwrightcapture[recaptcha] (>=1.24.5,<2.0.0)
32
- Requires-Dist: redis[hiredis] (>=5.0.3,<6.0.0)
31
+ Requires-Dist: playwrightcapture[recaptcha] (>=1.24.6,<2.0.0)
32
+ Requires-Dist: redis[hiredis] (>=5.0.4,<6.0.0)
33
33
  Requires-Dist: requests (>=2.31.0,<3.0.0)
34
34
  Requires-Dist: ua-parser (>=0.18.0,<0.19.0)
35
35
  Project-URL: Documentation, https://lacuscore.readthedocs.io/en/latest/
@@ -1,4 +1,5 @@
1
- from .lacuscore import LacusCore, CaptureStatus, CaptureResponse, CaptureResponseJson, CaptureSettings # noqa
1
+ from .lacuscore import LacusCore
2
+ from .helpers import CaptureStatus, CaptureResponse, CaptureResponseJson, CaptureSettings # noqa
2
3
  from .lacus_monitoring import LacusCoreMonitoring # noqa
3
4
 
4
5
  __all__ = [
@@ -0,0 +1,102 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from __future__ import annotations
4
+
5
+ from enum import IntEnum, unique
6
+ from logging import LoggerAdapter
7
+ from typing import MutableMapping, Any, TypedDict
8
+
9
+ from playwrightcapture.capture import CaptureResponse as PlaywrightCaptureResponse
10
+
11
+
12
+ class LacusCoreException(Exception):
13
+ pass
14
+
15
+
16
+ class CaptureError(LacusCoreException):
17
+ pass
18
+
19
+
20
+ class RetryCapture(LacusCoreException):
21
+ pass
22
+
23
+
24
+ class CaptureSettingsError(LacusCoreException):
25
+ pass
26
+
27
+
28
+ class LacusCoreLogAdapter(LoggerAdapter): # type: ignore[type-arg]
29
+ """
30
+ Prepend log entry with the UUID of the capture
31
+ """
32
+ def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> tuple[str, MutableMapping[str, Any]]:
33
+ if self.extra:
34
+ return '[{}] {}'.format(self.extra['uuid'], msg), kwargs
35
+ return msg, kwargs
36
+
37
+
38
+ @unique
39
+ class CaptureStatus(IntEnum):
40
+ '''The status of the capture'''
41
+ UNKNOWN = -1
42
+ QUEUED = 0
43
+ DONE = 1
44
+ ONGOING = 2
45
+
46
+
47
+ class CaptureResponse(PlaywrightCaptureResponse, TypedDict, total=False):
48
+ '''A capture made by Lacus. With the base64 encoded image and downloaded file decoded to bytes.'''
49
+
50
+ # Need to make sure the type is what's expected down the line
51
+ children: list[CaptureResponse] | None # type: ignore[misc]
52
+
53
+ status: int
54
+ runtime: float | None
55
+
56
+
57
+ class CaptureResponseJson(TypedDict, total=False):
58
+ '''A capture made by Lacus. With the base64 encoded image and downloaded file *not* decoded.'''
59
+
60
+ status: int
61
+ last_redirected_url: str | None
62
+ har: dict[str, Any] | None
63
+ cookies: list[dict[str, str]] | None
64
+ error: str | None
65
+ html: str | None
66
+ png: str | None
67
+ downloaded_filename: str | None
68
+ downloaded_file: str | None
69
+ children: list[CaptureResponseJson] | None
70
+ runtime: float | None
71
+ potential_favicons: list[str] | None
72
+
73
+
74
+ class CaptureSettings(TypedDict, total=False):
75
+ '''The capture settings that can be passed to Lacus.'''
76
+
77
+ url: str | None
78
+ document_name: str | None
79
+ document: str | None
80
+ browser: str | None
81
+ device_name: str | None
82
+ user_agent: str | None
83
+ proxy: str | dict[str, str] | None
84
+ general_timeout_in_sec: int | None
85
+ cookies: list[dict[str, Any]] | None
86
+ headers: str | dict[str, str] | None
87
+ http_credentials: dict[str, str] | None
88
+ geolocation: dict[str, float] | None
89
+ timezone_id: str | None
90
+ locale: str | None
91
+ color_scheme: str | None
92
+ viewport: dict[str, int] | None
93
+ referer: str | None
94
+ with_favicon: bool
95
+ allow_tracking: bool
96
+ force: bool
97
+ recapture_interval: int
98
+ priority: int
99
+ uuid: str | None
100
+
101
+ depth: int
102
+ rendered_hostname_only: bool # Note: only used if depth is > 0
@@ -18,12 +18,10 @@ import zlib
18
18
  from asyncio import Task
19
19
  from base64 import b64decode, b64encode
20
20
  from datetime import date, timedelta
21
- from enum import IntEnum, unique
22
21
  from ipaddress import ip_address, IPv4Address, IPv6Address
23
- from logging import LoggerAdapter
24
22
  from pathlib import Path
25
23
  from tempfile import NamedTemporaryFile
26
- from typing import Literal, Any, TypedDict, overload, cast, MutableMapping, Iterator
24
+ from typing import Literal, Any, overload, cast, Iterator
27
25
  from uuid import uuid4
28
26
  from urllib.parse import urlsplit
29
27
 
@@ -33,12 +31,16 @@ from dns.exception import Timeout as DNSTimeout
33
31
 
34
32
  from defang import refang # type: ignore[import-untyped]
35
33
  from playwrightcapture import Capture, PlaywrightCaptureException
36
- from playwrightcapture.capture import CaptureResponse as PlaywrightCaptureResponse
37
34
  from redis import Redis
38
35
  from redis.exceptions import ConnectionError as RedisConnectionError
39
36
  from redis.exceptions import DataError
40
37
  from ua_parser import user_agent_parser # type: ignore[import-untyped]
41
38
 
39
+ from . import task_logger
40
+ from .helpers import (
41
+ LacusCoreLogAdapter, CaptureError, RetryCapture, CaptureSettingsError,
42
+ CaptureStatus, CaptureResponse, CaptureResponseJson, CaptureSettings)
43
+
42
44
  if sys.version_info < (3, 11):
43
45
  from async_timeout import timeout
44
46
  else:
@@ -65,95 +67,6 @@ def _secure_filename(filename: str) -> str:
65
67
  return filename
66
68
 
67
69
 
68
- class LacusCoreException(Exception):
69
- pass
70
-
71
-
72
- class CaptureError(LacusCoreException):
73
- pass
74
-
75
-
76
- class RetryCapture(LacusCoreException):
77
- pass
78
-
79
-
80
- @unique
81
- class CaptureStatus(IntEnum):
82
- '''The status of the capture'''
83
- UNKNOWN = -1
84
- QUEUED = 0
85
- DONE = 1
86
- ONGOING = 2
87
-
88
-
89
- class CaptureResponse(PlaywrightCaptureResponse, TypedDict, total=False):
90
- '''A capture made by Lacus. With the base64 encoded image and downloaded file decoded to bytes.'''
91
-
92
- # Need to make sure the type is what's expected down the line
93
- children: list[CaptureResponse] | None # type: ignore[misc]
94
-
95
- status: int
96
- runtime: float | None
97
-
98
-
99
- class CaptureResponseJson(TypedDict, total=False):
100
- '''A capture made by Lacus. With the base64 encoded image and downloaded file *not* decoded.'''
101
-
102
- status: int
103
- last_redirected_url: str | None
104
- har: dict[str, Any] | None
105
- cookies: list[dict[str, str]] | None
106
- error: str | None
107
- html: str | None
108
- png: str | None
109
- downloaded_filename: str | None
110
- downloaded_file: str | None
111
- children: list[CaptureResponseJson] | None
112
- runtime: float | None
113
- potential_favicons: list[str] | None
114
-
115
-
116
- class CaptureSettings(TypedDict, total=False):
117
- '''The capture settings that can be passed to Lacus.'''
118
-
119
- url: str | None
120
- document_name: str | None
121
- document: str | None
122
- browser: str | None
123
- device_name: str | None
124
- user_agent: str | None
125
- proxy: str | dict[str, str] | None
126
- general_timeout_in_sec: int | None
127
- cookies: list[dict[str, Any]] | None
128
- headers: str | dict[str, str] | None
129
- http_credentials: dict[str, str] | None
130
- geolocation: dict[str, float] | None
131
- timezone_id: str | None
132
- locale: str | None
133
- color_scheme: str | None
134
- viewport: dict[str, int] | None
135
- referer: str | None
136
- with_favicon: bool
137
- allow_tracking: bool
138
- force: bool
139
- recapture_interval: int
140
- priority: int
141
- uuid: str | None
142
-
143
- depth: int
144
- rendered_hostname_only: bool # Note: only used if depth is > 0
145
-
146
-
147
- class LacusCoreLogAdapter(LoggerAdapter): # type: ignore[type-arg]
148
- """
149
- Prepend log entry with the UUID of the capture
150
- """
151
- def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> tuple[str, MutableMapping[str, Any]]:
152
- if self.extra:
153
- return '[{}] {}'.format(self.extra['uuid'], msg), kwargs
154
- return msg, kwargs
155
-
156
-
157
70
  class LacusCore():
158
71
  """Capture URLs or web enabled documents using PlaywrightCapture.
159
72
 
@@ -368,7 +281,7 @@ class LacusCore():
368
281
  p.execute()
369
282
  except DataError:
370
283
  self.master_logger.exception(f'Unable to enqueue: {to_enqueue}')
371
- raise LacusCoreException(f'Unable to enqueue: {to_enqueue}')
284
+ raise CaptureSettingsError(f'Unable to enqueue: {to_enqueue}')
372
285
  return perma_uuid
373
286
 
374
287
  def _encode_response(self, capture: CaptureResponse) -> CaptureResponseJson:
@@ -453,7 +366,10 @@ class LacusCore():
453
366
  max_consume -= 1
454
367
  uuid: str = value[0][0].decode()
455
368
  priority: int = int(value[0][1])
456
- yield asyncio.create_task(self._capture(uuid, priority), name=uuid)
369
+ logger = LacusCoreLogAdapter(self.master_logger, {'uuid': uuid})
370
+ yield task_logger.create_task(self._capture(uuid, priority), name=uuid,
371
+ logger=logger,
372
+ message='Capture raised an uncaught exception')
457
373
 
458
374
  async def _capture(self, uuid: str, priority: int) -> None:
459
375
  """Trigger a specific capture
@@ -508,11 +424,11 @@ class LacusCore():
508
424
  elif k == 'document':
509
425
  document_as_bytes = b64decode(v)
510
426
  else:
511
- raise LacusCoreException(f'Unexpected setting: {k}: {v}')
512
- except LacusCoreException as e:
427
+ raise CaptureSettingsError(f'Unexpected setting: {k}: {v}')
428
+ except CaptureSettingsError as e:
513
429
  raise e
514
430
  except Exception as e:
515
- raise LacusCoreException(f'Error while preparing settings: {e}')
431
+ raise CaptureSettingsError(f'Error while preparing settings: {e}')
516
432
 
517
433
  if not to_capture:
518
434
  all_entries = self.redis.hgetall(f'lacus:capture_settings:{uuid}')
@@ -523,7 +439,7 @@ class LacusCore():
523
439
  # we do not have a URL yet.
524
440
  name = to_capture.pop('document_name', None)
525
441
  if not name:
526
- raise LacusCoreException('No document name provided, settings are invalid')
442
+ raise CaptureSettingsError('No document name provided, settings are invalid')
527
443
  if not Path(name).suffix:
528
444
  # The browser will simply display the file as text if there is no extension.
529
445
  # Just add HTML as a fallback, as it will be the most common one.
@@ -688,7 +604,7 @@ class LacusCore():
688
604
  else:
689
605
  current_retry = int(_current_retry.decode())
690
606
  if current_retry > 0:
691
- logger.debug(f'Retrying {url} for the {self.max_retries-current_retry+1}th time.')
607
+ logger.debug(f'Retrying {url} for the {self.max_retries - current_retry + 1}th time.')
692
608
  self.redis.decr(f'lacus:capture_retry:{uuid}')
693
609
  retry = True
694
610
  else:
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Coroutine, Optional, TypeVar, Tuple
6
+
7
+ import asyncio
8
+ import functools
9
+ import logging
10
+
11
+ from .helpers import LacusCoreLogAdapter
12
+
13
+ T = TypeVar('T')
14
+
15
+ # Code from https://quantlane.com/blog/ensure-asyncio-task-exceptions-get-logged/
16
+
17
+
18
+ def create_task(
19
+ coroutine: Coroutine[Any, Any, T],
20
+ *,
21
+ name: str,
22
+ logger: 'LacusCoreLogAdapter',
23
+ message: str,
24
+ message_args: Tuple[Any, ...] = (),
25
+ loop: Optional[asyncio.AbstractEventLoop] = None,
26
+
27
+ ) -> 'asyncio.Task[T]': # This type annotation has to be quoted for Python < 3.9, see https://www.python.org/dev/peps/pep-0585/
28
+ '''
29
+ This helper function wraps a ``loop.create_task(coroutine())`` call and ensures there is
30
+ an exception handler added to the resulting task. If the task raises an exception it is logged
31
+ using the provided ``logger``, with additional context provided by ``message`` and optionally
32
+ ``message_args``.
33
+ '''
34
+ if loop is None:
35
+ loop = asyncio.get_running_loop()
36
+ task = loop.create_task(coroutine, name=name)
37
+ task.add_done_callback(
38
+ functools.partial(_handle_task_result, logger=logger, message=message, message_args=message_args)
39
+ )
40
+ return task
41
+
42
+
43
+ def _handle_task_result(
44
+ task: asyncio.Task[Any],
45
+ *,
46
+ logger: logging.Logger,
47
+ message: str,
48
+ message_args: Tuple[Any, ...] = (),
49
+ ) -> None:
50
+ try:
51
+ task.result()
52
+ except asyncio.CancelledError:
53
+ pass # Task cancellation should not be logged as an error.
54
+ except asyncio.TimeoutError:
55
+ pass # Timeout is also fine
56
+ # Regarding the pylint ignore: we want to handle all exceptions here so that the result of the task
57
+ # is properly logged. There is no point re-raising the exception in this callback.
58
+ except Exception: # pylint: disable=broad-except
59
+ logger.exception(message, *message_args)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "lacuscore"
3
- version = "1.9.2"
3
+ version = "1.9.3"
4
4
  description = "Core of Lacus, usable as a module"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -34,10 +34,10 @@ Sphinx = [
34
34
  {version = "<7.2", python = "<3.9", optional = true},
35
35
  {version = "^7.2", python = ">=3.9", optional = true}
36
36
  ]
37
- playwrightcapture = {extras = ["recaptcha"], version = "^1.24.5"}
37
+ playwrightcapture = {extras = ["recaptcha"], version = "^1.24.6"}
38
38
  defang = "^0.5.3"
39
39
  ua-parser = "^0.18.0"
40
- redis = {version = "^5.0.3", extras = ["hiredis"]}
40
+ redis = {version = "^5.0.4", extras = ["hiredis"]}
41
41
  dnspython = "^2.6.1"
42
42
  async-timeout = {version = "^4.0.3", python = "<3.11"}
43
43
 
@@ -45,8 +45,8 @@ async-timeout = {version = "^4.0.3", python = "<3.11"}
45
45
  docs = ["Sphinx"]
46
46
 
47
47
  [tool.poetry.group.dev.dependencies]
48
- types-redis = {version = "^4.6.0.20240417"}
49
- mypy = "^1.9.0"
48
+ types-redis = {version = "^4.6.0.20240425"}
49
+ mypy = "^1.10.0"
50
50
  types-requests = "^2.31.0.20240406"
51
51
  types-beautifulsoup4 = "^4.12.0.20240229"
52
52
  ipython = [
@@ -54,7 +54,7 @@ ipython = [
54
54
  {version = "^8.18.0", python = ">=3.9"},
55
55
  {version = "^8.19.0", python = ">=3.10"}
56
56
  ]
57
- pytest = "^8.1.1"
57
+ pytest = "^8.2.0"
58
58
 
59
59
  [build-system]
60
60
  requires = ["poetry_core"]
File without changes
File without changes
File without changes