lacuscore 1.7.8__tar.gz → 1.7.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lacuscore
3
- Version: 1.7.8
3
+ Version: 1.7.10
4
4
  Summary: Core of Lacus, usable as a module
5
5
  Home-page: https://github.com/ail-project/LacusCore
6
6
  License: BSD-3-Clause
@@ -26,7 +26,7 @@ Provides-Extra: docs
26
26
  Requires-Dist: Sphinx (<7.2) ; (python_version < "3.9") and (extra == "docs")
27
27
  Requires-Dist: Sphinx (>=7.2,<8.0) ; (python_version >= "3.9") and (extra == "docs")
28
28
  Requires-Dist: defang (>=0.5.3,<0.6.0)
29
- Requires-Dist: playwrightcapture[recaptcha] (>=1.22.5,<2.0.0)
29
+ Requires-Dist: playwrightcapture[recaptcha] (>=1.22.7,<2.0.0)
30
30
  Requires-Dist: redis[hiredis] (>=5.0.1,<6.0.0)
31
31
  Requires-Dist: requests (>=2.31.0,<3.0.0)
32
32
  Requires-Dist: ua-parser (>=0.18.0,<0.19.0)
@@ -1,2 +1,11 @@
1
1
  from .lacuscore import LacusCore, CaptureStatus, CaptureResponse, CaptureResponseJson, CaptureSettings # noqa
2
2
  from .lacus_monitoring import LacusCoreMonitoring # noqa
3
+
4
+ __all__ = [
5
+ 'LacusCore',
6
+ 'CaptureStatus',
7
+ 'CaptureResponse',
8
+ 'CaptureResponseJson',
9
+ 'CaptureSettings',
10
+ 'LacusCoreMonitoring'
11
+ ]
@@ -1,5 +1,7 @@
1
1
  #!/usr/bin/env python3
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  from typing import List, Tuple, Dict, Optional, Union, Any, Set
4
6
 
5
7
  from datetime import datetime, date
@@ -9,28 +11,28 @@ from redis import Redis
9
11
 
10
12
  class LacusCoreMonitoring():
11
13
 
12
- def __init__(self, redis_connector: Redis):
14
+ def __init__(self, redis_connector: Redis): # type: ignore[type-arg]
13
15
  self.redis = redis_connector
14
16
 
15
17
  def check_redis_up(self) -> bool:
16
18
  return bool(self.redis.ping())
17
19
 
18
- def get_ongoing_captures(self) -> List[Tuple[str, datetime]]:
20
+ def get_ongoing_captures(self) -> list[tuple[str, datetime]]:
19
21
  return [(uuid, datetime.fromtimestamp(timestamp)) for uuid, timestamp in self.redis.zrevrangebyscore('lacus:ongoing', '+Inf', 0, withscores=True)]
20
22
 
21
- def get_capture_settings(self, uuid: str) -> Dict[str, str]:
23
+ def get_capture_settings(self, uuid: str) -> dict[str, str]:
22
24
  return self.redis.hgetall(f'lacus:capture_settings:{uuid}')
23
25
 
24
- def get_enqueued_captures(self) -> List[Tuple[str, float]]:
26
+ def get_enqueued_captures(self) -> list[tuple[str, float]]:
25
27
  return self.redis.zrevrangebyscore('lacus:to_capture', '+Inf', '-Inf', withscores=True)
26
28
 
27
- def get_capture_result(self, uuid: str) -> Optional[str]:
29
+ def get_capture_result(self, uuid: str) -> str | None:
28
30
  return self.redis.get(f'lacus:capture_results:{uuid}')
29
31
 
30
- def get_capture_result_size(self, uuid: str) -> Optional[str]:
32
+ def get_capture_result_size(self, uuid: str) -> str | None:
31
33
  return self.redis.memory_usage(f'lacus:capture_results:{uuid}')
32
34
 
33
- def get_stats(self, d: Optional[Union[datetime, date, str]]=None, /, *, cardinality_only: bool=False) -> Dict[str, Any]:
35
+ def get_stats(self, d: datetime | date | str | None=None, /, *, cardinality_only: bool=False) -> dict[str, Any]:
34
36
  if d is None:
35
37
  _date = date.today().isoformat()
36
38
  elif isinstance(d, str):
@@ -41,7 +43,7 @@ class LacusCoreMonitoring():
41
43
  _date = d.isoformat()
42
44
  else:
43
45
  raise Exception('Invalid type for date ({type(d)})')
44
- to_return: Dict[str, Union[List[Tuple[str, float]], int, Set[str]]] = {}
46
+ to_return: dict[str, list[tuple[str, float]] | int | set[str]] = {}
45
47
  if errors := self.redis.zrevrangebyscore(f'stats:{_date}:errors', '+Inf', 0, withscores=True):
46
48
  to_return['errors'] = errors
47
49
  if cardinality_only:
@@ -1,5 +1,7 @@
1
1
  #!/usr/bin/env python3
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import asyncio
4
6
  import ipaddress
5
7
  import hashlib
@@ -21,7 +23,7 @@ from enum import IntEnum, unique
21
23
  from logging import LoggerAdapter
22
24
  from pathlib import Path
23
25
  from tempfile import NamedTemporaryFile
24
- from typing import Literal, Optional, Union, Dict, List, Any, TypedDict, overload, Tuple, cast, MutableMapping, Iterator
26
+ from typing import Literal, Any, TypedDict, overload, cast, MutableMapping, Iterator
25
27
  from uuid import uuid4
26
28
  from urllib.parse import urlsplit
27
29
 
@@ -30,6 +32,7 @@ from playwrightcapture import Capture, PlaywrightCaptureException
30
32
  from playwrightcapture.capture import CaptureResponse as PlaywrightCaptureResponse
31
33
  from redis import Redis
32
34
  from redis.exceptions import ConnectionError as RedisConnectionError
35
+ from redis.exceptions import DataError
33
36
  from ua_parser import user_agent_parser # type: ignore
34
37
 
35
38
  BROWSER = Literal['chromium', 'firefox', 'webkit']
@@ -78,66 +81,66 @@ class CaptureResponse(PlaywrightCaptureResponse, TypedDict, total=False):
78
81
  '''A capture made by Lacus. With the base64 encoded image and downloaded file decoded to bytes.'''
79
82
 
80
83
  # Need to make sure the type is what's expected down the line
81
- children: Optional[List['CaptureResponse']] # type: ignore
84
+ children: list[CaptureResponse] | None # type: ignore
82
85
 
83
86
  status: int
84
- runtime: Optional[float]
87
+ runtime: float | None
85
88
 
86
89
 
87
90
  class CaptureResponseJson(TypedDict, total=False):
88
91
  '''A capture made by Lacus. With the base64 encoded image and downloaded file *not* decoded.'''
89
92
 
90
93
  status: int
91
- last_redirected_url: Optional[str]
92
- har: Optional[Dict[str, Any]]
93
- cookies: Optional[List[Dict[str, str]]]
94
- error: Optional[str]
95
- html: Optional[str]
96
- png: Optional[str]
97
- downloaded_filename: Optional[str]
98
- downloaded_file: Optional[str]
99
- children: Optional[List['CaptureResponseJson']]
100
- runtime: Optional[float]
101
- potential_favicons: Optional[List[str]]
94
+ last_redirected_url: str | None
95
+ har: dict[str, Any] | None
96
+ cookies: list[dict[str, str]] | None
97
+ error: str | None
98
+ html: str | None
99
+ png: str | None
100
+ downloaded_filename: str | None
101
+ downloaded_file: str | None
102
+ children: list[CaptureResponseJson] | None
103
+ runtime: float | None
104
+ potential_favicons: list[str] | None
102
105
 
103
106
 
104
107
  class CaptureSettings(TypedDict, total=False):
105
108
  '''The capture settings that can be passed to Lacus.'''
106
109
 
107
- url: Optional[str]
108
- document_name: Optional[str]
109
- document: Optional[str]
110
- browser: Optional[str]
111
- device_name: Optional[str]
112
- user_agent: Optional[str]
113
- proxy: Optional[Union[str, Dict[str, str]]]
114
- general_timeout_in_sec: Optional[int]
115
- cookies: Optional[List[Dict[str, Any]]]
116
- headers: Optional[Union[str, Dict[str, str]]]
117
- http_credentials: Optional[Dict[str, str]]
118
- geolocation: Optional[Dict[str, float]]
119
- timezone_id: Optional[str]
120
- locale: Optional[str]
121
- color_scheme: Optional[str]
122
- viewport: Optional[Dict[str, int]]
123
- referer: Optional[str]
110
+ url: str | None
111
+ document_name: str | None
112
+ document: str | None
113
+ browser: str | None
114
+ device_name: str | None
115
+ user_agent: str | None
116
+ proxy: str | dict[str, str] | None
117
+ general_timeout_in_sec: int | None
118
+ cookies: list[dict[str, Any]] | None
119
+ headers: str | dict[str, str] | None
120
+ http_credentials: dict[str, str] | None
121
+ geolocation: dict[str, float] | None
122
+ timezone_id: str | None
123
+ locale: str | None
124
+ color_scheme: str | None
125
+ viewport: dict[str, int] | None
126
+ referer: str | None
124
127
  with_favicon: bool
125
- force: Optional[bool]
126
- recapture_interval: Optional[int]
127
- priority: Optional[int]
128
- uuid: Optional[str]
128
+ force: bool | None
129
+ recapture_interval: int | None
130
+ priority: int | None
131
+ uuid: str | None
129
132
 
130
133
  depth: int
131
134
  rendered_hostname_only: bool # Note: only used if depth is > 0
132
135
 
133
136
 
134
- class LacusCoreLogAdapter(LoggerAdapter):
137
+ class LacusCoreLogAdapter(LoggerAdapter): # type: ignore[type-arg]
135
138
  """
136
139
  Prepend log entry with the UUID of the capture
137
140
  """
138
- def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> Tuple[str, MutableMapping[str, Any]]:
141
+ def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> tuple[str, MutableMapping[str, Any]]:
139
142
  if self.extra:
140
- return '[%s] %s' % (self.extra['uuid'], msg), kwargs
143
+ return '[{}] {}'.format(self.extra['uuid'], msg), kwargs
141
144
  return msg, kwargs
142
145
 
143
146
 
@@ -151,9 +154,9 @@ class LacusCore():
151
154
  :param max_retries: How many times should we re-try a capture if it failed.
152
155
  """
153
156
 
154
- def __init__(self, redis_connector: Redis, /, *,
157
+ def __init__(self, redis_connector: Redis, /, *, # type: ignore[type-arg]
155
158
  max_capture_time: int=3600,
156
- tor_proxy: Optional[str]=None,
159
+ tor_proxy: str | None=None,
157
160
  only_global_lookups: bool=True,
158
161
  max_retries: int=3,
159
162
  loglevel: str='INFO') -> None:
@@ -166,7 +169,7 @@ class LacusCore():
166
169
  self.max_retries = max_retries
167
170
 
168
171
  # NOTE: Remove in 1.8.* - clear old ongoing captures queue in case of need
169
- if self.redis.type('lacus:ongoing') in ['set', b'set']:
172
+ if self.redis.type('lacus:ongoing') in ['set', b'set']: # type: ignore[no-untyped-call]
170
173
  self.redis.delete('lacus:ongoing')
171
174
 
172
175
  def check_redis_up(self) -> bool:
@@ -174,60 +177,60 @@ class LacusCore():
174
177
  return bool(self.redis.ping())
175
178
 
176
179
  @overload
177
- def enqueue(self, *, settings: Optional[CaptureSettings]=None) -> str:
180
+ def enqueue(self, *, settings: CaptureSettings | None=None) -> str:
178
181
  ...
179
182
 
180
183
  @overload
181
184
  def enqueue(self, *,
182
- url: Optional[str]=None,
183
- document_name: Optional[str]=None, document: Optional[str]=None,
185
+ url: str | None=None,
186
+ document_name: str | None=None, document: str | None=None,
184
187
  depth: int=0,
185
- browser: Optional[BROWSER]=None, device_name: Optional[str]=None,
186
- user_agent: Optional[str]=None,
187
- proxy: Optional[Union[str, Dict[str, str]]]=None,
188
- general_timeout_in_sec: Optional[int]=None,
189
- cookies: Optional[List[Dict[str, Any]]]=None,
190
- headers: Optional[Union[str, Dict[str, str]]]=None,
191
- http_credentials: Optional[Dict[str, str]]=None,
192
- geolocation: Optional[Dict[str, float]]=None,
193
- timezone_id: Optional[str]=None,
194
- locale: Optional[str]=None,
195
- color_scheme: Optional[str]=None,
196
- viewport: Optional[Dict[str, int]]=None,
197
- referer: Optional[str]=None,
188
+ browser: BROWSER | None=None, device_name: str | None=None,
189
+ user_agent: str | None=None,
190
+ proxy: str | dict[str, str] | None=None,
191
+ general_timeout_in_sec: int | None=None,
192
+ cookies: list[dict[str, Any]] | None=None,
193
+ headers: str | dict[str, str] | None=None,
194
+ http_credentials: dict[str, str] | None=None,
195
+ geolocation: dict[str, float] | None=None,
196
+ timezone_id: str | None=None,
197
+ locale: str | None=None,
198
+ color_scheme: str | None=None,
199
+ viewport: dict[str, int] | None=None,
200
+ referer: str | None=None,
198
201
  rendered_hostname_only: bool=True,
199
202
  with_favicon: bool=False,
200
203
  force: bool=False,
201
204
  recapture_interval: int=300,
202
205
  priority: int=0,
203
- uuid: Optional[str]=None
206
+ uuid: str | None=None
204
207
  ) -> str:
205
208
  ...
206
209
 
207
210
  def enqueue(self, *,
208
- settings: Optional[CaptureSettings]=None,
209
- url: Optional[str]=None,
210
- document_name: Optional[str]=None, document: Optional[str]=None,
211
+ settings: CaptureSettings | None=None,
212
+ url: str | None=None,
213
+ document_name: str | None=None, document: str | None=None,
211
214
  depth: int=0,
212
- browser: Optional[BROWSER]=None, device_name: Optional[str]=None,
213
- user_agent: Optional[str]=None,
214
- proxy: Optional[Union[str, Dict[str, str]]]=None,
215
- general_timeout_in_sec: Optional[int]=None,
216
- cookies: Optional[List[Dict[str, Any]]]=None,
217
- headers: Optional[Union[str, Dict[str, str]]]=None,
218
- http_credentials: Optional[Dict[str, str]]=None,
219
- geolocation: Optional[Dict[str, float]]=None,
220
- timezone_id: Optional[str]=None,
221
- locale: Optional[str]=None,
222
- color_scheme: Optional[str]=None,
223
- viewport: Optional[Dict[str, int]]=None,
224
- referer: Optional[str]=None,
215
+ browser: BROWSER | None=None, device_name: str | None=None,
216
+ user_agent: str | None=None,
217
+ proxy: str | dict[str, str] | None=None,
218
+ general_timeout_in_sec: int | None=None,
219
+ cookies: list[dict[str, Any]] | None=None,
220
+ headers: str | dict[str, str] | None=None,
221
+ http_credentials: dict[str, str] | None=None,
222
+ geolocation: dict[str, float] | None=None,
223
+ timezone_id: str | None=None,
224
+ locale: str | None=None,
225
+ color_scheme: str | None=None,
226
+ viewport: dict[str, int] | None=None,
227
+ referer: str | None=None,
225
228
  rendered_hostname_only: bool=True,
226
229
  with_favicon: bool=False,
227
230
  force: bool=False,
228
231
  recapture_interval: int=300,
229
232
  priority: int=0,
230
- uuid: Optional[str]=None
233
+ uuid: str | None=None
231
234
  ) -> str:
232
235
  """Enqueue settings.
233
236
 
@@ -326,7 +329,7 @@ class LacusCore():
326
329
  else:
327
330
  perma_uuid = str(uuid4())
328
331
 
329
- mapping_capture: Dict[str, Union[bytes, float, int, str]] = {}
332
+ mapping_capture: dict[str, bytes | float | int | str] = {}
330
333
  for key, value in to_enqueue.items():
331
334
  if isinstance(value, bool):
332
335
  mapping_capture[key] = 1 if value else 0
@@ -339,8 +342,12 @@ class LacusCore():
339
342
  p = self.redis.pipeline()
340
343
  p.set(f'lacus:query_hash:{hash_query}', perma_uuid, nx=True, ex=recapture_interval)
341
344
  p.hset(f'lacus:capture_settings:{perma_uuid}', mapping=mapping_capture) # type: ignore
342
- p.zadd('lacus:to_capture', {perma_uuid: priority})
343
- p.execute()
345
+ p.zadd('lacus:to_capture', {perma_uuid: priority if priority is not None else 0})
346
+ try:
347
+ p.execute()
348
+ except DataError:
349
+ self.master_logger.exception(f'Unable to enqueue: {to_enqueue}')
350
+ raise LacusCoreException(f'Unable to enqueue: {to_enqueue}')
344
351
  return perma_uuid
345
352
 
346
353
  def _encode_response(self, capture: CaptureResponse) -> CaptureResponseJson:
@@ -366,7 +373,7 @@ class LacusCore():
366
373
  def get_capture(self, uuid: str, *, decode: Literal[False]) -> CaptureResponseJson:
367
374
  ...
368
375
 
369
- def get_capture(self, uuid: str, *, decode: bool=False) -> Union[CaptureResponse, CaptureResponseJson]:
376
+ def get_capture(self, uuid: str, *, decode: bool=False) -> CaptureResponse | CaptureResponseJson:
370
377
  """Get the results of a capture, in a json compatible format or not
371
378
 
372
379
  :param uuid: The UUID if the capture (given by enqueue)
@@ -405,12 +412,12 @@ class LacusCore():
405
412
  return CaptureStatus.DONE
406
413
  return CaptureStatus.UNKNOWN
407
414
 
408
- def consume_queue(self, max_consume: int) -> Iterator[Task]:
415
+ def consume_queue(self, max_consume: int) -> Iterator[Task]: # type: ignore[type-arg]
409
416
  """Trigger the capture for captures with the highest priority. Up to max_consume.
410
417
 
411
418
  :yield: Captures.
412
419
  """
413
- value: List[Tuple[bytes, float]]
420
+ value: list[tuple[bytes, float]]
414
421
  while max_consume > 0:
415
422
  value = self.redis.zpopmax('lacus:to_capture')
416
423
  if not value:
@@ -423,7 +430,7 @@ class LacusCore():
423
430
  priority: int = int(value[0][1])
424
431
  yield asyncio.create_task(self._capture(uuid, priority), name=uuid)
425
432
 
426
- async def _capture(self, uuid: str, priority: int):
433
+ async def _capture(self, uuid: str, priority: int) -> None:
427
434
  """Trigger a specific capture
428
435
 
429
436
  :param uuid: The UUID if the capture (given by enqueue)
@@ -708,8 +715,8 @@ class LacusCore():
708
715
  stats_pipeline.expire(f'stats:{today}:captures', expire_time)
709
716
  stats_pipeline.execute()
710
717
 
711
- def _store_capture_response(self, pipeline: Redis, capture_uuid: str, results: CaptureResponse,
712
- root_key: Optional[str]=None) -> None:
718
+ def _store_capture_response(self, pipeline: Redis, capture_uuid: str, results: CaptureResponse, # type: ignore[type-arg]
719
+ root_key: str | None=None) -> None:
713
720
  logger = LacusCoreLogAdapter(self.master_logger, {'uuid': capture_uuid})
714
721
  if root_key is None:
715
722
  root_key = f'lacus:capture_results_hash:{capture_uuid}'
@@ -750,7 +757,7 @@ class LacusCore():
750
757
  else:
751
758
  logger.critical(f'Nothing to store (Hash: {hash_to_set}) for {root_key}')
752
759
 
753
- def _get_capture_response(self, capture_uuid: str, root_key: Optional[str]=None) -> Optional[CaptureResponse]:
760
+ def _get_capture_response(self, capture_uuid: str, root_key: str | None=None) -> CaptureResponse | None:
754
761
  logger = LacusCoreLogAdapter(self.master_logger, {'uuid': capture_uuid})
755
762
  if root_key is None:
756
763
  root_key = f'lacus:capture_results_hash:{capture_uuid}'
@@ -792,7 +799,7 @@ class LacusCore():
792
799
  logger.critical(f'Unexpected key in response: {key} - {value}')
793
800
  return to_return
794
801
 
795
- def clear_capture(self, uuid: str, reason: str):
802
+ def clear_capture(self, uuid: str, reason: str) -> None:
796
803
  '''Remove a capture from the list, shouldn't happen unless it is in error'''
797
804
  logger = LacusCoreLogAdapter(self.master_logger, {'uuid': uuid})
798
805
  capture_status = self.get_capture_status(uuid)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "lacuscore"
3
- version = "1.7.8"
3
+ version = "1.7.10"
4
4
  description = "Core of Lacus, usable as a module"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -33,7 +33,7 @@ Sphinx = [
33
33
  {version = "<7.2", python = "<3.9", optional = true},
34
34
  {version = "^7.2", python = ">=3.9", optional = true}
35
35
  ]
36
- playwrightcapture = {extras = ["recaptcha"], version = "^1.22.5"}
36
+ playwrightcapture = {extras = ["recaptcha"], version = "^1.22.7"}
37
37
  defang = "^0.5.3"
38
38
  ua-parser = "^0.18.0"
39
39
  redis = {version = "^5.0.1", extras = ["hiredis"]}
@@ -42,10 +42,10 @@ redis = {version = "^5.0.1", extras = ["hiredis"]}
42
42
  docs = ["Sphinx"]
43
43
 
44
44
  [tool.poetry.group.dev.dependencies]
45
- types-redis = {version = "^4.6.0.11"}
45
+ types-redis = {version = "^4.6.0.20240106"}
46
46
  mypy = "^1.8.0"
47
- types-requests = "^2.31.0.20231231"
48
- types-beautifulsoup4 = "^4.12.0.7"
47
+ types-requests = "^2.31.0.20240106"
48
+ types-beautifulsoup4 = "^4.12.0.20240106"
49
49
  ipython = [
50
50
  {version = "<8.13.0", python = "<3.9"},
51
51
  {version = "^8.18.0", python = ">=3.9"},
File without changes
File without changes
File without changes