lacuscore 1.6.5__tar.gz → 1.6.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lacuscore-1.6.5 → lacuscore-1.6.7}/PKG-INFO +2 -2
- {lacuscore-1.6.5 → lacuscore-1.6.7}/lacuscore/lacuscore.py +92 -22
- {lacuscore-1.6.5 → lacuscore-1.6.7}/pyproject.toml +4 -4
- {lacuscore-1.6.5 → lacuscore-1.6.7}/LICENSE +0 -0
- {lacuscore-1.6.5 → lacuscore-1.6.7}/README.md +0 -0
- {lacuscore-1.6.5 → lacuscore-1.6.7}/lacuscore/__init__.py +0 -0
- {lacuscore-1.6.5 → lacuscore-1.6.7}/lacuscore/lacus_monitoring.py +0 -0
- {lacuscore-1.6.5 → lacuscore-1.6.7}/lacuscore/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: lacuscore
|
3
|
-
Version: 1.6.5
|
3
|
+
Version: 1.6.7
|
4
4
|
Summary: Core of Lacus, usable as a module
|
5
5
|
Home-page: https://github.com/ail-project/LacusCore
|
6
6
|
License: BSD-3-Clause
|
@@ -24,7 +24,7 @@ Classifier: Topic :: Security
|
|
24
24
|
Provides-Extra: docs
|
25
25
|
Requires-Dist: Sphinx (>=7.1.2,<8.0.0) ; extra == "docs"
|
26
26
|
Requires-Dist: defang (>=0.5.3,<0.6.0)
|
27
|
-
Requires-Dist: playwrightcapture[recaptcha] (>=1.21.
|
27
|
+
Requires-Dist: playwrightcapture[recaptcha] (>=1.21.6,<2.0.0)
|
28
28
|
Requires-Dist: redis[hiredis] (>=4.6.0,<5.0.0)
|
29
29
|
Requires-Dist: requests (>=2.31.0,<3.0.0)
|
30
30
|
Requires-Dist: ua-parser (>=0.18.0,<0.19.0)
|
@@ -10,7 +10,6 @@ import pickle
|
|
10
10
|
import random
|
11
11
|
import re
|
12
12
|
import socket
|
13
|
-
import sys
|
14
13
|
import time
|
15
14
|
import unicodedata
|
16
15
|
import zlib
|
@@ -78,6 +77,9 @@ class CaptureStatus(IntEnum):
|
|
78
77
|
class CaptureResponse(PlaywrightCaptureResponse, TypedDict, total=False):
|
79
78
|
'''A capture made by Lacus. With the base64 encoded image and downloaded file decoded to bytes.'''
|
80
79
|
|
80
|
+
# Need to make sure the type is what's expected down the line
|
81
|
+
children: Optional[List['CaptureResponse']] # type: ignore
|
82
|
+
|
81
83
|
status: int
|
82
84
|
runtime: Optional[float]
|
83
85
|
|
@@ -94,7 +96,7 @@ class CaptureResponseJson(TypedDict, total=False):
|
|
94
96
|
png: Optional[str]
|
95
97
|
downloaded_filename: Optional[str]
|
96
98
|
downloaded_file: Optional[str]
|
97
|
-
children: Optional[List[
|
99
|
+
children: Optional[List['CaptureResponseJson']]
|
98
100
|
runtime: Optional[float]
|
99
101
|
potential_favicons: Optional[List[str]]
|
100
102
|
|
@@ -139,11 +141,6 @@ class LacusCoreLogAdapter(LoggerAdapter):
|
|
139
141
|
return msg, kwargs
|
140
142
|
|
141
143
|
|
142
|
-
def _json_encode(obj: Union[bytes]) -> str:
|
143
|
-
if isinstance(obj, bytes):
|
144
|
-
return b64encode(obj).decode()
|
145
|
-
|
146
|
-
|
147
144
|
class LacusCore():
|
148
145
|
"""Capture URLs or web enabled documents using PlaywrightCapture.
|
149
146
|
|
@@ -168,7 +165,7 @@ class LacusCore():
|
|
168
165
|
self.only_global_lookups = only_global_lookups
|
169
166
|
self.max_retries = max_retries
|
170
167
|
|
171
|
-
# NOTE: clear old ongoing captures queue in case of need
|
168
|
+
# NOTE: Remove in 1.8.* - clear old ongoing captures queue in case of need
|
172
169
|
if self.redis.type('lacus:ongoing') in ['set', b'set']:
|
173
170
|
self.redis.delete('lacus:ongoing')
|
174
171
|
|
@@ -353,11 +350,12 @@ class LacusCore():
|
|
353
350
|
if capture.get('downloaded_file') is not None and capture['downloaded_file'] is not None: # the second part is not needed, but makes mypy happy
|
354
351
|
encoded_capture['downloaded_file'] = b64encode(capture['downloaded_file']).decode()
|
355
352
|
if capture.get('children') and capture['children']:
|
356
|
-
for child in capture['children']
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
353
|
+
encoded_capture['children'] = [self._encode_response(child) for child in capture['children']]
|
354
|
+
|
355
|
+
# A set cannot be dumped in json, it must be turned into a list. If it is empty, we need to remove it.
|
356
|
+
if 'potential_favicons' in capture:
|
357
|
+
if potential_favicons := capture.pop('potential_favicons'):
|
358
|
+
encoded_capture['potential_favicons'] = [b64encode(favicon).decode() for favicon in potential_favicons]
|
361
359
|
return encoded_capture
|
362
360
|
|
363
361
|
@overload
|
@@ -381,10 +379,9 @@ class LacusCore():
|
|
381
379
|
to_return['status'] = CaptureStatus.QUEUED
|
382
380
|
elif self.redis.zscore('lacus:ongoing', uuid) is not None:
|
383
381
|
to_return['status'] = CaptureStatus.ONGOING
|
384
|
-
elif response := self.
|
382
|
+
elif response := self._get_capture_response(uuid):
|
385
383
|
to_return['status'] = CaptureStatus.DONE
|
386
|
-
|
387
|
-
to_return.update(response_json)
|
384
|
+
to_return.update(response)
|
388
385
|
if decode:
|
389
386
|
return to_return
|
390
387
|
return self._encode_response(to_return)
|
@@ -401,7 +398,10 @@ class LacusCore():
|
|
401
398
|
return CaptureStatus.QUEUED
|
402
399
|
elif self.redis.zscore('lacus:ongoing', uuid) is not None:
|
403
400
|
return CaptureStatus.ONGOING
|
401
|
+
elif self.redis.exists(f'lacus:capture_results_hash:{uuid}'):
|
402
|
+
return CaptureStatus.DONE
|
404
403
|
elif self.redis.exists(f'lacus:capture_results:{uuid}'):
|
404
|
+
# TODO: remove in 1.8.* - old format used last in 1.6, and kept no more than 10H in redis
|
405
405
|
return CaptureStatus.DONE
|
406
406
|
return CaptureStatus.UNKNOWN
|
407
407
|
|
@@ -675,18 +675,17 @@ class LacusCore():
|
|
675
675
|
p.zadd('lacus:to_capture', {uuid: priority - 1})
|
676
676
|
p.execute()
|
677
677
|
else:
|
678
|
-
to_store = zlib.compress(pickle.dumps(result))
|
679
678
|
retry_redis_error = 3
|
680
679
|
while retry_redis_error > 0:
|
681
680
|
try:
|
682
681
|
p = self.redis.pipeline()
|
683
|
-
|
682
|
+
self._store_capture_response(p, uuid, result)
|
684
683
|
p.delete(f'lacus:capture_settings:{uuid}')
|
685
684
|
p.zrem('lacus:ongoing', uuid)
|
686
685
|
p.execute()
|
687
686
|
break
|
688
687
|
except RedisConnectionError as e:
|
689
|
-
logger.warning(f'Unable to store capture result
|
688
|
+
logger.warning(f'Unable to store capture result - Redis Connection Error: {e}')
|
690
689
|
retry_redis_error -= 1
|
691
690
|
await asyncio.sleep(random.randint(5, 10))
|
692
691
|
else:
|
@@ -702,6 +701,78 @@ class LacusCore():
|
|
702
701
|
stats_pipeline.expire(f'stats:{today}:captures', expire_time)
|
703
702
|
stats_pipeline.execute()
|
704
703
|
|
704
|
+
def _store_capture_response(self, pipeline: Redis, capture_uuid: str, results: CaptureResponse,
|
705
|
+
root_key: Optional[str]=None) -> None:
|
706
|
+
if root_key is None:
|
707
|
+
root_key = f'lacus:capture_results_hash:{capture_uuid}'
|
708
|
+
|
709
|
+
hash_to_set = {}
|
710
|
+
if results.get('har'):
|
711
|
+
hash_to_set['har'] = pickle.dumps(results['har'])
|
712
|
+
if results.get('cookies'):
|
713
|
+
hash_to_set['cookies'] = pickle.dumps(results['cookies'])
|
714
|
+
if results.get('potential_favicons'):
|
715
|
+
hash_to_set['potential_favicons'] = pickle.dumps(results['potential_favicons'])
|
716
|
+
if 'children' in results and results['children'] is not None:
|
717
|
+
padding_length = len(str(len(results['children'])))
|
718
|
+
children = set()
|
719
|
+
for i, child in enumerate(results['children']):
|
720
|
+
child_key = f'{root_key}_{i:0{padding_length}}'
|
721
|
+
self._store_capture_response(pipeline, capture_uuid, child, child_key)
|
722
|
+
children.add(child_key)
|
723
|
+
hash_to_set['children'] = pickle.dumps(children)
|
724
|
+
|
725
|
+
for key in results.keys():
|
726
|
+
if key in ['har', 'cookies', 'potential_favicons', 'children'] or not results.get(key):
|
727
|
+
continue
|
728
|
+
# these entries can be stored directly
|
729
|
+
hash_to_set[key] = results[key] # type: ignore
|
730
|
+
pipeline.hset(root_key, mapping=hash_to_set) # type: ignore
|
731
|
+
# Make sure the key expires
|
732
|
+
pipeline.expire(root_key, 36000)
|
733
|
+
|
734
|
+
def _get_capture_response(self, capture_uuid: str, root_key: Optional[str]=None) -> Optional[CaptureResponse]:
|
735
|
+
logger = LacusCoreLogAdapter(self.master_logger, {'uuid': capture_uuid})
|
736
|
+
if root_key is None:
|
737
|
+
root_key = f'lacus:capture_results_hash:{capture_uuid}'
|
738
|
+
|
739
|
+
if not self.redis.exists(root_key):
|
740
|
+
if old_response := self.redis.get(f'lacus:capture_results:{capture_uuid}'):
|
741
|
+
# TODO: remove in 1.8.* - old format used last in 1.6, and kept no more than 10H in redis
|
742
|
+
return pickle.loads(zlib.decompress(old_response))
|
743
|
+
return None
|
744
|
+
|
745
|
+
# New format and capture done
|
746
|
+
|
747
|
+
to_return: CaptureResponse = {}
|
748
|
+
for key, value in self.redis.hgetall(root_key).items():
|
749
|
+
if key == b'har':
|
750
|
+
to_return['har'] = pickle.loads(value)
|
751
|
+
elif key == b'cookies':
|
752
|
+
to_return['cookies'] = pickle.loads(value)
|
753
|
+
elif key == b'potential_favicons':
|
754
|
+
to_return['potential_favicons'] = pickle.loads(value)
|
755
|
+
elif key == b'children':
|
756
|
+
to_return['children'] = []
|
757
|
+
for child_root_key in sorted(pickle.loads(value)):
|
758
|
+
child = self._get_capture_response(capture_uuid, child_root_key)
|
759
|
+
to_return['children'].append(child) # type: ignore
|
760
|
+
elif key in [b'status']:
|
761
|
+
# The value is an int
|
762
|
+
to_return[key.decode()] = int(value) # type: ignore
|
763
|
+
elif key in [b'runtime']:
|
764
|
+
# The value is a float
|
765
|
+
to_return[key.decode()] = float(value) # type: ignore
|
766
|
+
elif key in [b'last_redirected_url', b'error', b'error_name', b'html', b'downloaded_filename']:
|
767
|
+
# the value is a string
|
768
|
+
to_return[key.decode()] = value.decode() # type: ignore
|
769
|
+
elif key in [b'png', b'downloaded_file']:
|
770
|
+
# the value is bytes
|
771
|
+
to_return[key.decode()] = value # type: ignore
|
772
|
+
else:
|
773
|
+
logger.critical(f'Unexpected key in response: {key} - {value}')
|
774
|
+
return to_return
|
775
|
+
|
705
776
|
def clear_capture(self, uuid: str, reason: str):
|
706
777
|
'''Remove a capture from the list, shouldn't happen unless it is in error'''
|
707
778
|
logger = LacusCoreLogAdapter(self.master_logger, {'uuid': uuid})
|
@@ -709,10 +780,9 @@ class LacusCore():
|
|
709
780
|
logger.warning('Attempted to clear capture that is still being processed.')
|
710
781
|
return
|
711
782
|
logger.warning(f'Clearing capture: {reason}')
|
712
|
-
result = {'error': reason}
|
783
|
+
result: CaptureResponse = {'error': reason}
|
713
784
|
p = self.redis.pipeline()
|
714
|
-
|
715
|
-
p.setex(f'lacus:capture_results:{uuid}', 36000, to_store)
|
785
|
+
self._store_capture_response(p, uuid, result)
|
716
786
|
p.delete(f'lacus:capture_settings:{uuid}')
|
717
787
|
p.zrem('lacus:ongoing', uuid)
|
718
788
|
p.execute()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "lacuscore"
|
3
|
-
version = "1.6.5"
|
3
|
+
version = "1.6.7"
|
4
4
|
description = "Core of Lacus, usable as a module"
|
5
5
|
authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
|
6
6
|
license = "BSD-3-Clause"
|
@@ -32,7 +32,7 @@ include = ['README.md']
|
|
32
32
|
python = "^3.8"
|
33
33
|
requests = "^2.31.0"
|
34
34
|
Sphinx = { version = "^7.1.2", optional = true }
|
35
|
-
playwrightcapture = {extras = ["recaptcha"], version = "^1.21.
|
35
|
+
playwrightcapture = {extras = ["recaptcha"], version = "^1.21.6"}
|
36
36
|
defang = "^0.5.3"
|
37
37
|
ua-parser = "^0.18.0"
|
38
38
|
redis = {version = "^4.6.0", extras = ["hiredis"]}
|
@@ -42,9 +42,9 @@ docs = ["Sphinx"]
|
|
42
42
|
|
43
43
|
[tool.poetry.group.dev.dependencies]
|
44
44
|
types-redis = {version = "^4.6.0.3"}
|
45
|
-
mypy = "^1.
|
45
|
+
mypy = "^1.5.0"
|
46
46
|
types-requests = "^2.31.0.2"
|
47
|
-
types-beautifulsoup4 = "^4.12.0.
|
47
|
+
types-beautifulsoup4 = "^4.12.0.6"
|
48
48
|
ipython = [
|
49
49
|
{version = "<8.13.0", python = "<3.9"},
|
50
50
|
{version = "^8.13.0", python = ">=3.9"}
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|