lacuscore 1.6.5__tar.gz → 1.6.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lacuscore
3
- Version: 1.6.5
3
+ Version: 1.6.7
4
4
  Summary: Core of Lacus, usable as a module
5
5
  Home-page: https://github.com/ail-project/LacusCore
6
6
  License: BSD-3-Clause
@@ -24,7 +24,7 @@ Classifier: Topic :: Security
24
24
  Provides-Extra: docs
25
25
  Requires-Dist: Sphinx (>=7.1.2,<8.0.0) ; extra == "docs"
26
26
  Requires-Dist: defang (>=0.5.3,<0.6.0)
27
- Requires-Dist: playwrightcapture[recaptcha] (>=1.21.4,<2.0.0)
27
+ Requires-Dist: playwrightcapture[recaptcha] (>=1.21.6,<2.0.0)
28
28
  Requires-Dist: redis[hiredis] (>=4.6.0,<5.0.0)
29
29
  Requires-Dist: requests (>=2.31.0,<3.0.0)
30
30
  Requires-Dist: ua-parser (>=0.18.0,<0.19.0)
@@ -10,7 +10,6 @@ import pickle
10
10
  import random
11
11
  import re
12
12
  import socket
13
- import sys
14
13
  import time
15
14
  import unicodedata
16
15
  import zlib
@@ -78,6 +77,9 @@ class CaptureStatus(IntEnum):
78
77
  class CaptureResponse(PlaywrightCaptureResponse, TypedDict, total=False):
79
78
  '''A capture made by Lacus. With the base64 encoded image and downloaded file decoded to bytes.'''
80
79
 
80
+ # Need to make sure the type is what's expected down the line
81
+ children: Optional[List['CaptureResponse']] # type: ignore
82
+
81
83
  status: int
82
84
  runtime: Optional[float]
83
85
 
@@ -94,7 +96,7 @@ class CaptureResponseJson(TypedDict, total=False):
94
96
  png: Optional[str]
95
97
  downloaded_filename: Optional[str]
96
98
  downloaded_file: Optional[str]
97
- children: Optional[List[Any]]
99
+ children: Optional[List['CaptureResponseJson']]
98
100
  runtime: Optional[float]
99
101
  potential_favicons: Optional[List[str]]
100
102
 
@@ -139,11 +141,6 @@ class LacusCoreLogAdapter(LoggerAdapter):
139
141
  return msg, kwargs
140
142
 
141
143
 
142
- def _json_encode(obj: Union[bytes]) -> str:
143
- if isinstance(obj, bytes):
144
- return b64encode(obj).decode()
145
-
146
-
147
144
  class LacusCore():
148
145
  """Capture URLs or web enabled documents using PlaywrightCapture.
149
146
 
@@ -168,7 +165,7 @@ class LacusCore():
168
165
  self.only_global_lookups = only_global_lookups
169
166
  self.max_retries = max_retries
170
167
 
171
- # NOTE: clear old ongoing captures queue in case of need
168
+ # NOTE: Remove in 1.8.* - clear old ongoing captures queue in case of need
172
169
  if self.redis.type('lacus:ongoing') in ['set', b'set']:
173
170
  self.redis.delete('lacus:ongoing')
174
171
 
@@ -353,11 +350,12 @@ class LacusCore():
353
350
  if capture.get('downloaded_file') is not None and capture['downloaded_file'] is not None: # the second part is not needed, but makes mypy happy
354
351
  encoded_capture['downloaded_file'] = b64encode(capture['downloaded_file']).decode()
355
352
  if capture.get('children') and capture['children']:
356
- for child in capture['children']:
357
- child = self._encode_response(child)
358
- if capture.get('potential_favicons') and capture['potential_favicons'] is not None:
359
- encoded_favicons = [b64encode(favicon).decode() for favicon in capture['potential_favicons']]
360
- encoded_capture['potential_favicons'] = encoded_favicons
353
+ encoded_capture['children'] = [self._encode_response(child) for child in capture['children']]
354
+
355
+ # A set cannot be dumped in json, it must be turned into a list. If it is empty, we need to remove it.
356
+ if 'potential_favicons' in capture:
357
+ if potential_favicons := capture.pop('potential_favicons'):
358
+ encoded_capture['potential_favicons'] = [b64encode(favicon).decode() for favicon in potential_favicons]
361
359
  return encoded_capture
362
360
 
363
361
  @overload
@@ -381,10 +379,9 @@ class LacusCore():
381
379
  to_return['status'] = CaptureStatus.QUEUED
382
380
  elif self.redis.zscore('lacus:ongoing', uuid) is not None:
383
381
  to_return['status'] = CaptureStatus.ONGOING
384
- elif response := self.redis.get(f'lacus:capture_results:{uuid}'):
382
+ elif response := self._get_capture_response(uuid):
385
383
  to_return['status'] = CaptureStatus.DONE
386
- response_json = pickle.loads(zlib.decompress(response))
387
- to_return.update(response_json)
384
+ to_return.update(response)
388
385
  if decode:
389
386
  return to_return
390
387
  return self._encode_response(to_return)
@@ -401,7 +398,10 @@ class LacusCore():
401
398
  return CaptureStatus.QUEUED
402
399
  elif self.redis.zscore('lacus:ongoing', uuid) is not None:
403
400
  return CaptureStatus.ONGOING
401
+ elif self.redis.exists(f'lacus:capture_results_hash:{uuid}'):
402
+ return CaptureStatus.DONE
404
403
  elif self.redis.exists(f'lacus:capture_results:{uuid}'):
404
+ # TODO: remove in 1.8.* - old format used last in 1.6, and kept no more than 10H in redis
405
405
  return CaptureStatus.DONE
406
406
  return CaptureStatus.UNKNOWN
407
407
 
@@ -675,18 +675,17 @@ class LacusCore():
675
675
  p.zadd('lacus:to_capture', {uuid: priority - 1})
676
676
  p.execute()
677
677
  else:
678
- to_store = zlib.compress(pickle.dumps(result))
679
678
  retry_redis_error = 3
680
679
  while retry_redis_error > 0:
681
680
  try:
682
681
  p = self.redis.pipeline()
683
- p.setex(f'lacus:capture_results:{uuid}', 36000, to_store)
682
+ self._store_capture_response(p, uuid, result)
684
683
  p.delete(f'lacus:capture_settings:{uuid}')
685
684
  p.zrem('lacus:ongoing', uuid)
686
685
  p.execute()
687
686
  break
688
687
  except RedisConnectionError as e:
689
- logger.warning(f'Unable to store capture result (size: {sys.getsizeof(to_store)} - Redis Connection Error: {e}')
688
+ logger.warning(f'Unable to store capture result - Redis Connection Error: {e}')
690
689
  retry_redis_error -= 1
691
690
  await asyncio.sleep(random.randint(5, 10))
692
691
  else:
@@ -702,6 +701,78 @@ class LacusCore():
702
701
  stats_pipeline.expire(f'stats:{today}:captures', expire_time)
703
702
  stats_pipeline.execute()
704
703
 
704
+ def _store_capture_response(self, pipeline: Redis, capture_uuid: str, results: CaptureResponse,
705
+ root_key: Optional[str]=None) -> None:
706
+ if root_key is None:
707
+ root_key = f'lacus:capture_results_hash:{capture_uuid}'
708
+
709
+ hash_to_set = {}
710
+ if results.get('har'):
711
+ hash_to_set['har'] = pickle.dumps(results['har'])
712
+ if results.get('cookies'):
713
+ hash_to_set['cookies'] = pickle.dumps(results['cookies'])
714
+ if results.get('potential_favicons'):
715
+ hash_to_set['potential_favicons'] = pickle.dumps(results['potential_favicons'])
716
+ if 'children' in results and results['children'] is not None:
717
+ padding_length = len(str(len(results['children'])))
718
+ children = set()
719
+ for i, child in enumerate(results['children']):
720
+ child_key = f'{root_key}_{i:0{padding_length}}'
721
+ self._store_capture_response(pipeline, capture_uuid, child, child_key)
722
+ children.add(child_key)
723
+ hash_to_set['children'] = pickle.dumps(children)
724
+
725
+ for key in results.keys():
726
+ if key in ['har', 'cookies', 'potential_favicons', 'children'] or not results.get(key):
727
+ continue
728
+ # these entries can be stored directly
729
+ hash_to_set[key] = results[key] # type: ignore
730
+ pipeline.hset(root_key, mapping=hash_to_set) # type: ignore
731
+ # Make sure the key expires
732
+ pipeline.expire(root_key, 36000)
733
+
734
+ def _get_capture_response(self, capture_uuid: str, root_key: Optional[str]=None) -> Optional[CaptureResponse]:
735
+ logger = LacusCoreLogAdapter(self.master_logger, {'uuid': capture_uuid})
736
+ if root_key is None:
737
+ root_key = f'lacus:capture_results_hash:{capture_uuid}'
738
+
739
+ if not self.redis.exists(root_key):
740
+ if old_response := self.redis.get(f'lacus:capture_results:{capture_uuid}'):
741
+ # TODO: remove in 1.8.* - old format used last in 1.6, and kept no more than 10H in redis
742
+ return pickle.loads(zlib.decompress(old_response))
743
+ return None
744
+
745
+ # New format and capture done
746
+
747
+ to_return: CaptureResponse = {}
748
+ for key, value in self.redis.hgetall(root_key).items():
749
+ if key == b'har':
750
+ to_return['har'] = pickle.loads(value)
751
+ elif key == b'cookies':
752
+ to_return['cookies'] = pickle.loads(value)
753
+ elif key == b'potential_favicons':
754
+ to_return['potential_favicons'] = pickle.loads(value)
755
+ elif key == b'children':
756
+ to_return['children'] = []
757
+ for child_root_key in sorted(pickle.loads(value)):
758
+ child = self._get_capture_response(capture_uuid, child_root_key)
759
+ to_return['children'].append(child) # type: ignore
760
+ elif key in [b'status']:
761
+ # The value is an int
762
+ to_return[key.decode()] = int(value) # type: ignore
763
+ elif key in [b'runtime']:
764
+ # The value is a float
765
+ to_return[key.decode()] = float(value) # type: ignore
766
+ elif key in [b'last_redirected_url', b'error', b'error_name', b'html', b'downloaded_filename']:
767
+ # the value is a string
768
+ to_return[key.decode()] = value.decode() # type: ignore
769
+ elif key in [b'png', b'downloaded_file']:
770
+ # the value is bytes
771
+ to_return[key.decode()] = value # type: ignore
772
+ else:
773
+ logger.critical(f'Unexpected key in response: {key} - {value}')
774
+ return to_return
775
+
705
776
  def clear_capture(self, uuid: str, reason: str):
706
777
  '''Remove a capture from the list, shouldn't happen unless it is in error'''
707
778
  logger = LacusCoreLogAdapter(self.master_logger, {'uuid': uuid})
@@ -709,10 +780,9 @@ class LacusCore():
709
780
  logger.warning('Attempted to clear capture that is still being processed.')
710
781
  return
711
782
  logger.warning(f'Clearing capture: {reason}')
712
- result = {'error': reason}
783
+ result: CaptureResponse = {'error': reason}
713
784
  p = self.redis.pipeline()
714
- to_store = zlib.compress(pickle.dumps(result))
715
- p.setex(f'lacus:capture_results:{uuid}', 36000, to_store)
785
+ self._store_capture_response(p, uuid, result)
716
786
  p.delete(f'lacus:capture_settings:{uuid}')
717
787
  p.zrem('lacus:ongoing', uuid)
718
788
  p.execute()
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "lacuscore"
3
- version = "1.6.5"
3
+ version = "1.6.7"
4
4
  description = "Core of Lacus, usable as a module"
5
5
  authors = ["Raphaël Vinot <raphael.vinot@circl.lu>"]
6
6
  license = "BSD-3-Clause"
@@ -32,7 +32,7 @@ include = ['README.md']
32
32
  python = "^3.8"
33
33
  requests = "^2.31.0"
34
34
  Sphinx = { version = "^7.1.2", optional = true }
35
- playwrightcapture = {extras = ["recaptcha"], version = "^1.21.4"}
35
+ playwrightcapture = {extras = ["recaptcha"], version = "^1.21.6"}
36
36
  defang = "^0.5.3"
37
37
  ua-parser = "^0.18.0"
38
38
  redis = {version = "^4.6.0", extras = ["hiredis"]}
@@ -42,9 +42,9 @@ docs = ["Sphinx"]
42
42
 
43
43
  [tool.poetry.group.dev.dependencies]
44
44
  types-redis = {version = "^4.6.0.3"}
45
- mypy = "^1.4.1"
45
+ mypy = "^1.5.0"
46
46
  types-requests = "^2.31.0.2"
47
- types-beautifulsoup4 = "^4.12.0.5"
47
+ types-beautifulsoup4 = "^4.12.0.6"
48
48
  ipython = [
49
49
  {version = "<8.13.0", python = "<3.9"},
50
50
  {version = "^8.13.0", python = ">=3.9"}
File without changes
File without changes
File without changes