assemblyline-core 4.5.1.dev145__tar.gz → 4.5.1.dev151__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of assemblyline-core might be problematic. Click here for more details.

Files changed (88) hide show
  1. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/PKG-INFO +1 -1
  2. assemblyline-core-4.5.1.dev151/assemblyline_core/VERSION +1 -0
  3. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/archiver/run_archiver.py +4 -3
  4. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/expiry/run_expiry.py +113 -48
  5. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core.egg-info/PKG-INFO +1 -1
  6. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/test/test_dispatcher.py +3 -3
  7. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/test/test_expiry.py +13 -15
  8. assemblyline-core-4.5.1.dev145/assemblyline_core/VERSION +0 -1
  9. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/LICENCE.md +0 -0
  10. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/README.md +0 -0
  11. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/__init__.py +0 -0
  12. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/alerter/__init__.py +0 -0
  13. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/alerter/processing.py +0 -0
  14. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/alerter/run_alerter.py +0 -0
  15. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/archiver/__init__.py +0 -0
  16. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/badlist_client.py +0 -0
  17. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/dispatching/__init__.py +0 -0
  18. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/dispatching/__main__.py +0 -0
  19. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/dispatching/client.py +0 -0
  20. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/dispatching/dispatcher.py +0 -0
  21. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/dispatching/schedules.py +0 -0
  22. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/dispatching/timeout.py +0 -0
  23. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/expiry/__init__.py +0 -0
  24. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/ingester/__init__.py +0 -0
  25. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/ingester/__main__.py +0 -0
  26. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/ingester/constants.py +0 -0
  27. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/ingester/ingester.py +0 -0
  28. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/metrics/__init__.py +0 -0
  29. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/metrics/es_metrics.py +0 -0
  30. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/metrics/heartbeat_formatter.py +0 -0
  31. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/metrics/helper.py +0 -0
  32. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/metrics/metrics_server.py +0 -0
  33. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/metrics/run_heartbeat_manager.py +0 -0
  34. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/metrics/run_metrics_aggregator.py +0 -0
  35. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/metrics/run_statistics_aggregator.py +0 -0
  36. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/plumber/__init__.py +0 -0
  37. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/plumber/run_plumber.py +0 -0
  38. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/replay/__init__.py +0 -0
  39. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/replay/client.py +0 -0
  40. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/replay/creator/__init__.py +0 -0
  41. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/replay/creator/run.py +0 -0
  42. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/replay/creator/run_worker.py +0 -0
  43. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/replay/loader/__init__.py +0 -0
  44. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/replay/loader/run.py +0 -0
  45. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/replay/loader/run_worker.py +0 -0
  46. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/replay/replay.py +0 -0
  47. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/safelist_client.py +0 -0
  48. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/scaler/__init__.py +0 -0
  49. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/scaler/collection.py +0 -0
  50. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/scaler/controllers/__init__.py +0 -0
  51. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/scaler/controllers/docker_ctl.py +0 -0
  52. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/scaler/controllers/interface.py +0 -0
  53. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/scaler/controllers/kubernetes_ctl.py +0 -0
  54. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/scaler/run_scaler.py +0 -0
  55. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/scaler/scaler_server.py +0 -0
  56. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/server_base.py +0 -0
  57. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/signature_client.py +0 -0
  58. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/submission_client.py +0 -0
  59. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/tasking_client.py +0 -0
  60. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/updater/__init__.py +0 -0
  61. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/updater/helper.py +0 -0
  62. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/updater/run_updater.py +0 -0
  63. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/vacuum/__init__.py +0 -0
  64. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/vacuum/crawler.py +0 -0
  65. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/vacuum/department_map.py +0 -0
  66. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/vacuum/safelist.py +0 -0
  67. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/vacuum/stream_map.py +0 -0
  68. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/vacuum/worker.py +0 -0
  69. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/workflow/__init__.py +0 -0
  70. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core/workflow/run_workflow.py +0 -0
  71. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core.egg-info/SOURCES.txt +0 -0
  72. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core.egg-info/dependency_links.txt +0 -0
  73. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core.egg-info/requires.txt +0 -0
  74. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/assemblyline_core.egg-info/top_level.txt +0 -0
  75. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/setup.cfg +0 -0
  76. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/setup.py +0 -0
  77. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/test/test_alerter.py +0 -0
  78. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/test/test_badlist_client.py +0 -0
  79. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/test/test_plumber.py +0 -0
  80. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/test/test_replay.py +0 -0
  81. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/test/test_safelist_client.py +0 -0
  82. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/test/test_scaler.py +0 -0
  83. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/test/test_scheduler.py +0 -0
  84. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/test/test_signature_client.py +0 -0
  85. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/test/test_simulation.py +0 -0
  86. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/test/test_vacuum.py +0 -0
  87. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/test/test_worker_ingest.py +0 -0
  88. {assemblyline-core-4.5.1.dev145 → assemblyline-core-4.5.1.dev151}/test/test_worker_submit.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: assemblyline-core
3
- Version: 4.5.1.dev145
3
+ Version: 4.5.1.dev151
4
4
  Summary: Assemblyline 4 - Core components
5
5
  Home-page: https://github.com/CybercentreCanada/assemblyline-core/
6
6
  Author: CCCS Assemblyline development team
@@ -0,0 +1 @@
1
+ 4.5.1.dev151
@@ -64,7 +64,7 @@ class Archiver(ServerBase):
64
64
  try:
65
65
  if len(message) == 3:
66
66
  archive_type, type_id, delete_after = message
67
- metadata = None
67
+ metadata = {}
68
68
  use_alternate_dtl = False
69
69
  elif len(message) == 4:
70
70
  archive_type, type_id, delete_after, metadata = message
@@ -90,8 +90,9 @@ class Archiver(ServerBase):
90
90
  submission, version = self.datastore.submission.get_if_exists(type_id, version=True)
91
91
 
92
92
  # If we have metadata passed in the message, we need to apply it before archiving the submission
93
- if metadata and self.config.core.archiver.use_metadata:
94
- submission.metadata.update({f"archive.{k}": v for k, v in metadata.items()})
93
+ if metadata and self.config.submission.metadata.archive:
94
+ submission.metadata.update({k: v for k, v in metadata.items()
95
+ if k not in submission.metadata})
95
96
  self.datastore.submission.save(type_id, submission, version=version)
96
97
 
97
98
  break
@@ -4,38 +4,48 @@ from __future__ import annotations
4
4
  import concurrent.futures
5
5
  import threading
6
6
  import functools
7
- import elasticapm
8
7
  import time
9
-
10
8
  from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, Future, as_completed
11
9
  from concurrent.futures.process import BrokenProcessPool
12
- from datemath import dm
13
10
  from typing import Callable, Optional, TYPE_CHECKING
14
11
 
15
- from assemblyline.common.isotime import epoch_to_iso, now_as_iso
16
- from assemblyline.datastore.collection import Index
12
+ import elasticapm
13
+ from datemath import dm
14
+
17
15
  from assemblyline_core.server_base import ServerBase
18
16
  from assemblyline_core.dispatching.dispatcher import BAD_SID_HASH
19
17
  from assemblyline.common import forge
18
+ from assemblyline.common.isotime import epoch_to_iso, now_as_iso
20
19
  from assemblyline.common.metrics import MetricsFactory
21
20
  from assemblyline.filestore import FileStore
22
21
  from assemblyline.odm.messages.expiry_heartbeat import Metrics
23
22
  from assemblyline.remote.datatypes import get_client
23
+ from assemblyline.datastore.collection import Index
24
24
  from assemblyline.remote.datatypes.set import Set
25
25
 
26
26
  if TYPE_CHECKING:
27
27
  from assemblyline.datastore.collection import ESCollection
28
28
 
29
29
 
30
- def file_delete_worker(logger, filestore_urls, file_batch) -> list[str]:
30
+ def file_delete_worker(logger, filestore_urls, file_batch, archive_filestore_urls=None) -> list[tuple[str, bool]]:
31
31
  try:
32
32
  filestore = FileStore(*filestore_urls)
33
-
34
- def filestore_delete(sha256: str) -> Optional[str]:
35
- filestore.delete(sha256)
36
- if not filestore.exists(sha256):
37
- return sha256
38
- return None
33
+ if archive_filestore_urls and filestore_urls != archive_filestore_urls:
34
+ archivestore = FileStore(*archive_filestore_urls)
35
+ else:
36
+ archivestore = filestore
37
+
38
+ def filestore_delete(item: tuple[str, bool]) -> tuple[Optional[str], Optional[bool]]:
39
+ sha256, from_archive = item
40
+ if from_archive:
41
+ archivestore.delete(sha256)
42
+ if not archivestore.exists(sha256):
43
+ return sha256, True
44
+ else:
45
+ filestore.delete(sha256)
46
+ if not filestore.exists(sha256):
47
+ return sha256, False
48
+ return None, None
39
49
 
40
50
  return _file_delete_worker(logger, filestore_delete, file_batch)
41
51
 
@@ -44,8 +54,11 @@ def file_delete_worker(logger, filestore_urls, file_batch) -> list[str]:
44
54
  return []
45
55
 
46
56
 
47
- def _file_delete_worker(logger, delete_action: Callable[[str], Optional[str]], file_batch) -> list[str]:
48
- finished_files: list[str] = []
57
+ ActionSignature = Callable[[tuple[str, bool]], tuple[Optional[str], Optional[bool]]]
58
+
59
+
60
+ def _file_delete_worker(logger, delete_action: ActionSignature, file_batch) -> list[tuple[str, bool]]:
61
+ finished_files: list[tuple[str, bool]] = []
49
62
  try:
50
63
  futures = []
51
64
 
@@ -55,9 +68,9 @@ def _file_delete_worker(logger, delete_action: Callable[[str], Optional[str]], f
55
68
 
56
69
  for future in as_completed(futures):
57
70
  try:
58
- erased_name = future.result()
59
- if erased_name:
60
- finished_files.append(erased_name)
71
+ erased_name, from_archive = future.result()
72
+ if erased_name and from_archive is not None:
73
+ finished_files.append((erased_name, from_archive))
61
74
  except Exception as error:
62
75
  logger.exception("Error in filestore worker: " + str(error))
63
76
 
@@ -67,17 +80,29 @@ def _file_delete_worker(logger, delete_action: Callable[[str], Optional[str]], f
67
80
 
68
81
 
69
82
  class ExpiryManager(ServerBase):
70
- def __init__(self, redis_persist=None):
71
- self.config = forge.get_config()
83
+ def __init__(self, redis_persist=None, datastore=None, filestore=None, config=None, classification=None):
84
+ self.config = config or forge.get_config()
72
85
 
73
86
  super().__init__('assemblyline.expiry', shutdown_timeout=self.config.core.expiry.sleep_time + 5)
74
- self.datastore = forge.get_datastore(config=self.config)
75
- self.filestore = forge.get_filestore(config=self.config)
76
- self.classification = forge.get_classification()
87
+
88
+ # Set Archive related configs
89
+ if self.config.datastore.archive.enabled:
90
+ self.archive_access = True
91
+ self.index_type = Index.HOT_AND_ARCHIVE
92
+ else:
93
+ self.archive_access = False
94
+ self.index_type = Index.HOT
95
+
96
+ self.datastore = datastore or forge.get_datastore(config=self.config, archive_access=self.archive_access)
97
+ self.filestore = filestore or forge.get_filestore(config=self.config)
98
+ self.classification = classification or forge.get_classification()
77
99
  self.expirable_collections: list[ESCollection] = []
78
100
  self.counter = MetricsFactory('expiry', Metrics)
79
101
  self.file_delete_worker = ProcessPoolExecutor(self.config.core.expiry.delete_workers)
80
- self.same_storage = self.config.filestore.storage == self.config.filestore.archive
102
+ if self.config.filestore.archive:
103
+ self.same_storage = self.config.filestore.storage == self.config.filestore.archive
104
+ else:
105
+ self.same_storage = True
81
106
  self.current_submission_cleanup = set()
82
107
 
83
108
  self.redis_persist = redis_persist or get_client(
@@ -127,14 +152,15 @@ class ExpiryManager(ServerBase):
127
152
  def filestore_delete(self, file_batch, _):
128
153
  return self.file_delete_worker.submit(file_delete_worker, logger=self.log,
129
154
  filestore_urls=list(self.config.filestore.storage),
130
- file_batch=file_batch)
155
+ file_batch=file_batch,
156
+ archive_filestore_urls=list(self.config.filestore.archive))
131
157
 
132
158
  def cachestore_delete(self, file_batch, _):
133
159
  return self.file_delete_worker.submit(file_delete_worker, logger=self.log,
134
160
  filestore_urls=list(self.config.filestore.cache),
135
161
  file_batch=file_batch)
136
162
 
137
- def _finish_delete(self, collection: ESCollection, task: Future, expire_only: list[str]):
163
+ def _finish_delete(self, collection: ESCollection, task: Future, expire_only: list[tuple[str, bool]]):
138
164
  # Wait until the worker process finishes deleting files
139
165
  file_list: list[str] = []
140
166
  while self.running:
@@ -145,25 +171,41 @@ class ExpiryManager(ServerBase):
145
171
  except concurrent.futures.TimeoutError:
146
172
  pass
147
173
 
148
- file_list.extend(expire_only)
149
-
150
- # build a batch delete job for all the removed files
151
- bulk = collection.get_bulk_plan()
152
- for sha256 in file_list:
153
- bulk.add_delete_operation(sha256)
154
-
155
- if len(file_list) > 0:
174
+ if file_list:
156
175
  self.log.info(f'[{collection.name}] Deleted associated files from the '
157
176
  f'{"cachestore" if "cache" in collection.name else "filestore"}...')
158
- collection.bulk(bulk)
159
- self.counter.increment(f'{collection.name}', increment_by=len(file_list))
160
- self.log.info(f"[{collection.name}] Deleted {len(file_list)} items from the datastore...")
161
177
  else:
178
+ self.log.info(f'[{collection.name}] Nothing was deleted from the '
179
+ f'{"cachestore" if "cache" in collection.name else "filestore"}...')
180
+
181
+ # From the files to be deleted, check which are from the hot index
182
+ hot_file_list = [x[0] for x in file_list if not x[1]]
183
+ hot_file_list.extend([x[0] for x in expire_only if not x[1]])
184
+
185
+ # From the files to be deleted, check which are from the archive index
186
+ archive_file_list = [x[0] for x in file_list if x[1]]
187
+ archive_file_list.extend([x[0] for x in expire_only if x[1]])
188
+
189
+ for cur_file_list, index_type in [(hot_file_list, Index.HOT), (archive_file_list, Index.ARCHIVE)]:
190
+ if not cur_file_list:
191
+ # Nothing to delete from this index type
192
+ continue
193
+
194
+ # build a batch delete job for all the removed files
195
+ bulk = collection.get_bulk_plan(index_type=index_type)
196
+ for sha256 in cur_file_list:
197
+ bulk.add_delete_operation(sha256)
198
+
199
+ collection.bulk(bulk)
200
+ self.counter.increment(f'{collection.name}', increment_by=len(cur_file_list))
201
+ self.log.info(f"[{collection.name}] Deleted {len(cur_file_list)} items from the datastore...")
202
+
203
+ if not hot_file_list and not archive_file_list:
162
204
  self.log.warning(f'[{collection.name}] Expiry unable to clean up any of the files in filestore.')
163
205
 
164
- def _simple_delete(self, collection, delete_query, number_to_delete):
206
+ def _simple_delete(self, collection: ESCollection, delete_query, number_to_delete):
165
207
  self.heartbeat()
166
- collection.delete_by_query(delete_query)
208
+ collection.delete_by_query(delete_query, index_type=self.index_type)
167
209
  self.counter.increment(f'{collection.name}', increment_by=number_to_delete)
168
210
  self.log.info(f"[{collection.name}] Deleted {number_to_delete} items from the datastore...")
169
211
 
@@ -195,22 +237,45 @@ class ExpiryManager(ServerBase):
195
237
  # check if we are dealing with an index that needs file cleanup
196
238
  if self.config.core.expiry.delete_storage and collection.name in self.fs_hashmap:
197
239
  # Delete associated files
198
- delete_objects: list[str] = []
199
- for item in collection.stream_search(delete_query, fl='id', as_obj=False):
240
+ delete_objects: list[tuple[str, bool]] = []
241
+ for item in collection.stream_search(
242
+ delete_query, fl='id,from_archive', as_obj=False, index_type=self.index_type):
200
243
  self.heartbeat()
201
- delete_objects.append(item['id'])
244
+ delete_objects.append((item['id'], item.get('from_archive', False)))
202
245
 
203
246
  # Filter archived documents if archive filestore is the same as the filestore
204
- expire_only = []
205
- if self.same_storage and self.config.datastore.archive.enabled and collection.name == 'file':
206
- archived_files = self.datastore.file.multiexists(delete_objects, index_type=Index.ARCHIVE)
207
- delete_objects = [k for k, v in archived_files.items() if not v]
208
- expire_only = [k for k, v in archived_files.items() if v]
247
+ expire_only: list[tuple[str, bool]] = []
248
+ if self.same_storage and self.archive_access and collection.name == 'file':
249
+ # Separate hot and archive files
250
+ delete_from_archive = [i[0] for i in delete_objects if i[1]]
251
+ delete_from_hot = [i[0] for i in delete_objects if not i[1]]
252
+
253
+ # Check for overlap
254
+ overlap = set(delete_from_archive).intersection(set(delete_from_hot))
255
+ delete_from_archive = list(set(delete_from_archive)-overlap)
256
+ delete_from_hot = list(set(delete_from_hot)-overlap)
257
+
258
+ # Create the original delete_object form the overlap
259
+ delete_objects = [(k, False) for k in overlap]
260
+ delete_objects.extend([(k, True) for k in overlap])
261
+
262
+ if delete_from_hot:
263
+ # Check hot objects to delete if they are in archive
264
+ archived_files = self.datastore.file.multiexists(delete_from_hot, index_type=Index.ARCHIVE)
265
+ delete_objects.extend([(k, False) for k, v in archived_files.items() if not v])
266
+ expire_only.extend([(k, False) for k, v in archived_files.items() if v])
267
+
268
+ if delete_from_archive:
269
+ # Check hot objects to delete if they are in archive
270
+ hot_files = self.datastore.file.multiexists(delete_from_archive, index_type=Index.HOT)
271
+ delete_objects.extend([(k, True) for k, v in hot_files.items() if not v])
272
+ expire_only.extend([(k, True) for k, v in hot_files.items() if v])
209
273
 
210
274
  delete_tasks = self.fs_hashmap[collection.name](delete_objects, final_date)
211
275
 
212
276
  # Proceed with deletion, but only after all the scheduled deletes for this
213
- self.log.info(f"[{collection.name}] Scheduled {len(delete_objects)}/{number_to_delete} files to be removed")
277
+ self.log.info(f"[{collection.name}] Scheduled {len(delete_objects)}/{number_to_delete} files to be "
278
+ f"removed from the {'cachestore' if 'cache' in collection.name else 'filestore'}")
214
279
  self._finish_delete(collection, delete_tasks, expire_only)
215
280
 
216
281
  else:
@@ -264,7 +329,7 @@ class ExpiryManager(ServerBase):
264
329
  will be affected in between start date and the date found"""
265
330
  rows = collection.search(f"expiry_ts: {{{start} TO {final_date}]", rows=1,
266
331
  offset=self.expiry_size - 1, sort='expiry_ts asc',
267
- as_obj=False, fl='expiry_ts')
332
+ as_obj=False, fl='expiry_ts', index_type=self.index_type)
268
333
  if rows['items']:
269
334
  return rows['items'][0]['expiry_ts'], self.expiry_size
270
335
  return final_date, rows['total']
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: assemblyline-core
3
- Version: 4.5.1.dev145
3
+ Version: 4.5.1.dev151
4
4
  Summary: Assemblyline 4 - Core components
5
5
  Home-page: https://github.com/CybercentreCanada/assemblyline-core/
6
6
  Author: CCCS Assemblyline development team
@@ -284,8 +284,8 @@ def test_dispatch_extracted(clean_redis, clean_datastore):
284
284
  @mock.patch('assemblyline_core.dispatching.dispatcher.MetricsFactory', mock.MagicMock())
285
285
  @mock.patch('assemblyline_core.dispatching.dispatcher.Scheduler', DRPScheduler)
286
286
  def test_dispatch_extracted_bypass_drp(clean_redis, clean_datastore):
287
- # Dynamic Recursion Prevention is to prevent services belonging to the 'Dynamic Analysis' from analyzing the children
288
- # of files they've analyzed.
287
+ # Dynamic Recursion Prevention is to prevent services belonging to the 'Dynamic Analysis'
288
+ # from analyzing the children of files they've analyzed.
289
289
 
290
290
  # The bypass should allow services to specify files to run through Dynamic Analysis regardless of the
291
291
  # Dynamic Recursion Prevention parameter.
@@ -372,7 +372,7 @@ def test_dispatch_extracted_bypass_drp(clean_redis, clean_datastore):
372
372
  disp.service_worker(disp.process_queue_index(sid))
373
373
 
374
374
  # 'sandbox' should have a task for the extracted file
375
- #disp.dispatch_file(disp.tasks.get(sid), second_file_hash)
375
+ # disp.dispatch_file(disp.tasks.get(sid), second_file_hash)
376
376
  job = client.request_work('0', 'sandbox', '0')
377
377
  assert job.fileinfo.sha256 == second_file_hash
378
378
  assert job.filename == 'second-*'
@@ -4,6 +4,7 @@ import random
4
4
  import concurrent.futures
5
5
 
6
6
  from assemblyline.common.isotime import now_as_iso
7
+ from assemblyline.datastore.helper import AssemblylineDatastore
7
8
  from assemblyline.odm.randomizer import random_model_obj
8
9
 
9
10
  from assemblyline_core.expiry.run_expiry import ExpiryManager
@@ -14,34 +15,31 @@ expiry_collections_len = {}
14
15
  archive_collections_len = {}
15
16
 
16
17
 
17
- @pytest.fixture(scope='module')
18
- def datastore(archive_connection):
19
- return archive_connection
20
-
21
-
22
- def purge_data(datastore):
23
- for name, definition in datastore.ds.get_models().items():
18
+ def purge_data(datastore_connection: AssemblylineDatastore):
19
+ for name, definition in datastore_connection.ds.get_models().items():
24
20
  if hasattr(definition, 'expiry_ts'):
25
- getattr(datastore, name).wipe()
21
+ getattr(datastore_connection, name).wipe()
26
22
 
27
23
 
28
24
  @pytest.fixture(scope="function")
29
- def ds_expiry(request, datastore):
30
- for name, definition in datastore.ds.get_models().items():
25
+ def ds_expiry(request, datastore_connection):
26
+ for name, definition in datastore_connection.ds.get_models().items():
31
27
  if hasattr(definition, 'expiry_ts'):
32
- collection = getattr(datastore, name)
28
+ collection = getattr(datastore_connection, name)
33
29
  collection.wipe()
34
30
  expiry_len = random.randint(MIN_OBJECTS, MAX_OBJECTS)
35
31
  for x in range(expiry_len):
36
32
  obj = random_model_obj(collection.model_class)
33
+ if hasattr(definition, 'from_archive'):
34
+ obj.from_archive = False
37
35
  obj.expiry_ts = now_as_iso(-10000)
38
36
  collection.save('longer_name'+str(x), obj)
39
37
 
40
38
  expiry_collections_len[name] = expiry_len
41
39
  collection.commit()
42
40
 
43
- request.addfinalizer(lambda: purge_data(datastore))
44
- return datastore
41
+ request.addfinalizer(lambda: purge_data(datastore_connection))
42
+ return datastore_connection
45
43
 
46
44
 
47
45
  class FakeCounter(object):
@@ -58,8 +56,8 @@ class FakeCounter(object):
58
56
  return self.counts.get(name, 0)
59
57
 
60
58
 
61
- def test_expire_all(ds_expiry):
62
- expiry = ExpiryManager()
59
+ def test_expire_all(config, ds_expiry, filestore):
60
+ expiry = ExpiryManager(config=config, datastore=ds_expiry, filestore=filestore)
63
61
  expiry.running = True
64
62
  expiry.counter = FakeCounter()
65
63
  with concurrent.futures.ThreadPoolExecutor(5) as pool:
@@ -1 +0,0 @@
1
- 4.5.1.dev145