assemblyline-core 4.5.0.26__tar.gz → 4.5.0.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of assemblyline-core might be problematic. Click here for more details.

Files changed (88) hide show
  1. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/PKG-INFO +1 -1
  2. assemblyline-core-4.5.0.28/assemblyline_core/VERSION +1 -0
  3. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/archiver/run_archiver.py +4 -3
  4. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/dispatching/client.py +11 -0
  5. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/expiry/run_expiry.py +113 -48
  6. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/scaler/controllers/kubernetes_ctl.py +4 -2
  7. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/scaler/scaler_server.py +10 -6
  8. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/tasking_client.py +9 -4
  9. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/updater/run_updater.py +6 -3
  10. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core.egg-info/PKG-INFO +1 -1
  11. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/test/test_dispatcher.py +42 -8
  12. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/test/test_expiry.py +13 -15
  13. assemblyline-core-4.5.0.26/assemblyline_core/VERSION +0 -1
  14. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/LICENCE.md +0 -0
  15. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/README.md +0 -0
  16. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/__init__.py +0 -0
  17. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/alerter/__init__.py +0 -0
  18. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/alerter/processing.py +0 -0
  19. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/alerter/run_alerter.py +0 -0
  20. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/archiver/__init__.py +0 -0
  21. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/badlist_client.py +0 -0
  22. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/dispatching/__init__.py +0 -0
  23. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/dispatching/__main__.py +0 -0
  24. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/dispatching/dispatcher.py +0 -0
  25. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/dispatching/schedules.py +0 -0
  26. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/dispatching/timeout.py +0 -0
  27. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/expiry/__init__.py +0 -0
  28. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/ingester/__init__.py +0 -0
  29. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/ingester/__main__.py +0 -0
  30. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/ingester/constants.py +0 -0
  31. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/ingester/ingester.py +0 -0
  32. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/metrics/__init__.py +0 -0
  33. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/metrics/es_metrics.py +0 -0
  34. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/metrics/heartbeat_formatter.py +0 -0
  35. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/metrics/helper.py +0 -0
  36. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/metrics/metrics_server.py +0 -0
  37. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/metrics/run_heartbeat_manager.py +0 -0
  38. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/metrics/run_metrics_aggregator.py +0 -0
  39. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/metrics/run_statistics_aggregator.py +0 -0
  40. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/plumber/__init__.py +0 -0
  41. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/plumber/run_plumber.py +0 -0
  42. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/replay/__init__.py +0 -0
  43. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/replay/client.py +0 -0
  44. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/replay/creator/__init__.py +0 -0
  45. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/replay/creator/run.py +0 -0
  46. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/replay/creator/run_worker.py +0 -0
  47. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/replay/loader/__init__.py +0 -0
  48. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/replay/loader/run.py +0 -0
  49. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/replay/loader/run_worker.py +0 -0
  50. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/replay/replay.py +0 -0
  51. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/safelist_client.py +0 -0
  52. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/scaler/__init__.py +0 -0
  53. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/scaler/collection.py +0 -0
  54. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/scaler/controllers/__init__.py +0 -0
  55. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/scaler/controllers/docker_ctl.py +0 -0
  56. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/scaler/controllers/interface.py +0 -0
  57. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/scaler/run_scaler.py +0 -0
  58. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/server_base.py +0 -0
  59. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/signature_client.py +0 -0
  60. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/submission_client.py +0 -0
  61. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/updater/__init__.py +0 -0
  62. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/updater/helper.py +0 -0
  63. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/vacuum/__init__.py +0 -0
  64. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/vacuum/crawler.py +0 -0
  65. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/vacuum/department_map.py +0 -0
  66. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/vacuum/safelist.py +0 -0
  67. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/vacuum/stream_map.py +0 -0
  68. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/vacuum/worker.py +0 -0
  69. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/workflow/__init__.py +0 -0
  70. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core/workflow/run_workflow.py +0 -0
  71. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core.egg-info/SOURCES.txt +0 -0
  72. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core.egg-info/dependency_links.txt +0 -0
  73. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core.egg-info/requires.txt +0 -0
  74. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/assemblyline_core.egg-info/top_level.txt +0 -0
  75. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/setup.cfg +0 -0
  76. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/setup.py +0 -0
  77. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/test/test_alerter.py +0 -0
  78. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/test/test_badlist_client.py +0 -0
  79. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/test/test_plumber.py +0 -0
  80. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/test/test_replay.py +0 -0
  81. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/test/test_safelist_client.py +0 -0
  82. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/test/test_scaler.py +0 -0
  83. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/test/test_scheduler.py +0 -0
  84. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/test/test_signature_client.py +0 -0
  85. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/test/test_simulation.py +0 -0
  86. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/test/test_vacuum.py +0 -0
  87. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/test/test_worker_ingest.py +0 -0
  88. {assemblyline-core-4.5.0.26 → assemblyline-core-4.5.0.28}/test/test_worker_submit.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: assemblyline-core
3
- Version: 4.5.0.26
3
+ Version: 4.5.0.28
4
4
  Summary: Assemblyline 4 - Core components
5
5
  Home-page: https://github.com/CybercentreCanada/assemblyline-core/
6
6
  Author: CCCS Assemblyline development team
@@ -0,0 +1 @@
1
+ 4.5.0.28
@@ -64,7 +64,7 @@ class Archiver(ServerBase):
64
64
  try:
65
65
  if len(message) == 3:
66
66
  archive_type, type_id, delete_after = message
67
- metadata = None
67
+ metadata = {}
68
68
  use_alternate_dtl = False
69
69
  elif len(message) == 4:
70
70
  archive_type, type_id, delete_after, metadata = message
@@ -90,8 +90,9 @@ class Archiver(ServerBase):
90
90
  submission, version = self.datastore.submission.get_if_exists(type_id, version=True)
91
91
 
92
92
  # If we have metadata passed in the message, we need to apply it before archiving the submission
93
- if metadata and self.config.core.archiver.use_metadata:
94
- submission.metadata.update({f"archive.{k}": v for k, v in metadata.items()})
93
+ if metadata and self.config.submission.metadata.archive:
94
+ submission.metadata.update({k: v for k, v in metadata.items()
95
+ if k not in submission.metadata})
95
96
  self.datastore.submission.save(type_id, submission, version=version)
96
97
 
97
98
  break
@@ -279,6 +279,17 @@ class DispatchClient:
279
279
  else:
280
280
  result.expiry_ts = None
281
281
  try:
282
+ if self.ds.result.exists(result_key):
283
+ # A result already exists for this key
284
+ # Regenerate entire result key based on result and modified task (ignore caching)
285
+ task.ignore_cache = True
286
+ result_key = Result.help_build_key(sha256=task.fileinfo.sha256,
287
+ service_name=result.response.service_name,
288
+ service_version=result.response.service_version,
289
+ service_tool_version=result.response.service_tool_version,
290
+ is_empty=False,
291
+ task=task)
292
+ version = "create"
282
293
  self.ds.result.save(result_key, result, version=version)
283
294
  break
284
295
  except VersionConflictException as vce:
@@ -4,38 +4,48 @@ from __future__ import annotations
4
4
  import concurrent.futures
5
5
  import threading
6
6
  import functools
7
- import elasticapm
8
7
  import time
9
-
10
8
  from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, Future, as_completed
11
9
  from concurrent.futures.process import BrokenProcessPool
12
- from datemath import dm
13
10
  from typing import Callable, Optional, TYPE_CHECKING
14
11
 
15
- from assemblyline.common.isotime import epoch_to_iso, now_as_iso
16
- from assemblyline.datastore.collection import Index
12
+ import elasticapm
13
+ from datemath import dm
14
+
17
15
  from assemblyline_core.server_base import ServerBase
18
16
  from assemblyline_core.dispatching.dispatcher import BAD_SID_HASH
19
17
  from assemblyline.common import forge
18
+ from assemblyline.common.isotime import epoch_to_iso, now_as_iso
20
19
  from assemblyline.common.metrics import MetricsFactory
21
20
  from assemblyline.filestore import FileStore
22
21
  from assemblyline.odm.messages.expiry_heartbeat import Metrics
23
22
  from assemblyline.remote.datatypes import get_client
23
+ from assemblyline.datastore.collection import Index
24
24
  from assemblyline.remote.datatypes.set import Set
25
25
 
26
26
  if TYPE_CHECKING:
27
27
  from assemblyline.datastore.collection import ESCollection
28
28
 
29
29
 
30
- def file_delete_worker(logger, filestore_urls, file_batch) -> list[str]:
30
+ def file_delete_worker(logger, filestore_urls, file_batch, archive_filestore_urls=None) -> list[tuple[str, bool]]:
31
31
  try:
32
32
  filestore = FileStore(*filestore_urls)
33
-
34
- def filestore_delete(sha256: str) -> Optional[str]:
35
- filestore.delete(sha256)
36
- if not filestore.exists(sha256):
37
- return sha256
38
- return None
33
+ if archive_filestore_urls and filestore_urls != archive_filestore_urls:
34
+ archivestore = FileStore(*archive_filestore_urls)
35
+ else:
36
+ archivestore = filestore
37
+
38
+ def filestore_delete(item: tuple[str, bool]) -> tuple[Optional[str], Optional[bool]]:
39
+ sha256, from_archive = item
40
+ if from_archive:
41
+ archivestore.delete(sha256)
42
+ if not archivestore.exists(sha256):
43
+ return sha256, True
44
+ else:
45
+ filestore.delete(sha256)
46
+ if not filestore.exists(sha256):
47
+ return sha256, False
48
+ return None, None
39
49
 
40
50
  return _file_delete_worker(logger, filestore_delete, file_batch)
41
51
 
@@ -44,8 +54,11 @@ def file_delete_worker(logger, filestore_urls, file_batch) -> list[str]:
44
54
  return []
45
55
 
46
56
 
47
- def _file_delete_worker(logger, delete_action: Callable[[str], Optional[str]], file_batch) -> list[str]:
48
- finished_files: list[str] = []
57
+ ActionSignature = Callable[[tuple[str, bool]], tuple[Optional[str], Optional[bool]]]
58
+
59
+
60
+ def _file_delete_worker(logger, delete_action: ActionSignature, file_batch) -> list[tuple[str, bool]]:
61
+ finished_files: list[tuple[str, bool]] = []
49
62
  try:
50
63
  futures = []
51
64
 
@@ -55,9 +68,9 @@ def _file_delete_worker(logger, delete_action: Callable[[str], Optional[str]], f
55
68
 
56
69
  for future in as_completed(futures):
57
70
  try:
58
- erased_name = future.result()
59
- if erased_name:
60
- finished_files.append(erased_name)
71
+ erased_name, from_archive = future.result()
72
+ if erased_name and from_archive is not None:
73
+ finished_files.append((erased_name, from_archive))
61
74
  except Exception as error:
62
75
  logger.exception("Error in filestore worker: " + str(error))
63
76
 
@@ -67,17 +80,29 @@ def _file_delete_worker(logger, delete_action: Callable[[str], Optional[str]], f
67
80
 
68
81
 
69
82
  class ExpiryManager(ServerBase):
70
- def __init__(self, redis_persist=None):
71
- self.config = forge.get_config()
83
+ def __init__(self, redis_persist=None, datastore=None, filestore=None, config=None, classification=None):
84
+ self.config = config or forge.get_config()
72
85
 
73
86
  super().__init__('assemblyline.expiry', shutdown_timeout=self.config.core.expiry.sleep_time + 5)
74
- self.datastore = forge.get_datastore(config=self.config)
75
- self.filestore = forge.get_filestore(config=self.config)
76
- self.classification = forge.get_classification()
87
+
88
+ # Set Archive related configs
89
+ if self.config.datastore.archive.enabled:
90
+ self.archive_access = True
91
+ self.index_type = Index.HOT_AND_ARCHIVE
92
+ else:
93
+ self.archive_access = False
94
+ self.index_type = Index.HOT
95
+
96
+ self.datastore = datastore or forge.get_datastore(config=self.config, archive_access=self.archive_access)
97
+ self.filestore = filestore or forge.get_filestore(config=self.config)
98
+ self.classification = classification or forge.get_classification()
77
99
  self.expirable_collections: list[ESCollection] = []
78
100
  self.counter = MetricsFactory('expiry', Metrics)
79
101
  self.file_delete_worker = ProcessPoolExecutor(self.config.core.expiry.delete_workers)
80
- self.same_storage = self.config.filestore.storage == self.config.filestore.archive
102
+ if self.config.filestore.archive:
103
+ self.same_storage = self.config.filestore.storage == self.config.filestore.archive
104
+ else:
105
+ self.same_storage = True
81
106
  self.current_submission_cleanup = set()
82
107
 
83
108
  self.redis_persist = redis_persist or get_client(
@@ -127,14 +152,15 @@ class ExpiryManager(ServerBase):
127
152
  def filestore_delete(self, file_batch, _):
128
153
  return self.file_delete_worker.submit(file_delete_worker, logger=self.log,
129
154
  filestore_urls=list(self.config.filestore.storage),
130
- file_batch=file_batch)
155
+ file_batch=file_batch,
156
+ archive_filestore_urls=list(self.config.filestore.archive))
131
157
 
132
158
  def cachestore_delete(self, file_batch, _):
133
159
  return self.file_delete_worker.submit(file_delete_worker, logger=self.log,
134
160
  filestore_urls=list(self.config.filestore.cache),
135
161
  file_batch=file_batch)
136
162
 
137
- def _finish_delete(self, collection: ESCollection, task: Future, expire_only: list[str]):
163
+ def _finish_delete(self, collection: ESCollection, task: Future, expire_only: list[tuple[str, bool]]):
138
164
  # Wait until the worker process finishes deleting files
139
165
  file_list: list[str] = []
140
166
  while self.running:
@@ -145,25 +171,41 @@ class ExpiryManager(ServerBase):
145
171
  except concurrent.futures.TimeoutError:
146
172
  pass
147
173
 
148
- file_list.extend(expire_only)
149
-
150
- # build a batch delete job for all the removed files
151
- bulk = collection.get_bulk_plan()
152
- for sha256 in file_list:
153
- bulk.add_delete_operation(sha256)
154
-
155
- if len(file_list) > 0:
174
+ if file_list:
156
175
  self.log.info(f'[{collection.name}] Deleted associated files from the '
157
176
  f'{"cachestore" if "cache" in collection.name else "filestore"}...')
158
- collection.bulk(bulk)
159
- self.counter.increment(f'{collection.name}', increment_by=len(file_list))
160
- self.log.info(f"[{collection.name}] Deleted {len(file_list)} items from the datastore...")
161
177
  else:
178
+ self.log.info(f'[{collection.name}] Nothing was deleted from the '
179
+ f'{"cachestore" if "cache" in collection.name else "filestore"}...')
180
+
181
+ # From the files to be deleted, check which are from the hot index
182
+ hot_file_list = [x[0] for x in file_list if not x[1]]
183
+ hot_file_list.extend([x[0] for x in expire_only if not x[1]])
184
+
185
+ # From the files to be deleted, check which are from the archive index
186
+ archive_file_list = [x[0] for x in file_list if x[1]]
187
+ archive_file_list.extend([x[0] for x in expire_only if x[1]])
188
+
189
+ for cur_file_list, index_type in [(hot_file_list, Index.HOT), (archive_file_list, Index.ARCHIVE)]:
190
+ if not cur_file_list:
191
+ # Nothing to delete from this index type
192
+ continue
193
+
194
+ # build a batch delete job for all the removed files
195
+ bulk = collection.get_bulk_plan(index_type=index_type)
196
+ for sha256 in cur_file_list:
197
+ bulk.add_delete_operation(sha256)
198
+
199
+ collection.bulk(bulk)
200
+ self.counter.increment(f'{collection.name}', increment_by=len(cur_file_list))
201
+ self.log.info(f"[{collection.name}] Deleted {len(cur_file_list)} items from the datastore...")
202
+
203
+ if not hot_file_list and not archive_file_list:
162
204
  self.log.warning(f'[{collection.name}] Expiry unable to clean up any of the files in filestore.')
163
205
 
164
- def _simple_delete(self, collection, delete_query, number_to_delete):
206
+ def _simple_delete(self, collection: ESCollection, delete_query, number_to_delete):
165
207
  self.heartbeat()
166
- collection.delete_by_query(delete_query)
208
+ collection.delete_by_query(delete_query, index_type=self.index_type)
167
209
  self.counter.increment(f'{collection.name}', increment_by=number_to_delete)
168
210
  self.log.info(f"[{collection.name}] Deleted {number_to_delete} items from the datastore...")
169
211
 
@@ -195,22 +237,45 @@ class ExpiryManager(ServerBase):
195
237
  # check if we are dealing with an index that needs file cleanup
196
238
  if self.config.core.expiry.delete_storage and collection.name in self.fs_hashmap:
197
239
  # Delete associated files
198
- delete_objects: list[str] = []
199
- for item in collection.stream_search(delete_query, fl='id', as_obj=False):
240
+ delete_objects: list[tuple[str, bool]] = []
241
+ for item in collection.stream_search(
242
+ delete_query, fl='id,from_archive', as_obj=False, index_type=self.index_type):
200
243
  self.heartbeat()
201
- delete_objects.append(item['id'])
244
+ delete_objects.append((item['id'], item.get('from_archive', False)))
202
245
 
203
246
  # Filter archived documents if archive filestore is the same as the filestore
204
- expire_only = []
205
- if self.same_storage and self.config.datastore.archive.enabled and collection.name == 'file':
206
- archived_files = self.datastore.file.multiexists(delete_objects, index_type=Index.ARCHIVE)
207
- delete_objects = [k for k, v in archived_files.items() if not v]
208
- expire_only = [k for k, v in archived_files.items() if v]
247
+ expire_only: list[tuple[str, bool]] = []
248
+ if self.same_storage and self.archive_access and collection.name == 'file':
249
+ # Separate hot and archive files
250
+ delete_from_archive = [i[0] for i in delete_objects if i[1]]
251
+ delete_from_hot = [i[0] for i in delete_objects if not i[1]]
252
+
253
+ # Check for overlap
254
+ overlap = set(delete_from_archive).intersection(set(delete_from_hot))
255
+ delete_from_archive = list(set(delete_from_archive)-overlap)
256
+ delete_from_hot = list(set(delete_from_hot)-overlap)
257
+
258
+ # Create the original delete_object from the overlap
259
+ delete_objects = [(k, False) for k in overlap]
260
+ delete_objects.extend([(k, True) for k in overlap])
261
+
262
+ if delete_from_hot:
263
+ # Check hot objects to delete if they are in archive
264
+ archived_files = self.datastore.file.multiexists(delete_from_hot, index_type=Index.ARCHIVE)
265
+ delete_objects.extend([(k, False) for k, v in archived_files.items() if not v])
266
+ expire_only.extend([(k, False) for k, v in archived_files.items() if v])
267
+
268
+ if delete_from_archive:
269
+ # Check archive objects to delete if they are in hot
270
+ hot_files = self.datastore.file.multiexists(delete_from_archive, index_type=Index.HOT)
271
+ delete_objects.extend([(k, True) for k, v in hot_files.items() if not v])
272
+ expire_only.extend([(k, True) for k, v in hot_files.items() if v])
209
273
 
210
274
  delete_tasks = self.fs_hashmap[collection.name](delete_objects, final_date)
211
275
 
212
276
  # Proceed with deletion, but only after all the scheduled deletes for this
213
- self.log.info(f"[{collection.name}] Scheduled {len(delete_objects)}/{number_to_delete} files to be removed")
277
+ self.log.info(f"[{collection.name}] Scheduled {len(delete_objects)}/{number_to_delete} files to be "
278
+ f"removed from the {'cachestore' if 'cache' in collection.name else 'filestore'}")
214
279
  self._finish_delete(collection, delete_tasks, expire_only)
215
280
 
216
281
  else:
@@ -264,7 +329,7 @@ class ExpiryManager(ServerBase):
264
329
  will be affected in between start date and the date found"""
265
330
  rows = collection.search(f"expiry_ts: {{{start} TO {final_date}]", rows=1,
266
331
  offset=self.expiry_size - 1, sort='expiry_ts asc',
267
- as_obj=False, fl='expiry_ts')
332
+ as_obj=False, fl='expiry_ts', index_type=self.index_type)
268
333
  if rows['items']:
269
334
  return rows['items'][0]['expiry_ts'], self.expiry_size
270
335
  return final_date, rows['total']
@@ -26,7 +26,7 @@ from kubernetes.client import V1Deployment, V1DeploymentSpec, V1PodTemplateSpec,
26
26
  V1PersistentVolumeClaimSpec, V1NetworkPolicy, V1NetworkPolicySpec, V1NetworkPolicyEgressRule, V1NetworkPolicyPeer, \
27
27
  V1NetworkPolicyIngressRule, V1Secret, V1SecretVolumeSource, V1LocalObjectReference, V1Service, \
28
28
  V1ServiceSpec, V1ServicePort, V1PodSecurityContext, V1Probe, V1ExecAction, V1SecurityContext, \
29
- V1Affinity, V1NodeAffinity, V1NodeSelector, V1NodeSelectorTerm, V1NodeSelectorRequirement
29
+ V1Affinity, V1NodeAffinity, V1NodeSelector, V1NodeSelectorTerm, V1NodeSelectorRequirement, V1Toleration
30
30
  from kubernetes.client.rest import ApiException
31
31
  from assemblyline.odm.models.service import DependencyConfig, DockerConfig, PersistentVolume
32
32
 
@@ -241,7 +241,7 @@ def parse_cpu(string: str) -> float:
241
241
  class KubernetesController(ControllerInterface):
242
242
  def __init__(self, logger, namespace: str, prefix: str, priority: str, dependency_priority: str,
243
243
  cpu_reservation: float, linux_node_selector: Selector, labels=None, log_level="INFO", core_env={},
244
- default_service_account=None, cluster_pod_list=True):
244
+ default_service_account=None, cluster_pod_list=True, default_service_tolerations = []):
245
245
  # Try loading a kubernetes connection from either the fact that we are running
246
246
  # inside of a cluster, or have a config file that tells us how
247
247
  try:
@@ -285,6 +285,7 @@ class KubernetesController(ControllerInterface):
285
285
  self._service_limited_env: dict[str, dict[str, str]] = defaultdict(dict)
286
286
  self.default_service_account: Optional[str] = default_service_account
287
287
  self.cluster_pod_list = cluster_pod_list
288
+ self.default_service_tolerations = [V1Toleration(**toleration.as_primitives()) for toleration in default_service_tolerations]
288
289
 
289
290
  # A record of previously reported events so that we don't report the same message repeatedly, fill it with
290
291
  # existing messages so we don't have a huge dump of duplicates on restart
@@ -849,6 +850,7 @@ class KubernetesController(ControllerInterface):
849
850
  security_context=V1PodSecurityContext(fs_group=1000),
850
851
  service_account_name=service_account,
851
852
  affinity=selector_to_node_affinity(self.linux_node_selector),
853
+ tolerations=self.default_service_tolerations
852
854
  )
853
855
 
854
856
  if use_pull_secret:
@@ -285,11 +285,13 @@ class ScalerServer(ThreadedCoreBase):
285
285
  'privilege': 'service'
286
286
  }
287
287
 
288
+ service_defaults_config = self.config.core.scaler.service_defaults
289
+
288
290
  # If Scaler has envs that set service-server env, then that should override configured values
289
291
  if SERVICE_API_HOST:
290
- self.config.core.scaler.service_defaults.environment = \
292
+ service_defaults_config.environment = \
291
293
  [EnvironmentVariable(dict(name="SERVICE_API_HOST", value=SERVICE_API_HOST))] + \
292
- [env for env in self.config.core.scaler.service_defaults.environment if env.name != "SERVICE_API_HOST"]
294
+ [env for env in service_defaults_config.environment if env.name != "SERVICE_API_HOST"]
293
295
 
294
296
  if self.config.core.scaler.additional_labels:
295
297
  labels.update({k: v for k, v in (_l.split("=") for _l in self.config.core.scaler.additional_labels)})
@@ -304,7 +306,9 @@ class ScalerServer(ThreadedCoreBase):
304
306
  log_level=self.config.logging.log_level,
305
307
  core_env=core_env,
306
308
  cluster_pod_list=self.config.core.scaler.cluster_pod_list,
307
- default_service_account=self.config.services.service_account)
309
+ default_service_account=self.config.services.service_account,
310
+ default_service_tolerations=service_defaults_config.tolerations
311
+ )
308
312
 
309
313
  # Add global configuration for privileged services
310
314
  self.controller.add_config_mount(KUBERNETES_AL_CONFIG, config_map=KUBERNETES_AL_CONFIG, key="config",
@@ -313,7 +317,7 @@ class ScalerServer(ThreadedCoreBase):
313
317
  # If we're passed an override for server-server and it's defining an HTTPS connection, then add a global
314
318
  # mount for the Root CA that needs to be mounted
315
319
  if INTERNAL_ENCRYPT:
316
- self.config.core.scaler.service_defaults.mounts.append(Mount(dict(
320
+ service_defaults_config.mounts.append(Mount(dict(
317
321
  name="root-ca",
318
322
  path="/etc/assemblyline/ssl/al_root-ca.crt",
319
323
  resource_type="secret",
@@ -322,7 +326,7 @@ class ScalerServer(ThreadedCoreBase):
322
326
  )))
323
327
 
324
328
  # Add default mounts for (non-)privileged services
325
- for mount in self.config.core.scaler.service_defaults.mounts:
329
+ for mount in service_defaults_config.mounts:
326
330
  # Deprecated configuration for mounting ConfigMap
327
331
  # TODO: Deprecate code on next major change
328
332
  if mount.config_map:
@@ -365,7 +369,7 @@ class ScalerServer(ThreadedCoreBase):
365
369
  if CLASSIFICATION_HOST_PATH:
366
370
  self.controller.global_mounts.append((CLASSIFICATION_HOST_PATH, '/etc/assemblyline/classification.yml'))
367
371
 
368
- for mount in self.config.core.scaler.service_defaults.mounts:
372
+ for mount in service_defaults_config.mounts:
369
373
  # Mounts are all storage-based since there's no equivalent to ConfigMaps in Docker
370
374
  if mount.privileged_only:
371
375
  self.controller.core_mounts.append((mount.name, mount.path))
@@ -91,7 +91,7 @@ class TaskingClient:
91
91
  self.event_listener.stop()
92
92
 
93
93
  @elasticapm.capture_span(span_type='tasking_client')
94
- def upload_file(self, file_path, classification, ttl, is_section_image, expected_sha256=None):
94
+ def upload_file(self, file_path, classification, ttl, is_section_image, is_supplementary, expected_sha256=None):
95
95
  # Identify the file info of the uploaded file
96
96
  file_info = self.identify.fileinfo(file_path)
97
97
 
@@ -105,8 +105,12 @@ class TaskingClient:
105
105
  file_info['expiry_ts'] = None
106
106
 
107
107
  # Update the datastore with the uploaded file
108
- self.datastore.save_or_freshen_file(file_info['sha256'], file_info, file_info['expiry_ts'],
109
- file_info['classification'], is_section_image=is_section_image)
108
+ self.datastore.save_or_freshen_file(
109
+ file_info['sha256'],
110
+ file_info, file_info['expiry_ts'],
111
+ file_info['classification'],
112
+ is_section_image=is_section_image,
113
+ is_supplementary=is_supplementary)
110
114
 
111
115
  # Upload file to the filestore (upload already checks if the file exists)
112
116
  self.filestore.upload(file_path, file_info['sha256'])
@@ -349,7 +353,8 @@ class TaskingClient:
349
353
  file_info['classification'] = item['classification']
350
354
  self.datastore.save_or_freshen_file(item['sha256'], file_info,
351
355
  file_info['expiry_ts'], file_info['classification'],
352
- is_section_image=item.get('is_section_image', False))
356
+ is_section_image=item.get('is_section_image', False),
357
+ is_supplementary=item.get('is_supplementary', False))
353
358
  return False
354
359
 
355
360
  if task.ttl:
@@ -15,7 +15,7 @@ import docker
15
15
 
16
16
  from kubernetes.client import V1Job, V1ObjectMeta, V1JobSpec, V1PodTemplateSpec, V1PodSpec, V1Volume, \
17
17
  V1VolumeMount, V1EnvVar, V1Container, V1ResourceRequirements, \
18
- V1ConfigMapVolumeSource, V1Secret, V1SecretVolumeSource, V1LocalObjectReference
18
+ V1ConfigMapVolumeSource, V1Secret, V1SecretVolumeSource, V1LocalObjectReference, V1Toleration
19
19
  from kubernetes import client, config
20
20
  from kubernetes.client.rest import ApiException
21
21
 
@@ -148,7 +148,7 @@ class DockerUpdateInterface:
148
148
 
149
149
  class KubernetesUpdateInterface:
150
150
  def __init__(self, logger, prefix, namespace, priority_class, extra_labels, linux_node_selector: Selector,
151
- log_level="INFO", default_service_account=None):
151
+ log_level="INFO", default_service_account=None, default_service_tolerations=[]):
152
152
  # Try loading a kubernetes connection from either the fact that we are running
153
153
  # inside of a cluster, or we have a configuration in the normal location
154
154
  try:
@@ -181,6 +181,8 @@ class KubernetesUpdateInterface:
181
181
  self.default_service_account = default_service_account
182
182
  self.secret_env = []
183
183
  self.linux_node_selector = linux_node_selector
184
+ self.default_service_tolerations = [V1Toleration(**toleration.as_primitives()) for toleration in default_service_tolerations]
185
+
184
186
 
185
187
  # Get the deployment of this process. Use that information to fill out the secret info
186
188
  deployment = self.apps_api.read_namespaced_deployment(name='updater', namespace=self.namespace)
@@ -465,7 +467,8 @@ class ServiceUpdater(ThreadedCoreBase):
465
467
  extra_labels=extra_labels,
466
468
  log_level=self.config.logging.log_level,
467
469
  default_service_account=self.config.services.service_account,
468
- linux_node_selector=self.config.core.scaler.linux_node_selector)
470
+ linux_node_selector=self.config.core.scaler.linux_node_selector,
471
+ default_service_tolerations=self.config.core.scaler.service_defaults.tolerations)
469
472
  # Add all additional mounts to privileged services
470
473
  self.mounts = self.config.core.scaler.service_defaults.mounts
471
474
  else:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: assemblyline-core
3
- Version: 4.5.0.26
3
+ Version: 4.5.0.28
4
4
  Summary: Assemblyline 4 - Core components
5
5
  Home-page: https://github.com/CybercentreCanada/assemblyline-core/
6
6
  Author: CCCS Assemblyline development team
@@ -14,8 +14,8 @@ from assemblyline.odm.randomizer import random_model_obj, random_minimal_obj, ge
14
14
  from assemblyline.odm import models
15
15
  from assemblyline.common.metrics import MetricsFactory
16
16
 
17
- from assemblyline_core.dispatching.client import DispatchClient
18
- from assemblyline_core.dispatching.dispatcher import Dispatcher, Submission
17
+ from assemblyline_core.dispatching.client import DispatchClient, DISPATCH_RESULT_QUEUE
18
+ from assemblyline_core.dispatching.dispatcher import Dispatcher, ServiceTask, Submission
19
19
  from assemblyline_core.dispatching.schedules import Scheduler as RealScheduler
20
20
 
21
21
  # noinspection PyUnresolvedReferences
@@ -123,7 +123,7 @@ def test_simple(clean_redis, clean_datastore):
123
123
  user: User = random_model_obj(User)
124
124
  ds.user.save(user.uname, user)
125
125
 
126
- sub: Submission = random_model_obj(models.submission.Submission)
126
+ sub: Submission = random_model_obj(Submission)
127
127
  sub.sid = sid = 'first-submission'
128
128
  sub.params.ignore_cache = False
129
129
  sub.params.max_extracted = 5
@@ -242,7 +242,7 @@ def test_dispatch_extracted(clean_redis, clean_datastore):
242
242
  ds.file.save(fh, obj)
243
243
 
244
244
  # Inject the fake submission
245
- submission = random_model_obj(models.submission.Submission)
245
+ submission = random_model_obj(Submission)
246
246
  submission.to_be_deleted = False
247
247
  submission.files = [dict(name='./file', sha256=file_hash)]
248
248
  submission.params.submitter = user.uname
@@ -284,8 +284,8 @@ def test_dispatch_extracted(clean_redis, clean_datastore):
284
284
  @mock.patch('assemblyline_core.dispatching.dispatcher.MetricsFactory', mock.MagicMock())
285
285
  @mock.patch('assemblyline_core.dispatching.dispatcher.Scheduler', DRPScheduler)
286
286
  def test_dispatch_extracted_bypass_drp(clean_redis, clean_datastore):
287
- # Dynamic Recursion Prevention is to prevent services belonging to the 'Dynamic Analysis' from analyzing the children
288
- # of files they've analyzed.
287
+ # Dynamic Recursion Prevention is to prevent services belonging to the 'Dynamic Analysis'
288
+ # from analyzing the children of files they've analyzed.
289
289
 
290
290
  # The bypass should allow services to specify files to run through Dynamic Analysis regardless of the
291
291
  # Dynamic Recursion Prevention parameter.
@@ -308,7 +308,7 @@ def test_dispatch_extracted_bypass_drp(clean_redis, clean_datastore):
308
308
  ds.file.save(fh, obj)
309
309
 
310
310
  # Inject the fake submission
311
- submission = random_model_obj(models.submission.Submission)
311
+ submission = random_model_obj(Submission)
312
312
  submission.to_be_deleted = False
313
313
  submission.params.ignore_dynamic_recursion_prevention = False
314
314
  submission.params.services.selected = ['extract', 'sandbox']
@@ -372,7 +372,7 @@ def test_dispatch_extracted_bypass_drp(clean_redis, clean_datastore):
372
372
  disp.service_worker(disp.process_queue_index(sid))
373
373
 
374
374
  # 'sandbox' should have a task for the extracted file
375
- #disp.dispatch_file(disp.tasks.get(sid), second_file_hash)
375
+ # disp.dispatch_file(disp.tasks.get(sid), second_file_hash)
376
376
  job = client.request_work('0', 'sandbox', '0')
377
377
  assert job.fileinfo.sha256 == second_file_hash
378
378
  assert job.filename == 'second-*'
@@ -417,3 +417,37 @@ def test_timeout():
417
417
 
418
418
  # Expire nothing
419
419
  assert len(table.timeouts()) == 0
420
+
421
+
422
+ def test_prevent_result_overwrite(clean_redis, clean_datastore):
423
+ client = DispatchClient(clean_datastore, clean_redis, clean_redis)
424
+ dispatcher_name = "test"
425
+ result_queue = client._get_queue_from_cache(DISPATCH_RESULT_QUEUE + dispatcher_name)
426
+
427
+ # Create a task and add it to set of running tasks
428
+ task = random_model_obj(ServiceTask)
429
+ task.metadata['dispatcher__'] = dispatcher_name
430
+
431
+ # Create a result that's not "empty"
432
+ result = random_model_obj(Result)
433
+ result.response.service_name = task.service_name
434
+ result.sha256 = task.fileinfo.sha256
435
+ result.result.score = 1
436
+ result_key = result.build_key()
437
+
438
+ # Submit result to be saved
439
+ client.running_tasks.add(task.key(), task.as_primitives())
440
+ client.service_finished(task.sid, result_key, result)
441
+
442
+ # Pop result from queue, we expect to get the same result key as earlier
443
+ message = result_queue.pop(blocking=False)
444
+ msg_result_key = message['result_summary']['key']
445
+ assert msg_result_key == result_key
446
+
447
+ # Save the same result again but we expect to be saved under another key
448
+ client.running_tasks.add(task.key(), task.as_primitives())
449
+ client.service_finished(task.sid, result_key, result)
450
+ message = result_queue.pop(blocking=False)
451
+ msg_result_key = message['result_summary']['key']
452
+
453
+ assert msg_result_key != result_key
@@ -4,6 +4,7 @@ import random
4
4
  import concurrent.futures
5
5
 
6
6
  from assemblyline.common.isotime import now_as_iso
7
+ from assemblyline.datastore.helper import AssemblylineDatastore
7
8
  from assemblyline.odm.randomizer import random_model_obj
8
9
 
9
10
  from assemblyline_core.expiry.run_expiry import ExpiryManager
@@ -14,34 +15,31 @@ expiry_collections_len = {}
14
15
  archive_collections_len = {}
15
16
 
16
17
 
17
- @pytest.fixture(scope='module')
18
- def datastore(archive_connection):
19
- return archive_connection
20
-
21
-
22
- def purge_data(datastore):
23
- for name, definition in datastore.ds.get_models().items():
18
+ def purge_data(datastore_connection: AssemblylineDatastore):
19
+ for name, definition in datastore_connection.ds.get_models().items():
24
20
  if hasattr(definition, 'expiry_ts'):
25
- getattr(datastore, name).wipe()
21
+ getattr(datastore_connection, name).wipe()
26
22
 
27
23
 
28
24
  @pytest.fixture(scope="function")
29
- def ds_expiry(request, datastore):
30
- for name, definition in datastore.ds.get_models().items():
25
+ def ds_expiry(request, datastore_connection):
26
+ for name, definition in datastore_connection.ds.get_models().items():
31
27
  if hasattr(definition, 'expiry_ts'):
32
- collection = getattr(datastore, name)
28
+ collection = getattr(datastore_connection, name)
33
29
  collection.wipe()
34
30
  expiry_len = random.randint(MIN_OBJECTS, MAX_OBJECTS)
35
31
  for x in range(expiry_len):
36
32
  obj = random_model_obj(collection.model_class)
33
+ if hasattr(definition, 'from_archive'):
34
+ obj.from_archive = False
37
35
  obj.expiry_ts = now_as_iso(-10000)
38
36
  collection.save('longer_name'+str(x), obj)
39
37
 
40
38
  expiry_collections_len[name] = expiry_len
41
39
  collection.commit()
42
40
 
43
- request.addfinalizer(lambda: purge_data(datastore))
44
- return datastore
41
+ request.addfinalizer(lambda: purge_data(datastore_connection))
42
+ return datastore_connection
45
43
 
46
44
 
47
45
  class FakeCounter(object):
@@ -58,8 +56,8 @@ class FakeCounter(object):
58
56
  return self.counts.get(name, 0)
59
57
 
60
58
 
61
- def test_expire_all(ds_expiry):
62
- expiry = ExpiryManager()
59
+ def test_expire_all(config, ds_expiry, filestore):
60
+ expiry = ExpiryManager(config=config, datastore=ds_expiry, filestore=filestore)
63
61
  expiry.running = True
64
62
  expiry.counter = FakeCounter()
65
63
  with concurrent.futures.ThreadPoolExecutor(5) as pool:
@@ -1 +0,0 @@
1
- 4.5.0.26