assemblyline-core 4.5.0.1__tar.gz → 4.5.1.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of assemblyline-core might be problematic.

Files changed (88)
  1. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/PKG-INFO +1 -1
  2. assemblyline-core-4.5.1.dev0/assemblyline_core/VERSION +1 -0
  3. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/dispatching/client.py +1 -0
  4. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/dispatching/dispatcher.py +315 -63
  5. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/replay/loader/run.py +1 -1
  6. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/scaler/controllers/interface.py +10 -1
  7. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/scaler/controllers/kubernetes_ctl.py +70 -7
  8. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/scaler/scaler_server.py +36 -5
  9. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/updater/helper.py +94 -70
  10. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/updater/run_updater.py +3 -3
  11. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core.egg-info/PKG-INFO +1 -1
  12. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/test/test_scheduler.py +2 -1
  13. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/test/test_simulation.py +209 -7
  14. assemblyline-core-4.5.0.1/assemblyline_core/VERSION +0 -1
  15. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/LICENCE.md +0 -0
  16. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/README.md +0 -0
  17. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/__init__.py +0 -0
  18. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/alerter/__init__.py +0 -0
  19. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/alerter/processing.py +0 -0
  20. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/alerter/run_alerter.py +0 -0
  21. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/archiver/__init__.py +0 -0
  22. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/archiver/run_archiver.py +0 -0
  23. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/badlist_client.py +0 -0
  24. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/dispatching/__init__.py +0 -0
  25. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/dispatching/__main__.py +0 -0
  26. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/dispatching/schedules.py +0 -0
  27. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/dispatching/timeout.py +0 -0
  28. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/expiry/__init__.py +0 -0
  29. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/expiry/run_expiry.py +0 -0
  30. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/ingester/__init__.py +0 -0
  31. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/ingester/__main__.py +0 -0
  32. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/ingester/constants.py +0 -0
  33. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/ingester/ingester.py +0 -0
  34. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/metrics/__init__.py +0 -0
  35. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/metrics/es_metrics.py +0 -0
  36. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/metrics/heartbeat_formatter.py +0 -0
  37. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/metrics/helper.py +0 -0
  38. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/metrics/metrics_server.py +0 -0
  39. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/metrics/run_heartbeat_manager.py +0 -0
  40. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/metrics/run_metrics_aggregator.py +0 -0
  41. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/metrics/run_statistics_aggregator.py +0 -0
  42. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/plumber/__init__.py +0 -0
  43. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/plumber/run_plumber.py +0 -0
  44. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/replay/__init__.py +0 -0
  45. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/replay/client.py +0 -0
  46. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/replay/creator/__init__.py +0 -0
  47. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/replay/creator/run.py +0 -0
  48. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/replay/creator/run_worker.py +0 -0
  49. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/replay/loader/__init__.py +0 -0
  50. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/replay/loader/run_worker.py +0 -0
  51. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/replay/replay.py +0 -0
  52. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/safelist_client.py +0 -0
  53. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/scaler/__init__.py +0 -0
  54. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/scaler/collection.py +0 -0
  55. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/scaler/controllers/__init__.py +0 -0
  56. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/scaler/controllers/docker_ctl.py +0 -0
  57. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/scaler/run_scaler.py +0 -0
  58. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/server_base.py +0 -0
  59. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/signature_client.py +0 -0
  60. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/submission_client.py +0 -0
  61. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/tasking_client.py +0 -0
  62. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/updater/__init__.py +0 -0
  63. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/vacuum/__init__.py +0 -0
  64. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/vacuum/crawler.py +0 -0
  65. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/vacuum/department_map.py +0 -0
  66. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/vacuum/safelist.py +0 -0
  67. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/vacuum/stream_map.py +0 -0
  68. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/vacuum/worker.py +0 -0
  69. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/workflow/__init__.py +0 -0
  70. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core/workflow/run_workflow.py +0 -0
  71. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core.egg-info/SOURCES.txt +0 -0
  72. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core.egg-info/dependency_links.txt +0 -0
  73. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core.egg-info/requires.txt +0 -0
  74. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/assemblyline_core.egg-info/top_level.txt +0 -0
  75. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/setup.cfg +0 -0
  76. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/setup.py +0 -0
  77. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/test/test_alerter.py +0 -0
  78. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/test/test_badlist_client.py +0 -0
  79. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/test/test_dispatcher.py +0 -0
  80. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/test/test_expiry.py +0 -0
  81. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/test/test_plumber.py +0 -0
  82. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/test/test_replay.py +0 -0
  83. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/test/test_safelist_client.py +0 -0
  84. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/test/test_scaler.py +0 -0
  85. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/test/test_signature_client.py +0 -0
  86. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/test/test_vacuum.py +0 -0
  87. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/test/test_worker_ingest.py +0 -0
  88. {assemblyline-core-4.5.0.1 → assemblyline-core-4.5.1.dev0}/test/test_worker_submit.py +0 -0
--- assemblyline-core-4.5.0.1/PKG-INFO
+++ assemblyline-core-4.5.1.dev0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: assemblyline-core
-Version: 4.5.0.1
+Version: 4.5.1.dev0
 Summary: Assemblyline 4 - Core components
 Home-page: https://github.com/CybercentreCanada/assemblyline-core/
 Author: CCCS Assemblyline development team
--- /dev/null
+++ assemblyline-core-4.5.1.dev0/assemblyline_core/VERSION
@@ -0,0 +1 @@
+4.5.1.dev0
--- assemblyline-core-4.5.0.1/assemblyline_core/dispatching/client.py
+++ assemblyline-core-4.5.1.dev0/assemblyline_core/dispatching/client.py
@@ -319,6 +319,7 @@ class DispatchClient:
                 'key': result_key,
                 'drop': result.drop_file,
                 'score': result.result.score,
+                'partial': result.partial,
                 'children': [(r.sha256, r.parent_relation) for r in result.response.extracted],
             },
             'tags': tags,
--- assemblyline-core-4.5.0.1/assemblyline_core/dispatching/dispatcher.py
+++ assemblyline-core-4.5.1.dev0/assemblyline_core/dispatching/dispatcher.py
@@ -42,17 +42,16 @@ from assemblyline.remote.datatypes.queues.named import NamedQueue
 from assemblyline.remote.datatypes.set import ExpiringSet, Set
 from assemblyline.remote.datatypes.user_quota_tracker import UserQuotaTracker
 from assemblyline_core.server_base import ThreadedCoreBase
-from assemblyline_core.alerter.run_alerter import ALERT_QUEUE_NAME
 
+from .schedules import Scheduler
+from .timeout import TimeoutTable
+from ..ingester.constants import COMPLETE_QUEUE_NAME
 
 if TYPE_CHECKING:
     from assemblyline.odm.models.file import File
+    from redis import Redis
 
 
-from .schedules import Scheduler
-from .timeout import TimeoutTable
-from ..ingester.constants import COMPLETE_QUEUE_NAME
-
 APM_SPAN_TYPE = 'handle_message'
 
 AL_SHUTDOWN_GRACE = int(os.environ.get('AL_SHUTDOWN_GRACE', '60'))
@@ -64,6 +63,12 @@ DYNAMIC_ANALYSIS_CATEGORY = 'Dynamic Analysis'
 DAY_IN_SECONDS = 24 * 60 * 60
 
 
+class KeyType(enum.Enum):
+    OVERWRITE = 'overwrite'
+    UNION = 'union'
+    IGNORE = 'ignore'
+
+
 class Action(enum.IntEnum):
     start = 0
     result = 1
@@ -84,6 +89,19 @@ class DispatchAction:
     event: Optional[threading.Event] = dataclasses.field(compare=False, default=None)
 
 
+@dataclasses.dataclass()
+class MonitorTask:
+    """Tracks whether a task needs to be rerun based on changes to temporary data."""
+    # Service name
+    service: str
+    # sha256 of file in question
+    sha: str
+    # The temporary values this task was last dispatched with
+    values: dict[str, Optional[str]]
+    # Should a service be dispatched again when possible
+    dispatch_needed: bool = dataclasses.field(default=False)
+
+
 @contextmanager
 def apm_span(client, span_name: str):
     try:
@@ -99,13 +117,156 @@ def apm_span(client, span_name: str):
 
 
 class ResultSummary:
-    def __init__(self, key, drop, score, children):
+    def __init__(self, key, drop, score, children, partial=False) -> None:
         self.key: str = key
         self.drop: bool = drop
+        self.partial: bool = partial
         self.score: int = score
         self.children: list[tuple[str, str]] = children
 
 
+class TemporaryFileData:
+    def __init__(self, sha256: str) -> None:
+        self.sha256 = sha256
+        self.parents: list[TemporaryFileData] = []
+        self.children: list[TemporaryFileData] = []
+        self.parent_cache: dict[str, Any] = {}
+        self.local_values: dict[str, Any] = {}
+
+    def add_parent(self, parent_temp: TemporaryFileData):
+        """Add a parent to this node."""
+        self.parents.append(parent_temp)
+        parent_temp.children.append(self)
+
+    def new_child(self, child: str) -> TemporaryFileData:
+        """Create a linked entry for a new child."""
+        temp = TemporaryFileData(child)
+        temp.parents.append(self)
+        self.children.append(temp)
+        temp.build_parent_cache()
+        return temp
+
+    def build_parent_cache(self):
+        """Rebuild the cache of data from parent files."""
+        self.parent_cache.clear()
+        for parent in self.parents:
+            self.parent_cache.update(parent.read())
+
+    def read(self) -> dict[str, Any]:
+        """Get a copy of the current data."""
+        # Start with a shallow copy of the parent cache
+        data = dict(self.parent_cache)
+
+        # Update; this overwrites any common keys (we want this)
+        data.update(self.local_values)
+        return data
+
+    def read_key(self, key: str) -> Any:
+        """Get the current value of a single key."""
+        try:
+            return self.local_values[key]
+        except KeyError:
+            return self.parent_cache.get(key)
+
+    def set_value(self, key: str, value: str) -> set[str]:
+        """Using a SET operation update the value on this node and all children.
+
+        Returns a set of the sha256 of all files whose temporary data has been modified.
+        """
+        # If the local value doesn't change we won't have any effect on children
+        old = self.local_values.get(key)
+        if type(old) is type(value) and old == value:
+            return set()
+
+        # Update the local value and recurse into children
+        self.local_values[key] = value
+        changed = [self.sha256]
+        for child in self.children:
+            changed.extend(child.set_value_from_ancestor(key, value))
+        return set(changed)
+
+    def set_value_from_ancestor(self, key: str, value: str) -> set[str]:
+        """Given that an ancestor has changed, test if this file's temporary data will change also."""
+        # If this child has already set this key, the parent values don't matter
+        if key in self.local_values:
+            return set()
+
+        # If the parent value was already set to this nothing has changed
+        old = self.parent_cache.get(key)
+        if type(old) is type(value) and old == value:
+            return set()
+
+        # Update the parent cache and recurse into children
+        self.parent_cache[key] = value
+        changed = [self.sha256]
+        for child in self.children:
+            changed.extend(child.set_value_from_ancestor(key, value))
+        return set(changed)
+
+    def union_value(self, key: str, value: set[str]) -> set[str]:
+        """Using a MERGE operation update the value on this node and all children.
+
+        Returns a set of the sha256 of all files whose temporary data has been modified.
+        """
+        if not value:
+            return set()
+
+        # If the local value doesn't change we won't have any effect on children
+        new_value = merge_in_values(self.local_values.get(key), value)
+        if new_value is None:
+            return set()
+
+        # Update the local value and recurse into children
+        self.local_values[key] = new_value
+        changed = [self.sha256]
+        for child in self.children:
+            changed.extend(child.union_value_from_ancestor(key, value))
+        return set(changed)
+
+    def union_value_from_ancestor(self, key: str, value: set[str]) -> set[str]:
+        """Given that an ancestor has changed, test if this file's temporary data will change also.
+
+        For values updated by union the parent and local values are the same.
+        """
+        # Merge the data into the parent cache; we won't be reading from it, but we still
+        # want to keep it up to date and use it to check if changes are needed
+        new_value = merge_in_values(self.parent_cache.get(key), value)
+        if new_value is None:
+            return set()
+        self.parent_cache[key] = new_value
+
+        # Update the local values as well if we need to
+        new_value = merge_in_values(self.local_values.get(key), value)
+        if new_value is None:
+            return set()
+        self.local_values[key] = new_value
+
+        # Since we did change the local value, pass the new set down to children
+        changed = [self.sha256]
+        for child in self.children:
+            changed.extend(child.union_value_from_ancestor(key, value))
+        return set(changed)
+
+
+def merge_in_values(old_values: Any, new_values: set[str]) -> Optional[list[str]]:
+    """Merge new values into a json list.
+
+    If there are no new values return None.
+    """
+    # Read out the old value set
+    if isinstance(old_values, (list, set)):
+        old_values = set(old_values)
+    else:
+        old_values = set()
+
+    # If we have no new values to merge in
+    if new_values <= old_values:
+        return None
+
+    # We have new values, build a new set
+    return list(new_values | old_values)
+
+
 class SubmissionTask:
     """Dispatcher internal model for submissions"""
@@ -126,12 +287,13 @@ class SubmissionTask:
         self.file_schedules: dict[str, list[dict[str, Service]]] = {}
         self.file_tags: dict[str, dict[str, dict[str, Any]]] = defaultdict(dict)
         self.file_depth: dict[str, int] = {}
-        self.file_temporary_data: dict[str, dict] = defaultdict(dict)
+        self.temporary_data: dict[str, TemporaryFileData] = {}
         self.extra_errors: list[str] = []
         self.active_files: set[str] = set()
         self.dropped_files: set[str] = set()
         self.dynamic_recursion_bypass: set[str] = set()
         self.service_logs: dict[tuple[str, str], list[str]] = defaultdict(list)
+        self.monitoring: dict[tuple[str, str], MonitorTask] = {}
 
         # mapping from file hash to a set of services that shouldn't be run on
         # any children (recursively) of that file
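The TemporaryFileData tree introduced above replaces the old flat per-file dict: each file keeps its own local_values, caches inherited values in parent_cache, and pushes changes down to children. A minimal usage sketch, assuming the 4.5.1.dev0 package is installed so the class can be imported (the sha256 strings and key names are made up for illustration):

    from assemblyline_core.dispatching.dispatcher import TemporaryFileData

    root = TemporaryFileData('a' * 64)             # the submission's root file
    root.local_values['passwords'] = ['infected']
    child = root.new_child('b' * 64)               # an extracted file

    print(child.read_key('passwords'))             # ['infected'], served from the parent cache
    changed = root.set_value('password_hint', 'on the label')
    print(len(changed))                            # 2: both files' visible data changed
    print(child.read_key('password_hint'))         # 'on the label'

Note that set_value and union_value return the set of sha256 values whose visible data changed; that return value is what feeds the monitoring checks added later in this diff.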
@@ -178,15 +340,15 @@ class SubmissionTask:
         children_detail: list[tuple[str, str]] = [(r['sha256'], r['parent_relation']) for r in extracted]
         self.service_results[(sha256, service)] = ResultSummary(
             key=k, drop=result['drop_file'], score=result['result']['score'],
-            children=children_detail)
+            children=children_detail, partial=result.get('partial', False))
 
         tags = Result(result).scored_tag_dict()
-        for key in tags.keys():
+        for key, tag in tags.items():
             if key in self.file_tags[sha256].keys():
                 # Sum score of already known tags
-                self.file_tags[sha256][key]['score'] += tags[key]['score']
+                self.file_tags[sha256][key]['score'] += tag['score']
             else:
-                self.file_tags[sha256][key] = tags[key]
+                self.file_tags[sha256][key] = tag
 
         if errors is not None:
             for e in errors:
@@ -195,6 +357,7 @@ class SubmissionTask:
 
     @property
     def sid(self) -> str:
+        """Shortcut to read submission SID"""
         return self.submission.sid
 
     def forbid_for_children(self, sha256: str, service_name: str):
@@ -206,16 +369,23 @@ class SubmissionTask:
 
     def register_children(self, parent: str, children: list[str]):
         """
-        Note for the purposes of dynamic recursion prevention which
-        files extracted other files.
+        Note which files extracted other files.
+        _parent_map is for dynamic recursion prevention
+        temporary_data is for cascading the temp data to children
         """
+        parent_temp = self.temporary_data[parent]
         for child in children:
+            try:
+                self.temporary_data[child].add_parent(parent_temp)
+            except KeyError:
+                self.temporary_data[child] = parent_temp.new_child(child)
             try:
                 self._parent_map[child].add(parent)
             except KeyError:
                 self._parent_map[child] = {parent}
 
     def all_ancestors(self, sha256: str) -> list[str]:
+        """Collect all the known ancestors of the given file within this submission."""
        visited = set()
        to_visit = [sha256]
        while len(to_visit) > 0:
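register_children now does double duty: besides the recursion-prevention map, it links each extracted file into the temporary-data tree, creating a node via new_child the first time a file is seen and only adding a second parent via add_parent when the same file is extracted again. A hypothetical sketch of that second path; build_parent_cache is called explicitly here because add_parent alone does not refresh the inherited values:

    from assemblyline_core.dispatching.dispatcher import TemporaryFileData

    parent_a = TemporaryFileData('a' * 64)
    parent_b = TemporaryFileData('b' * 64)
    parent_a.local_values['from_a'] = 1
    parent_b.local_values['from_b'] = 2

    shared = parent_a.new_child('c' * 64)   # first extraction creates the node
    shared.add_parent(parent_b)             # second extraction only links it
    shared.build_parent_cache()             # refresh values inherited from both

    print(shared.read())                    # {'from_a': 1, 'from_b': 2}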
@@ -239,6 +409,64 @@ class SubmissionTask:
             for parent in self.all_ancestors(sha256)
         ]))
 
+    def set_monitoring_entry(self, sha256: str, service_name: str, values: dict[str, Optional[str]]):
+        """A service with monitoring has dispatched, keep track of the conditions."""
+        self.monitoring[(sha256, service_name)] = MonitorTask(
+            service=service_name,
+            sha=sha256,
+            values=values,
+        )
+
+    def partial_result(self, sha256, service_name):
+        """Note that a partial result has been received. If a dispatch was requested process that now."""
+        try:
+            entry = self.monitoring[(sha256, service_name)]
+        except KeyError:
+            return
+
+        if entry.dispatch_needed:
+            self.redispatch_service(sha256, service_name)
+
+    def clear_monitoring_entry(self, sha256, service_name):
+        """A service has completed normally. If the service is monitoring clear out the record."""
+        # We have an incoming non-partial result, flush out any partial monitoring
+        self.monitoring.pop((sha256, service_name), None)
+        # If there is a partial result for this service flush that as well so we accept this new result
+        result = self.service_results.get((sha256, service_name))
+        if result and result.partial:
+            self.service_results.pop((sha256, service_name), None)
+
+    def file_temporary_data_changed(self, changed_sha256: set[str], key: str) -> list[str]:
+        """Check all of the monitored tasks on that key for changes. Redispatch as needed."""
+        changed = []
+        for (sha256, service), entry in self.monitoring.items():
+            if sha256 not in changed_sha256:
+                continue
+
+            value = self.temporary_data[sha256].read_key(key)
+            dispatched_value = entry.values.get(key)
+
+            if type(value) is not type(dispatched_value) or value != dispatched_value:
+                result = self.service_results.get((sha256, service))
+                if not result:
+                    entry.dispatch_needed = True
+                else:
+                    self.redispatch_service(sha256, service)
+                    changed.append(sha256)
+        return changed
+
+    def redispatch_service(self, sha256, service_name):
+        # Clear the result if it's partial or an error
+        result = self.service_results.get((sha256, service_name))
+        if result and not result.partial:
+            return
+        self.service_results.pop((sha256, service_name), None)
+        self.service_errors.pop((sha256, service_name), None)
+        self.service_attempts[(sha256, service_name)] = 1
+
+        # Try to get the service to run again by resetting the schedule for that service
+        self.file_schedules.pop(sha256, None)
+
 
 DISPATCH_TASK_ASSIGNMENT = 'dispatcher-tasks-assigned-to-'
 TASK_ASSIGNMENT_PATTERN = DISPATCH_TASK_ASSIGNMENT + '*'
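These methods form the re-dispatch loop: set_monitoring_entry snapshots the temporary values a service was dispatched with, file_temporary_data_changed compares them with the current values, and a mismatch either marks dispatch_needed (when the service is still running) or redispatches immediately. A standalone sketch of the comparison step, using the same type-then-value test as the code above; the MonitorTask here is a local stand-in, not an import, and the key name is invented:

    import dataclasses
    from typing import Optional

    @dataclasses.dataclass()
    class MonitorTask:
        service: str
        sha: str
        values: dict[str, Optional[str]]
        dispatch_needed: bool = dataclasses.field(default=False)

    entry = MonitorTask(service='Extract', sha='a' * 64, values={'passwords': None})
    current_value = ['infected']        # temporary data after another service ran
    dispatched_value = entry.values.get('passwords')

    # A type mismatch (list vs None) counts as a change, just like a value mismatch
    if type(current_value) is not type(dispatched_value) or current_value != dispatched_value:
        entry.dispatch_needed = True    # rerun once the pending result arrives
    print(entry.dispatch_needed)        # True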
@@ -267,7 +495,7 @@ SUBMISSION_TOTAL_TIMEOUT = 60 * 20
 
 class Dispatcher(ThreadedCoreBase):
     @staticmethod
-    def all_instances(persistent_redis):
+    def all_instances(persistent_redis: Redis):
         return Hash(DISPATCH_DIRECTORY, host=persistent_redis).keys()
 
     @staticmethod
@@ -287,7 +515,7 @@ class Dispatcher(ThreadedCoreBase):
     }
 
     def __init__(self, datastore=None, redis=None, redis_persist=None, logger=None,
-                 config=None, counter_name='dispatcher'):
+                 config=None, counter_name: str = 'dispatcher'):
         super().__init__('assemblyline.dispatcher', config=config, datastore=datastore,
                          redis=redis, redis_persist=redis_persist, logger=logger)
 
@@ -297,8 +525,7 @@ class Dispatcher(ThreadedCoreBase):
         self.finalizing = threading.Event()
         self.finalizing_start = 0.0
 
-        #
-        # # Build some utility classes
+        # Build some utility classes
         self.scheduler = Scheduler(self.datastore, self.config, self.redis)
         self.running_tasks = Hash(DISPATCH_RUNNING_TASK_HASH, host=self.redis)
         self.scaler_timeout_queue = NamedQueue(SCALER_TIMEOUT_QUEUE, host=self.redis_persist)
@@ -321,12 +548,12 @@ class Dispatcher(ThreadedCoreBase):
         self.ingester_scanning = Hash('m-scanning-table', self.redis_persist)
 
         # Communications queues
-        self.start_queue = NamedQueue(DISPATCH_START_EVENTS+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
-        self.result_queue = NamedQueue(DISPATCH_RESULT_QUEUE+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
-        self.command_queue = NamedQueue(DISPATCH_COMMAND_QUEUE+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
-
-        # Submissions that should have alerts generated
-        self.alert_queue = NamedQueue(ALERT_QUEUE_NAME, self.redis_persist)
+        self.start_queue: NamedQueue[tuple[str, str, str, str]] =\
+            NamedQueue(DISPATCH_START_EVENTS+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
+        self.result_queue: NamedQueue[dict] =\
+            NamedQueue(DISPATCH_RESULT_QUEUE+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
+        self.command_queue: NamedQueue[dict] =\
+            NamedQueue(DISPATCH_COMMAND_QUEUE+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
 
         # Publish counters to the metrics sink.
         self.counter = MetricsFactory(metrics_type='dispatcher', schema=Metrics, name=counter_name,
@@ -387,11 +614,16 @@ class Dispatcher(ThreadedCoreBase):
         _q = self.find_process_queue(sid)
         _q.put(DispatchAction(kind=Action.check_submission, sid=sid))
 
-    def _handle_service_change_event(self, data: ServiceChange):
+    def _handle_service_change_event(self, data: Optional[ServiceChange]):
+        if not data:
+            # We may have missed change messages, flush cache
+            self.scheduler.c12n_services.clear()
+            return
         if data.operation == Operation.Removed:
             # Remove all current instances of service from scheduler cache
-            [service_set.remove(data.name) for service_set in self.scheduler.c12n_services.values()
-             if data.name in service_set]
+            for service_set in self.scheduler.c12n_services.values():
+                if data.name in service_set:
+                    service_set.remove(data.name)
         else:
             # If Added/Modified, pull the service information and modify cache
             service: Service = self.datastore.get_service_with_delta(data.name)
@@ -451,7 +683,7 @@ class Dispatcher(ThreadedCoreBase):
         # If the dispatcher is exiting cleanly remove as many tasks from the service queues as we can
         service_queues = {}
         for task in self.tasks.values():
-            for (sha256, service_name), dispatch_key in task.queue_keys.items():
+            for (_sha256, service_name), dispatch_key in task.queue_keys.items():
                 try:
                     s_queue = service_queues[service_name]
                 except KeyError:
@@ -537,7 +769,7 @@ class Dispatcher(ThreadedCoreBase):
             return
 
         if not self.active_submissions.exists(sid):
-            self.log.info(f"[{sid}] New submission received")
+            self.log.info("[%s] New submission received", sid)
             self.active_submissions.add(sid, {
                 'completed_queue': task.completed_queue,
                 'submission': submission.as_primitives()
@@ -558,9 +790,10 @@ class Dispatcher(ThreadedCoreBase):
             self.log.info(f"[{sid}] Submission counts towards {submission.params.submitter.upper()} quota")
 
         # Apply initial data parameter
+        temporary_data = task.temporary_data[sha256] = TemporaryFileData(sha256)
         if submission.params.initial_data:
             try:
-                task.file_temporary_data[sha256] = {
+                temporary_data.local_values = {
                     key: value
                     for key, value in dict(json.loads(submission.params.initial_data)).items()
                     if len(str(value)) <= self.config.submission.max_temp_data_length
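For reference, initial_data arrives as a JSON string on the submission parameters and becomes the root file's local_values, with oversized entries dropped. A small illustration of the filter above; max_temp_data_length and the key names are stand-ins for the configured value and real submission data:

    import json

    initial_data = json.dumps({'passwords': ['infected'], 'blob': 'x' * 10000})
    max_temp_data_length = 4096   # stand-in for config.submission.max_temp_data_length

    local_values = {
        key: value
        for key, value in dict(json.loads(initial_data)).items()
        if len(str(value)) <= max_temp_data_length
    }
    print(sorted(local_values))   # ['passwords'], 'blob' was too long and is dropped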
@@ -578,7 +811,7 @@ class Dispatcher(ThreadedCoreBase):
         # Initialize ancestry chain by identifying the root file
         file_info = self.get_fileinfo(task, sha256)
         file_type = file_info.type if file_info else 'NOT_FOUND'
-        task.file_temporary_data[sha256]['ancestry'] = [[dict(type=file_type, parent_relation="ROOT", sha256=sha256)]]
+        temporary_data.local_values['ancestry'] = [[dict(type=file_type, parent_relation="ROOT", sha256=sha256)]]
 
         # Start the file dispatching
         task.active_files.add(sha256)
@@ -587,6 +820,7 @@ class Dispatcher(ThreadedCoreBase):
 
     @elasticapm.capture_span(span_type='dispatcher')
     def get_fileinfo(self, task: SubmissionTask, sha256: str) -> Optional[FileInfo]:
+        """Read information about a file from the database, caching it locally."""
         # First try to get the info from local cache
         file_info = task.file_info.get(sha256, None)
         if file_info:
@@ -740,9 +974,12 @@ class Dispatcher(ThreadedCoreBase):
         tags = list(task.file_tags.get(sha256, {}).values())
 
         # Load the temp submission data we will pass
-        temp_data = {}
+        temp_data: dict[str, str] = {}
         if service.uses_temp_submission_data:
-            temp_data = task.file_temporary_data[sha256]
+            temp_data = task.temporary_data[sha256].read()
+            if service.monitored_keys:
+                values = {key: temp_data.get(key) for key in service.monitored_keys}
+                task.set_monitoring_entry(sha256, service.name, values)
 
         # Load the metadata we will pass
         metadata = {}
@@ -910,10 +1147,10 @@ class Dispatcher(ThreadedCoreBase):
             if self.dispatch_file(task, file_hash):
                 return True
         elif processing_files:
-            self.log.debug(f"[{task.submission.sid}] Not finished waiting on {len(processing_files)} "
-                           f"files: {list(processing_files)}")
+            self.log.debug("[%s] Not finished waiting on %d files: %s",
+                           task.submission.sid, len(processing_files), list(processing_files))
         else:
-            self.log.debug(f"[{task.submission.sid}] Finalizing submission.")
+            self.log.debug("[%s] Finalizing submission.", task.submission.sid)
             max_score = max(file_scores.values()) if file_scores else 0  # Submissions with no results have no score
             if self.tasks.pop(task.sid, None):
                 self.finalize_queue.put((task, max_score, checked))
@@ -1237,6 +1474,12 @@ class Dispatcher(ThreadedCoreBase):
         self.clear_timeout(task, sha256, service_name)
         task.service_logs.pop((sha256, service_name), None)
 
+        if summary.partial:
+            self.log.info("[%s/%s] %s returned partial results", sid, sha256, service_name)
+            task.partial_result(sha256, service_name)
+        else:
+            task.clear_monitoring_entry(sha256, service_name)
+
         # Don't process duplicates
         if (sha256, service_name) in task.service_results:
             return
@@ -1258,8 +1501,8 @@ class Dispatcher(ThreadedCoreBase):
         if isinstance(tags, list):
             self.log.warning("Deprecation: Old format of tags found. "
                              "This format changed with the release of 4.3 on 09-2022. "
-                             f"Rebuilding {service_name} may be required or the result of a cache hit. "
-                             "Proceeding with conversion to compatible format..")
+                             "Rebuilding %s may be required or the result of a cache hit. "
+                             "Proceeding with conversion to compatible format..", service_name)
             alt_tags = {}
             for t in tags:
                 key = f"{t['type']}:{t['value']}"
@@ -1274,11 +1517,6 @@ class Dispatcher(ThreadedCoreBase):
             else:
                 task.file_tags[sha256][key] = value
 
-        # Update the temporary data table for this file
-        for key, value in (temporary_data or {}).items():
-            if len(str(value)) <= self.config.submission.max_temp_data_length:
-                task.file_temporary_data[sha256][key] = value
-
         # Update children to include parent_relation, likely EXTRACTED
         if summary.children and isinstance(summary.children[0], str):
             old_children = typing.cast(list[str], summary.children)
@@ -1288,6 +1526,19 @@ class Dispatcher(ThreadedCoreBase):
         task.service_results[(sha256, service_name)] = summary
         task.register_children(sha256, [c for c, _ in summary.children])
 
+        # Update the temporary data table for this file
+        force_redispatch = set()
+        update_operations = self.config.submission.temporary_keys
+        for key, value in (temporary_data or {}).items():
+            if len(str(value)) <= self.config.submission.max_temp_data_length:
+                if update_operations.get(key) == KeyType.UNION:
+                    changed_files = task.temporary_data[sha256].union_value(key, value)
+                elif update_operations.get(key) == KeyType.IGNORE:
+                    changed_files = set()
+                else:
+                    changed_files = task.temporary_data[sha256].set_value(key, value)
+                force_redispatch |= set(task.file_temporary_data_changed(changed_files, key))
+
         # Set the depth of all extracted files, even if we won't be processing them
         depth_limit = self.config.submission.max_extraction_depth
         new_depth = task.file_depth[sha256] + 1
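Each temporary key a service returns is now applied through the operation configured in temporary_keys: UNION merges into the existing value, IGNORE drops the value entirely, and anything else overwrites it. A standalone sketch of the three policies; apply_policy is a hypothetical helper written for illustration, not dispatcher code:

    import enum
    from typing import Any, Optional

    class KeyType(enum.Enum):
        OVERWRITE = 'overwrite'
        UNION = 'union'
        IGNORE = 'ignore'

    def apply_policy(policy: Optional[KeyType], old: Any, new: Any) -> Any:
        if policy == KeyType.IGNORE:
            return old                                # service value is discarded
        if policy == KeyType.UNION:
            return sorted(set(old or []) | set(new))  # merge list-like values
        return new                                    # default: overwrite

    print(apply_policy(None, 'a', 'b'))               # b
    print(apply_policy(KeyType.UNION, ['a'], ['b']))  # ['a', 'b']
    print(apply_policy(KeyType.IGNORE, 'a', 'b'))     # a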
@@ -1303,7 +1554,7 @@ class Dispatcher(ThreadedCoreBase):
         if new_depth < depth_limit:
             # Prepare the temporary data from the parent to build the temporary data table for
             # these newly extracted files
-            parent_data = task.file_temporary_data[sha256]
+            parent_data = task.temporary_data[sha256]
 
             for extracted_sha256, parent_relation in summary.children:
 
@@ -1311,7 +1562,7 @@ class Dispatcher(ThreadedCoreBase):
                     continue
 
                 if len(task.active_files) > submission.params.max_extracted:
-                    self.log.info(f'[{sid}] hit extraction limit, dropping {extracted_sha256}')
+                    self.log.info('[%s] hit extraction limit, dropping %s', sid, extracted_sha256)
                     task.dropped_files.add(extracted_sha256)
                     self._dispatching_error(task, Error({
                         'archive_ts': None,
@@ -1332,21 +1583,20 @@ class Dispatcher(ThreadedCoreBase):
 
                 dispatched += 1
                 task.active_files.add(extracted_sha256)
-                try:
-                    parent_ancestry = parent_data['ancestry']
-                except KeyError:
-                    self.log.warn(f"[{sid} :: {sha256}] missing ancestry data.")
-                    parent_ancestry = []
-                existing_ancestry = task.file_temporary_data.get(extracted_sha256, {}).get('ancestry', [])
+
+                # Get the new ancestry data
                 file_info = self.get_fileinfo(task, extracted_sha256)
                 file_type = file_info.type if file_info else 'NOT_FOUND'
                 current_ancestry_node = dict(type=file_type, parent_relation=parent_relation,
                                              sha256=extracted_sha256)
 
-                task.file_temporary_data[extracted_sha256] = dict(parent_data)
-                task.file_temporary_data[extracted_sha256]['ancestry'] = existing_ancestry
-                [task.file_temporary_data[extracted_sha256]['ancestry'].append(ancestry + [current_ancestry_node])
-                 for ancestry in parent_ancestry]
+                # Update ancestry data
+                parent_ancestry = parent_data.read_key('ancestry') or []
+                existing_ancestry = task.temporary_data[extracted_sha256].local_values.setdefault('ancestry', [])
+                for ancestry in parent_ancestry:
+                    existing_ancestry.append(ancestry + [current_ancestry_node])
+
+                # Trigger the processing of the extracted file
                 self.find_process_queue(sid).put(DispatchAction(kind=Action.dispatch_file, sid=sid,
                                                                 sha=extracted_sha256))
             else:
@@ -1369,13 +1619,15 @@ class Dispatcher(ThreadedCoreBase):
 
         # Check if it's worth trying to run the next stage
         # Not worth running if we know we are waiting for another service
-        if any(_s == sha256 for _s, _ in task.running_services):
-            return
+        if not any(_s == sha256 for _s, _ in task.running_services):
+            force_redispatch.add(sha256)
         # Not worth running if we know we have services in queue
-        if any(_s == sha256 for _s, _ in task.queue_keys.keys()):
-            return
+        if not any(_s == sha256 for _s, _ in task.queue_keys.keys()):
+            force_redispatch.add(sha256)
+
         # Try to run the next stage
-        self.dispatch_file(task, sha256)
+        for sha256 in force_redispatch:
+            self.dispatch_file(task, sha256)
 
     @elasticapm.capture_span(span_type='dispatcher')
     def _dispatching_error(self, task: SubmissionTask, error):
@@ -1658,13 +1910,13 @@ class Dispatcher(ThreadedCoreBase):
 
     @elasticapm.capture_span(span_type='dispatcher')
     def list_outstanding(self, sid: str, queue_name: str):
-        response_queue = NamedQueue(queue_name, host=self.redis)
+        response_queue: NamedQueue[dict] = NamedQueue(queue_name, host=self.redis)
         outstanding: defaultdict[str, int] = defaultdict(int)
         task = self.tasks.get(sid)
         if task:
-            for sha, service_name in list(task.queue_keys.keys()):
+            for _sha, service_name in list(task.queue_keys.keys()):
                 outstanding[service_name] += 1
-            for sha, service_name in list(task.running_services):
+            for _sha, service_name in list(task.running_services):
                 outstanding[service_name] += 1
         response_queue.push(outstanding)
 
@@ -1679,7 +1931,7 @@ class Dispatcher(ThreadedCoreBase):
         error_tasks = []
 
         # iterate running tasks
-        for task_key, task_body in self.running_tasks:
+        for _task_key, task_body in self.running_tasks:
             task = ServiceTask(task_body)
             # It's a bad task if its dispatcher isn't running
             if task.metadata['dispatcher__'] not in dispatcher_instances:
--- assemblyline-core-4.5.0.1/assemblyline_core/replay/loader/run.py
+++ assemblyline-core-4.5.1.dev0/assemblyline_core/replay/loader/run.py
@@ -84,8 +84,8 @@ class ReplayLoader(ReplayBase):
         self.maintain_threads(threads)
 
     def stop(self):
+        super().stop()
         self.cache.close()
-        return super().stop()
 
 
 if __name__ == '__main__':