assemblyline-core 4.5.0.64__tar.gz → 4.5.0.66__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of assemblyline-core might be problematic. Click here for more details.

Files changed (88) hide show
  1. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/PKG-INFO +1 -1
  2. assemblyline-core-4.5.0.66/assemblyline_core/VERSION +1 -0
  3. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/dispatching/client.py +1 -0
  4. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/dispatching/dispatcher.py +249 -70
  5. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/tasking_client.py +1 -2
  6. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core.egg-info/PKG-INFO +1 -1
  7. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_scheduler.py +2 -1
  8. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_simulation.py +221 -8
  9. assemblyline-core-4.5.0.64/assemblyline_core/VERSION +0 -1
  10. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/LICENCE.md +0 -0
  11. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/README.md +0 -0
  12. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/__init__.py +0 -0
  13. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/alerter/__init__.py +0 -0
  14. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/alerter/processing.py +0 -0
  15. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/alerter/run_alerter.py +0 -0
  16. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/archiver/__init__.py +0 -0
  17. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/archiver/run_archiver.py +0 -0
  18. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/badlist_client.py +0 -0
  19. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/dispatching/__init__.py +0 -0
  20. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/dispatching/__main__.py +0 -0
  21. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/dispatching/schedules.py +0 -0
  22. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/dispatching/timeout.py +0 -0
  23. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/expiry/__init__.py +0 -0
  24. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/expiry/run_expiry.py +0 -0
  25. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/ingester/__init__.py +0 -0
  26. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/ingester/__main__.py +0 -0
  27. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/ingester/constants.py +0 -0
  28. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/ingester/ingester.py +0 -0
  29. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/metrics/__init__.py +0 -0
  30. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/metrics/es_metrics.py +0 -0
  31. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/metrics/heartbeat_formatter.py +0 -0
  32. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/metrics/helper.py +0 -0
  33. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/metrics/metrics_server.py +0 -0
  34. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/metrics/run_heartbeat_manager.py +0 -0
  35. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/metrics/run_metrics_aggregator.py +0 -0
  36. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/metrics/run_statistics_aggregator.py +0 -0
  37. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/plumber/__init__.py +0 -0
  38. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/plumber/run_plumber.py +0 -0
  39. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/replay/__init__.py +0 -0
  40. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/replay/client.py +0 -0
  41. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/replay/creator/__init__.py +0 -0
  42. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/replay/creator/run.py +0 -0
  43. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/replay/creator/run_worker.py +0 -0
  44. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/replay/loader/__init__.py +0 -0
  45. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/replay/loader/run.py +0 -0
  46. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/replay/loader/run_worker.py +0 -0
  47. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/replay/replay.py +0 -0
  48. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/safelist_client.py +0 -0
  49. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/scaler/__init__.py +0 -0
  50. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/scaler/collection.py +0 -0
  51. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/scaler/controllers/__init__.py +0 -0
  52. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/scaler/controllers/docker_ctl.py +0 -0
  53. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/scaler/controllers/interface.py +0 -0
  54. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/scaler/controllers/kubernetes_ctl.py +0 -0
  55. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/scaler/run_scaler.py +0 -0
  56. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/scaler/scaler_server.py +0 -0
  57. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/server_base.py +0 -0
  58. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/signature_client.py +0 -0
  59. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/submission_client.py +0 -0
  60. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/updater/__init__.py +0 -0
  61. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/updater/helper.py +0 -0
  62. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/updater/run_updater.py +0 -0
  63. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/vacuum/__init__.py +0 -0
  64. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/vacuum/crawler.py +0 -0
  65. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/vacuum/department_map.py +0 -0
  66. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/vacuum/safelist.py +0 -0
  67. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/vacuum/stream_map.py +0 -0
  68. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/vacuum/worker.py +0 -0
  69. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/workflow/__init__.py +0 -0
  70. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/workflow/run_workflow.py +0 -0
  71. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core.egg-info/SOURCES.txt +0 -0
  72. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core.egg-info/dependency_links.txt +0 -0
  73. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core.egg-info/requires.txt +0 -0
  74. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core.egg-info/top_level.txt +0 -0
  75. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/setup.cfg +0 -0
  76. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/setup.py +0 -0
  77. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_alerter.py +0 -0
  78. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_badlist_client.py +0 -0
  79. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_dispatcher.py +0 -0
  80. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_expiry.py +0 -0
  81. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_plumber.py +0 -0
  82. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_replay.py +0 -0
  83. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_safelist_client.py +0 -0
  84. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_scaler.py +0 -0
  85. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_signature_client.py +0 -0
  86. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_vacuum.py +0 -0
  87. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_worker_ingest.py +0 -0
  88. {assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_worker_submit.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: assemblyline-core
3
- Version: 4.5.0.64
3
+ Version: 4.5.0.66
4
4
  Summary: Assemblyline 4 - Core components
5
5
  Home-page: https://github.com/CybercentreCanada/assemblyline-core/
6
6
  Author: CCCS Assemblyline development team
@@ -0,0 +1 @@
1
+ 4.5.0.66
@@ -325,6 +325,7 @@ class DispatchClient:
325
325
  'key': result_key,
326
326
  'drop': result.drop_file,
327
327
  'score': result.result.score,
328
+ 'partial': result.partial,
328
329
  'children': [(r.sha256, r.parent_relation) for r in result.response.extracted],
329
330
  },
330
331
  'tags': tags,
@@ -11,6 +11,7 @@ import json
11
11
  import enum
12
12
  from queue import PriorityQueue, Empty, Queue
13
13
  import dataclasses
14
+ from copy import deepcopy
14
15
 
15
16
  import elasticapm
16
17
 
@@ -42,17 +43,16 @@ from assemblyline.remote.datatypes.queues.named import NamedQueue
42
43
  from assemblyline.remote.datatypes.set import ExpiringSet, Set
43
44
  from assemblyline.remote.datatypes.user_quota_tracker import UserQuotaTracker
44
45
  from assemblyline_core.server_base import ThreadedCoreBase
45
- from assemblyline_core.alerter.run_alerter import ALERT_QUEUE_NAME
46
46
 
47
+ from .schedules import Scheduler
48
+ from .timeout import TimeoutTable
49
+ from ..ingester.constants import COMPLETE_QUEUE_NAME
47
50
 
48
51
  if TYPE_CHECKING:
49
52
  from assemblyline.odm.models.file import File
53
+ from redis import Redis
50
54
 
51
55
 
52
- from .schedules import Scheduler
53
- from .timeout import TimeoutTable
54
- from ..ingester.constants import COMPLETE_QUEUE_NAME
55
-
56
56
  APM_SPAN_TYPE = 'handle_message'
57
57
 
58
58
  AL_SHUTDOWN_GRACE = int(os.environ.get('AL_SHUTDOWN_GRACE', '60'))
@@ -66,6 +66,11 @@ DAY_IN_SECONDS = 24 * 60 * 60
66
66
  DYNAMIC_ANALYSIS_CATEGORY = 'Dynamic Analysis'
67
67
 
68
68
 
69
+ class KeyType(enum.Enum):
70
+ OVERWRITE = 'overwrite'
71
+ UNION = 'union'
72
+
73
+
69
74
  class Action(enum.IntEnum):
70
75
  start = 0
71
76
  result = 1
@@ -86,6 +91,19 @@ class DispatchAction:
86
91
  event: Optional[threading.Event] = dataclasses.field(compare=False, default=None)
87
92
 
88
93
 
94
+ @dataclasses.dataclass()
95
+ class MonitorTask:
96
+ """Tracks whether a task needs to be rerun based on changes to the temporary data values it was dispatched with."""
97
+ # Service name
98
+ service: str
99
+ # sha256 of file in question
100
+ sha: str
101
+ # The temporary values this task was last dispatched with
102
+ values: dict[str, Optional[str]]
103
+ # Should the service be dispatched again when possible
104
+ dispatch_needed: bool = dataclasses.field(default=False)
105
+
106
+
89
107
  @contextmanager
90
108
  def apm_span(client, span_name: str):
91
109
  try:
@@ -101,13 +119,83 @@ def apm_span(client, span_name: str):
101
119
 
102
120
 
103
121
  class ResultSummary:
104
- def __init__(self, key, drop, score, children):
122
+ def __init__(self, key, drop, score, children, partial=False) -> None:
105
123
  self.key: str = key
106
124
  self.drop: bool = drop
125
+ self.partial: bool = partial
107
126
  self.score: int = score
108
127
  self.children: list[tuple[str, str]] = children
109
128
 
110
129
 
130
+ class TemporaryFileData:
131
+ def __init__(self,
132
+ sha256: str,
133
+ config: dict[str, str],
134
+ shared: Optional[dict[str, Any]] = None,
135
+ local: Optional[dict[str, Any]] = None
136
+ ) -> None:
137
+ self.sha256 = sha256
138
+ self.config = config
139
+ self.shared_values: dict[str, Any] = {} if shared is None else shared
140
+ self.local_values: dict[str, Any] = {} if local is None else local
141
+
142
+ def new_file(self, sha256: str) -> TemporaryFileData:
143
+ """Create an entry for another file with reference to the shared values."""
144
+ return TemporaryFileData(sha256, self.config, self.shared_values, deepcopy(self.local_values))
145
+
146
+ def read(self) -> dict[str, Any]:
147
+ """Get a copy of the current data"""
148
+ # Start with a shallow copy of the local data
149
+ data = dict(self.local_values)
150
+
151
+ # mix in whatever the latest submission wide values are
152
+ data.update(self.shared_values)
153
+ return data
154
+
155
+ def read_key(self, key: str) -> Any:
156
+ """Get the current value of a single key, preferring the shared value over the local one"""
157
+ try:
158
+ return self.shared_values[key]
159
+ except KeyError:
160
+ return self.local_values.get(key)
161
+
162
+ def set_value(self, key: str, value: Any) -> bool:
163
+ """Set the value of a temporary data key using the appropriate method for the key.
164
+
165
+ Return true if this change could mean partial results should be reevaluated.
166
+ """
167
+ if self.config.get(key) == KeyType.UNION.value:
168
+ return self._union_shared_value(key, value)
169
+
170
+ if self.config.get(key) == KeyType.OVERWRITE.value:
171
+ change = self.shared_values.get(key) != value
172
+ self.shared_values[key] = value
173
+ return change
174
+
175
+ self.local_values[key] = value
176
+ return False
177
+
178
+ def _union_shared_value(self, key: str, values: Any) -> bool:
179
+ # Make sure the existing value is the right type
180
+ self.shared_values.setdefault(key, [])
181
+ if not isinstance(self.shared_values[key], list):
182
+ self.shared_values[key] = []
183
+
184
+ # make sure the input is the right type
185
+ if not isinstance(values, list | tuple):
186
+ return False
187
+
188
+ # Add each value one at a time testing for new values
189
+ # This is slower than using set intersection, but isn't type sensitive
190
+ changed = False
191
+ for new_item in values:
192
+ if new_item in self.shared_values[key]:
193
+ continue
194
+ self.shared_values[key].append(new_item)
195
+ changed = True
196
+ return changed
197
+
198
+
111
199
  class SubmissionTask:
112
200
  """Dispatcher internal model for submissions"""
113
201
 
@@ -128,12 +216,13 @@ class SubmissionTask:
128
216
  self.file_schedules: dict[str, list[dict[str, Service]]] = {}
129
217
  self.file_tags: dict[str, dict[str, dict[str, Any]]] = defaultdict(dict)
130
218
  self.file_depth: dict[str, int] = {}
131
- self.file_temporary_data: dict[str, dict] = defaultdict(dict)
219
+ self.temporary_data: dict[str, TemporaryFileData] = {}
132
220
  self.extra_errors: list[str] = []
133
221
  self.active_files: set[str] = set()
134
222
  self.dropped_files: set[str] = set()
135
223
  self.dynamic_recursion_bypass: set[str] = set()
136
224
  self.service_logs: dict[tuple[str, str], list[str]] = defaultdict(list)
225
+ self.monitoring: dict[tuple[str, str], MonitorTask] = {}
137
226
 
138
227
  # mapping from file hash to a set of services that shouldn't be run on
139
228
  # any children (recursively) of that file
@@ -187,15 +276,15 @@ class SubmissionTask:
187
276
  children_detail: list[tuple[str, str]] = [(r['sha256'], r['parent_relation']) for r in extracted]
188
277
  self.service_results[(sha256, service)] = ResultSummary(
189
278
  key=k, drop=result['drop_file'], score=result['result']['score'],
190
- children=children_detail)
279
+ children=children_detail, partial=result.get('partial', False))
191
280
 
192
281
  tags = Result(result).scored_tag_dict()
193
- for key in tags.keys():
282
+ for key, tag in tags.items():
194
283
  if key in self.file_tags[sha256].keys():
195
284
  # Sum score of already known tags
196
- self.file_tags[sha256][key]['score'] += tags[key]['score']
285
+ self.file_tags[sha256][key]['score'] += tag['score']
197
286
  else:
198
- self.file_tags[sha256][key] = tags[key]
287
+ self.file_tags[sha256][key] = tag
199
288
 
200
289
  if errors is not None:
201
290
  for e in errors:
@@ -204,6 +293,7 @@ class SubmissionTask:
204
293
 
205
294
  @property
206
295
  def sid(self) -> str:
296
+ """Shortcut to read submission SID"""
207
297
  return self.submission.sid
208
298
 
209
299
  def forbid_for_children(self, sha256: str, service_name: str):
@@ -213,19 +303,23 @@ class SubmissionTask:
213
303
  except KeyError:
214
304
  self._forbidden_services[sha256] = {service_name}
215
305
 
216
-
217
306
  def register_children(self, parent: str, children: list[str]):
218
307
  """
219
- Note for the purposes of dynamic recursion prevention which
220
- files extracted other files.
308
+ Note which files extracted other files.
309
+ _parent_map is for dynamic recursion prevention
310
+ temporary_data is for cascading the temp data to children
221
311
  """
312
+ parent_temp = self.temporary_data[parent]
222
313
  for child in children:
314
+ if child not in self.temporary_data:
315
+ self.temporary_data[child] = parent_temp.new_file(child)
223
316
  try:
224
317
  self._parent_map[child].add(parent)
225
318
  except KeyError:
226
319
  self._parent_map[child] = {parent}
227
320
 
228
321
  def all_ancestors(self, sha256: str) -> list[str]:
322
+ """Collect all the known ancestors of the given file within this submission."""
229
323
  visited = set()
230
324
  to_visit = [sha256]
231
325
  while len(to_visit) > 0:
@@ -249,6 +343,72 @@ class SubmissionTask:
249
343
  for parent in self.all_ancestors(sha256)
250
344
  ]))
251
345
 
346
+ def set_monitoring_entry(self, sha256: str, service_name: str, values: dict[str, Optional[str]]):
347
+ """A service with monitoring has dispatched, keep track of the conditions."""
348
+ self.monitoring[(sha256, service_name)] = MonitorTask(
349
+ service=service_name,
350
+ sha=sha256,
351
+ values=values,
352
+ )
353
+
354
+ def partial_result(self, sha256, service_name):
355
+ """Note that a partial result has been received. If a dispatch was requested process that now."""
356
+ try:
357
+ entry = self.monitoring[(sha256, service_name)]
358
+ except KeyError:
359
+ return
360
+
361
+ if entry.dispatch_needed:
362
+ self.redispatch_service(sha256, service_name)
363
+
364
+ def clear_monitoring_entry(self, sha256, service_name):
365
+ """A service has completed normally. If the service is monitoring clear out the record."""
366
+ # We have an incoming non-partial result, flush out any partial monitoring
367
+ self.monitoring.pop((sha256, service_name), None)
368
+ # If there is a partial result for this service flush that as well so we accept this new result
369
+ result = self.service_results.get((sha256, service_name))
370
+ if result and result.partial:
371
+ self.service_results.pop((sha256, service_name), None)
372
+
373
+ def temporary_data_changed(self, key: str) -> list[str]:
374
+ """Check all of the monitored tasks on that key for changes. Redispatch as needed."""
375
+ changed = []
376
+ for (sha256, service), entry in self.monitoring.items():
377
+ # Check if this key is actually being monitored by this entry
378
+ if key not in entry.values:
379
+ continue
380
+
381
+ # Get whatever values (if any) were provided on the previous dispatch of this service
382
+ value = self.temporary_data[sha256].read_key(key)
383
+ dispatched_value = entry.values.get(key)
384
+
385
+ if type(value) is not type(dispatched_value) or value != dispatched_value:
386
+ result = self.service_results.get((sha256, service))
387
+ if not result:
388
+ # If the value has changed since the last dispatch but results haven't come in yet
389
+ # mark this service to be dispatched later. This will only happen if the service
390
+ # returns partial results, if there are full results the entry will be cleared instead.
391
+ entry.dispatch_needed = True
392
+ else:
393
+ # If there are results and there is a monitoring entry, the result was partial
394
+ # so redispatch it immediately. If there are not partial results the monitoring
395
+ # entry will have been cleared.
396
+ self.redispatch_service(sha256, service)
397
+ changed.append(sha256)
398
+ return changed
399
+
400
+ def redispatch_service(self, sha256, service_name):
401
+ # Clear the result if it's partial or an error
402
+ result = self.service_results.get((sha256, service_name))
403
+ if result and not result.partial:
404
+ return
405
+ self.service_results.pop((sha256, service_name), None)
406
+ self.service_errors.pop((sha256, service_name), None)
407
+ self.service_attempts[(sha256, service_name)] = 1
408
+
409
+ # Try to get the service to run again by resetting the schedule for that service
410
+ self.file_schedules.pop(sha256, None)
411
+
252
412
 
253
413
  DISPATCH_TASK_ASSIGNMENT = 'dispatcher-tasks-assigned-to-'
254
414
  TASK_ASSIGNMENT_PATTERN = DISPATCH_TASK_ASSIGNMENT + '*'
@@ -277,7 +437,7 @@ SUBMISSION_TOTAL_TIMEOUT = 60 * 20
277
437
 
278
438
  class Dispatcher(ThreadedCoreBase):
279
439
  @staticmethod
280
- def all_instances(persistent_redis):
440
+ def all_instances(persistent_redis: Redis):
281
441
  return Hash(DISPATCH_DIRECTORY, host=persistent_redis).keys()
282
442
 
283
443
  @staticmethod
@@ -297,7 +457,7 @@ class Dispatcher(ThreadedCoreBase):
297
457
  }
298
458
 
299
459
  def __init__(self, datastore=None, redis=None, redis_persist=None, logger=None,
300
- config=None, counter_name='dispatcher'):
460
+ config=None, counter_name: str = 'dispatcher'):
301
461
  super().__init__('assemblyline.dispatcher', config=config, datastore=datastore,
302
462
  redis=redis, redis_persist=redis_persist, logger=logger)
303
463
 
@@ -307,10 +467,9 @@ class Dispatcher(ThreadedCoreBase):
307
467
  self.finalizing = threading.Event()
308
468
  self.finalizing_start = 0.0
309
469
 
310
- #
311
- # # Build some utility classes
470
+ # Build some utility classes
312
471
  self.scheduler = Scheduler(self.datastore, self.config, self.redis)
313
- self.running_tasks = Hash(DISPATCH_RUNNING_TASK_HASH, host=self.redis)
472
+ self.running_tasks: Hash[dict] = Hash(DISPATCH_RUNNING_TASK_HASH, host=self.redis)
314
473
  self.scaler_timeout_queue = NamedQueue(SCALER_TIMEOUT_QUEUE, host=self.redis_persist)
315
474
 
316
475
  self.classification_engine = get_classification()
@@ -331,12 +490,12 @@ class Dispatcher(ThreadedCoreBase):
331
490
  self.ingester_scanning = Hash('m-scanning-table', self.redis_persist)
332
491
 
333
492
  # Communications queues
334
- self.start_queue = NamedQueue(DISPATCH_START_EVENTS+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
335
- self.result_queue = NamedQueue(DISPATCH_RESULT_QUEUE+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
336
- self.command_queue = NamedQueue(DISPATCH_COMMAND_QUEUE+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
337
-
338
- # Submissions that should have alerts generated
339
- self.alert_queue = NamedQueue(ALERT_QUEUE_NAME, self.redis_persist)
493
+ self.start_queue: NamedQueue[tuple[str, str, str, str]] =\
494
+ NamedQueue(DISPATCH_START_EVENTS+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
495
+ self.result_queue: NamedQueue[dict] =\
496
+ NamedQueue(DISPATCH_RESULT_QUEUE+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
497
+ self.command_queue: NamedQueue[dict] =\
498
+ NamedQueue(DISPATCH_COMMAND_QUEUE+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
340
499
 
341
500
  # Publish counters to the metrics sink.
342
501
  self.counter = MetricsFactory(metrics_type='dispatcher', schema=Metrics, name=counter_name,
@@ -397,11 +556,16 @@ class Dispatcher(ThreadedCoreBase):
397
556
  _q = self.find_process_queue(sid)
398
557
  _q.put(DispatchAction(kind=Action.check_submission, sid=sid))
399
558
 
400
- def _handle_service_change_event(self, data: ServiceChange):
559
+ def _handle_service_change_event(self, data: Optional[ServiceChange]):
560
+ if not data:
561
+ # We may have missed change messages, flush cache
562
+ self.scheduler.c12n_services.clear()
563
+ return
401
564
  if data.operation == Operation.Removed:
402
565
  # Remove all current instances of service from scheduler cache
403
- [service_set.remove(data.name) for service_set in self.scheduler.c12n_services.values()
404
- if data.name in service_set]
566
+ for service_set in self.scheduler.c12n_services.values():
567
+ if data.name in service_set:
568
+ service_set.remove(data.name)
405
569
  else:
406
570
  # If Added/Modified, pull the service information and modify cache
407
571
  service: Service = self.datastore.get_service_with_delta(data.name)
@@ -461,7 +625,7 @@ class Dispatcher(ThreadedCoreBase):
461
625
  # If the dispatcher is exiting cleanly remove as many tasks from the service queues as we can
462
626
  service_queues = {}
463
627
  for task in self.tasks.values():
464
- for (sha256, service_name), dispatch_key in task.queue_keys.items():
628
+ for (_sha256, service_name), dispatch_key in task.queue_keys.items():
465
629
  try:
466
630
  s_queue = service_queues[service_name]
467
631
  except KeyError:
@@ -547,7 +711,7 @@ class Dispatcher(ThreadedCoreBase):
547
711
  return
548
712
 
549
713
  if not self.active_submissions.exists(sid):
550
- self.log.info(f"[{sid}] New submission received")
714
+ self.log.info("[%s] New submission received", sid)
551
715
  self.active_submissions.add(sid, {
552
716
  'completed_queue': task.completed_queue,
553
717
  'submission': submission.as_primitives()
@@ -568,13 +732,16 @@ class Dispatcher(ThreadedCoreBase):
568
732
  self.log.info(f"[{sid}] Submission counts towards {submission.params.submitter.upper()} quota")
569
733
 
570
734
  # Apply initial data parameter
735
+ temp_key_config = dict(self.config.submission.default_temporary_keys)
736
+ temp_key_config.update(self.config.submission.temporary_keys)
737
+ temporary_data = TemporaryFileData(sha256, config=temp_key_config)
738
+ task.temporary_data[sha256] = temporary_data
571
739
  if submission.params.initial_data:
572
740
  try:
573
- task.file_temporary_data[sha256] = {
574
- key: value
575
- for key, value in dict(json.loads(submission.params.initial_data)).items()
576
- if len(str(value)) <= self.config.submission.max_temp_data_length
577
- }
741
+ for key, value in dict(json.loads(submission.params.initial_data)).items():
742
+ if len(str(value)) > self.config.submission.max_temp_data_length:
743
+ continue
744
+ temporary_data.set_value(key, value)
578
745
 
579
746
  except (ValueError, TypeError) as err:
580
747
  self.log.warning(f"[{sid}] could not process initialization data: {err}")
@@ -588,7 +755,7 @@ class Dispatcher(ThreadedCoreBase):
588
755
  # Initialize ancestry chain by identifying the root file
589
756
  file_info = self.get_fileinfo(task, sha256)
590
757
  file_type = file_info.type if file_info else 'NOT_FOUND'
591
- task.file_temporary_data[sha256]['ancestry'] = [[dict(type=file_type, parent_relation="ROOT", sha256=sha256)]]
758
+ temporary_data.local_values['ancestry'] = [[dict(type=file_type, parent_relation="ROOT", sha256=sha256)]]
592
759
 
593
760
  # Start the file dispatching
594
761
  task.active_files.add(sha256)
@@ -597,6 +764,7 @@ class Dispatcher(ThreadedCoreBase):
597
764
 
598
765
  @elasticapm.capture_span(span_type='dispatcher')
599
766
  def get_fileinfo(self, task: SubmissionTask, sha256: str) -> Optional[FileInfo]:
767
+ """Read information about a file from the database, caching it locally."""
600
768
  # First try to get the info from local cache
601
769
  file_info = task.file_info.get(sha256, None)
602
770
  if file_info:
@@ -751,9 +919,12 @@ class Dispatcher(ThreadedCoreBase):
751
919
  tags = list(task.file_tags.get(sha256, {}).values())
752
920
 
753
921
  # Load the temp submission data we will pass
754
- temp_data = {}
922
+ temp_data: dict[str, str] = {}
755
923
  if service.uses_temp_submission_data:
756
- temp_data = task.file_temporary_data[sha256]
924
+ temp_data = task.temporary_data[sha256].read()
925
+ if service.monitored_keys:
926
+ values = {key: temp_data.get(key) for key in service.monitored_keys}
927
+ task.set_monitoring_entry(sha256, service.name, values)
757
928
 
758
929
  # Load the metadata we will pass
759
930
  metadata = {}
@@ -774,7 +945,6 @@ class Dispatcher(ThreadedCoreBase):
774
945
  for service_name in prevented_services:
775
946
  task.forbid_for_children(sha256, service_name)
776
947
 
777
-
778
948
  # Build the actual service dispatch message
779
949
  config = self.build_service_config(service, submission)
780
950
  service_task = ServiceTask(dict(
@@ -929,10 +1099,10 @@ class Dispatcher(ThreadedCoreBase):
929
1099
  if self.dispatch_file(task, file_hash):
930
1100
  return True
931
1101
  elif processing_files:
932
- self.log.debug(f"[{task.submission.sid}] Not finished waiting on {len(processing_files)} "
933
- f"files: {list(processing_files)}")
1102
+ self.log.debug("[%s] Not finished waiting on %d files: %s",
1103
+ task.submission.sid, len(processing_files), list(processing_files))
934
1104
  else:
935
- self.log.debug(f"[{task.submission.sid}] Finalizing submission.")
1105
+ self.log.debug("[%s] Finalizing submission.", task.submission.sid)
936
1106
  max_score = max(file_scores.values()) if file_scores else 0 # Submissions with no results have no score
937
1107
  if self.tasks.pop(task.sid, None):
938
1108
  self.finalize_queue.put((task, max_score, checked))
@@ -1256,6 +1426,12 @@ class Dispatcher(ThreadedCoreBase):
1256
1426
  self.clear_timeout(task, sha256, service_name)
1257
1427
  task.service_logs.pop((sha256, service_name), None)
1258
1428
 
1429
+ if summary.partial:
1430
+ self.log.info("[%s/%s] %s returned partial results", sid, sha256, service_name)
1431
+ task.partial_result(sha256, service_name)
1432
+ else:
1433
+ task.clear_monitoring_entry(sha256, service_name)
1434
+
1259
1435
  # Don't process duplicates
1260
1436
  if (sha256, service_name) in task.service_results:
1261
1437
  return
@@ -1277,8 +1453,8 @@ class Dispatcher(ThreadedCoreBase):
1277
1453
  if isinstance(tags, list):
1278
1454
  self.log.warning("Deprecation: Old format of tags found. "
1279
1455
  "This format changed with the release of 4.3 on 09-2022. "
1280
- f"Rebuilding {service_name} may be required or the result of a cache hit. "
1281
- "Proceeding with conversion to compatible format..")
1456
+ "Rebuilding %s may be required or the result of a cache hit. "
1457
+ "Proceeding with conversion to compatible format..", service_name)
1282
1458
  alt_tags = {}
1283
1459
  for t in tags:
1284
1460
  key = f"{t['type']}:{t['value']}"
@@ -1293,11 +1469,6 @@ class Dispatcher(ThreadedCoreBase):
1293
1469
  else:
1294
1470
  task.file_tags[sha256][key] = value
1295
1471
 
1296
- # Update the temporary data table for this file
1297
- for key, value in (temporary_data or {}).items():
1298
- if len(str(value)) <= self.config.submission.max_temp_data_length:
1299
- task.file_temporary_data[sha256][key] = value
1300
-
1301
1472
  # Update children to include parent_relation, likely EXTRACTED
1302
1473
  if summary.children and isinstance(summary.children[0], str):
1303
1474
  old_children = typing.cast(list[str], summary.children)
@@ -1307,6 +1478,13 @@ class Dispatcher(ThreadedCoreBase):
1307
1478
  task.service_results[(sha256, service_name)] = summary
1308
1479
  task.register_children(sha256, [c for c, _ in summary.children])
1309
1480
 
1481
+ # Update the temporary data table for this file
1482
+ force_redispatch = set()
1483
+ for key, value in (temporary_data or {}).items():
1484
+ if len(str(value)) <= self.config.submission.max_temp_data_length:
1485
+ if task.temporary_data[sha256].set_value(key, value):
1486
+ force_redispatch |= set(task.temporary_data_changed(key))
1487
+
1310
1488
  # Set the depth of all extracted files, even if we won't be processing them
1311
1489
  depth_limit = self.config.submission.max_extraction_depth
1312
1490
  new_depth = task.file_depth[sha256] + 1
@@ -1322,7 +1500,7 @@ class Dispatcher(ThreadedCoreBase):
1322
1500
  if new_depth < depth_limit:
1323
1501
  # Prepare the temporary data from the parent to build the temporary data table for
1324
1502
  # these newly extracted files
1325
- parent_data = task.file_temporary_data[sha256]
1503
+ parent_data = task.temporary_data[sha256]
1326
1504
 
1327
1505
  for extracted_sha256, parent_relation in summary.children:
1328
1506
 
@@ -1330,7 +1508,7 @@ class Dispatcher(ThreadedCoreBase):
1330
1508
  continue
1331
1509
 
1332
1510
  if len(task.active_files) > submission.params.max_extracted:
1333
- self.log.info(f'[{sid}] hit extraction limit, dropping {extracted_sha256}')
1511
+ self.log.info('[%s] hit extraction limit, dropping %s', sid, extracted_sha256)
1334
1512
  task.dropped_files.add(extracted_sha256)
1335
1513
  self._dispatching_error(task, Error({
1336
1514
  'archive_ts': None,
@@ -1351,21 +1529,20 @@ class Dispatcher(ThreadedCoreBase):
1351
1529
 
1352
1530
  dispatched += 1
1353
1531
  task.active_files.add(extracted_sha256)
1354
- try:
1355
- parent_ancestry = parent_data['ancestry']
1356
- except KeyError:
1357
- self.log.warn(f"[{sid} :: {sha256}] missing ancestry data.")
1358
- parent_ancestry = []
1359
- existing_ancestry = task.file_temporary_data.get(extracted_sha256, {}).get('ancestry', [])
1532
+
1533
+ # Get the new ancestry data
1360
1534
  file_info = self.get_fileinfo(task, extracted_sha256)
1361
1535
  file_type = file_info.type if file_info else 'NOT_FOUND'
1362
1536
  current_ancestry_node = dict(type=file_type, parent_relation=parent_relation,
1363
1537
  sha256=extracted_sha256)
1364
1538
 
1365
- task.file_temporary_data[extracted_sha256] = dict(parent_data)
1366
- task.file_temporary_data[extracted_sha256]['ancestry'] = existing_ancestry
1367
- [task.file_temporary_data[extracted_sha256]['ancestry'].append(ancestry + [current_ancestry_node])
1368
- for ancestry in parent_ancestry]
1539
+ # Update ancestry data
1540
+ parent_ancestry = parent_data.read_key('ancestry') or []
1541
+ existing_ancestry = task.temporary_data[extracted_sha256].local_values.setdefault('ancestry', [])
1542
+ for ancestry in parent_ancestry:
1543
+ existing_ancestry.append(ancestry + [current_ancestry_node])
1544
+
1545
+ # Trigger the processing of the extracted file
1369
1546
  self.find_process_queue(sid).put(DispatchAction(kind=Action.dispatch_file, sid=sid,
1370
1547
  sha=extracted_sha256))
1371
1548
  else:
@@ -1388,13 +1565,15 @@ class Dispatcher(ThreadedCoreBase):
1388
1565
 
1389
1566
  # Check if it's worth trying to run the next stage
1390
1567
  # Not worth running if we know we are waiting for another service
1391
- if any(_s == sha256 for _s, _ in task.running_services):
1392
- return
1568
+ if not any(_s == sha256 for _s, _ in task.running_services):
1569
+ force_redispatch.add(sha256)
1393
1570
  # Not worth running if we know we have services in queue
1394
- if any(_s == sha256 for _s, _ in task.queue_keys.keys()):
1395
- return
1571
+ if not any(_s == sha256 for _s, _ in task.queue_keys.keys()):
1572
+ force_redispatch.add(sha256)
1573
+
1396
1574
  # Try to run the next stage
1397
- self.dispatch_file(task, sha256)
1575
+ for sha256 in force_redispatch:
1576
+ self.dispatch_file(task, sha256)
1398
1577
 
1399
1578
  @elasticapm.capture_span(span_type='dispatcher')
1400
1579
  def _dispatching_error(self, task: SubmissionTask, error):
@@ -1677,13 +1856,13 @@ class Dispatcher(ThreadedCoreBase):
1677
1856
 
1678
1857
  @elasticapm.capture_span(span_type='dispatcher')
1679
1858
  def list_outstanding(self, sid: str, queue_name: str):
1680
- response_queue = NamedQueue(queue_name, host=self.redis)
1859
+ response_queue: NamedQueue[dict] = NamedQueue(queue_name, host=self.redis)
1681
1860
  outstanding: defaultdict[str, int] = defaultdict(int)
1682
1861
  task = self.tasks.get(sid)
1683
1862
  if task:
1684
- for sha, service_name in list(task.queue_keys.keys()):
1863
+ for _sha, service_name in list(task.queue_keys.keys()):
1685
1864
  outstanding[service_name] += 1
1686
- for sha, service_name in list(task.running_services):
1865
+ for _sha, service_name in list(task.running_services):
1687
1866
  outstanding[service_name] += 1
1688
1867
  response_queue.push(outstanding)
1689
1868
 
@@ -1698,7 +1877,7 @@ class Dispatcher(ThreadedCoreBase):
1698
1877
  error_tasks = []
1699
1878
 
1700
1879
  # iterate running tasks
1701
- for task_key, task_body in self.running_tasks:
1880
+ for _task_key, task_body in self.running_tasks:
1702
1881
  task = ServiceTask(task_body)
1703
1882
  # It's a bad task if its dispatcher isn't running
1704
1883
  if task.metadata['dispatcher__'] not in dispatcher_instances:
@@ -159,8 +159,7 @@ class TaskingClient:
159
159
  if not self.datastore.service_delta.exists(service.name):
160
160
  self.datastore.service_delta.save(service.name, {'version': service.version})
161
161
  self.datastore.service_delta.commit()
162
- self.log.info(f"{log_prefix}{service.name} "
163
- f"version ({service.version}) registered")
162
+ self.log.info(f"{log_prefix}{service.name} version ({service.version}) registered")
164
163
 
165
164
  new_heuristics = []
166
165
  if heuristics:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: assemblyline-core
3
- Version: 4.5.0.64
3
+ Version: 4.5.0.66
4
4
  Summary: Assemblyline 4 - Core components
5
5
  Home-page: https://github.com/CybercentreCanada/assemblyline-core/
6
6
  Author: CCCS Assemblyline development team
@@ -16,7 +16,7 @@ def redis(redis_connection):
16
16
  redis_connection.flushdb()
17
17
 
18
18
 
19
- def dummy_service(name, stage, category='static', accepts='', rejects=None, docid=None, extra_data=False):
19
+ def dummy_service(name, stage, category='static', accepts='', rejects=None, docid=None, extra_data=False, monitored_keys=()):
20
20
  return Service({
21
21
  'name': name,
22
22
  'stage': stage,
@@ -28,6 +28,7 @@ def dummy_service(name, stage, category='static', accepts='', rejects=None, doci
28
28
  'version': '0',
29
29
  'enabled': True,
30
30
  'timeout': 2,
31
+ 'monitored_keys': list(monitored_keys),
31
32
  'docker_config': {
32
33
  'image': 'somefakedockerimage:latest'
33
34
  }
@@ -54,6 +54,7 @@ def redis(redis_connection: Redis[Any]):
54
54
 
55
55
 
56
56
  _global_semaphore = threading.Semaphore(value=1)
57
+ print_lock = threading.Lock()
57
58
 
58
59
 
59
60
  class MockService(ServerBase):
@@ -107,6 +108,25 @@ class MockService(ServerBase):
107
108
  self.dispatch_client.service_failed(task.sid, error=error, error_key=get_random_id())
108
109
  continue
109
110
 
111
+ partial = False
112
+ temp_data = {entry.name: entry.value for entry in task.temporary_submission_data}
113
+ with print_lock:
114
+ print(self.service_name)
115
+ print('instructions', instructions)
116
+ print('temp', temp_data)
117
+ if 'partial' in instructions:
118
+ partial = True
119
+ requirements = instructions['partial']
120
+ for key, value in requirements.items():
121
+ if value in temp_data.get(key, ''):
122
+ partial = False
123
+ else:
124
+ partial = True
125
+ break
126
+
127
+ if partial:
128
+ print(self.service_name, "will produce partial results")
129
+
110
130
  result_data = {
111
131
  'archive_ts': None,
112
132
  'classification': 'U',
@@ -115,8 +135,8 @@ class MockService(ServerBase):
115
135
  'service_tool_version': '0',
116
136
  'service_name': self.service_name,
117
137
  },
118
- 'result': {
119
- },
138
+ 'result': {},
139
+ 'partial': partial,
120
140
  'sha256': task.fileinfo.sha256,
121
141
  'expiry_ts': time.time() + 600
122
142
  }
@@ -125,8 +145,12 @@ class MockService(ServerBase):
125
145
  result_data['response'].update(instructions.get('response', {}))
126
146
 
127
147
  result = Result(result_data)
128
- result_key = instructions.get('result_key', get_random_id())
129
- self.dispatch_client.service_finished(task.sid, result_key, result)
148
+ try:
149
+ result_key = instructions['result_key']
150
+ except KeyError:
151
+ result_key = result.build_key(get_random_id())
152
+ self.dispatch_client.service_finished(task.sid, result_key, result,
153
+ temporary_data=instructions.get('temporary_data'))
130
154
 
131
155
 
132
156
  class CoreSession:
@@ -226,16 +250,24 @@ def core(request, redis, filestore, config, clean_datastore: AssemblylineDatasto
226
250
  # Register services
227
251
  stages = get_service_stage_hash(redis)
228
252
 
253
+ service_config: list[tuple[str, int, str, dict]] = [
254
+ ('pre', 1, 'EXTRACT', {'extra_data': True, 'monitored_keys': ['passwords']}),
255
+ ('core-a', 2, 'CORE', {}),
256
+ ('core-b', 1, 'CORE', {}),
257
+ ('finish', 1, 'POST', {'extra_data': True})
258
+ ]
259
+
229
260
  services = []
230
- for svc, stage in [('pre', 'EXTRACT'), ('core-a', 'CORE'), ('core-b', 'CORE'), ('finish', 'POST')]:
231
- ds.service.save(f'{svc}_0', dummy_service(svc, stage, docid=f'{svc}_0'))
261
+ for svc, count, stage, details in service_config:
262
+ ds.service.save(f'{svc}_0', dummy_service(svc, stage, docid=f'{svc}_0', **details))
232
263
  ds.service_delta.save(svc, ServiceDelta({
233
264
  'name': svc,
234
265
  'version': '0',
235
266
  'enabled': True
236
267
  }))
237
268
  stages.set(svc, ServiceStage.Running)
238
- services.append(MockService(svc, ds, redis, filestore))
269
+ for _ in range(count):
270
+ services.append(MockService(svc, ds, redis, filestore))
239
271
 
240
272
  user = random_model_obj(User)
241
273
  user.uname = "user"
@@ -1134,8 +1166,189 @@ def test_tag_filter(core: CoreSession, metrics):
1134
1166
  metrics.expect('dispatcher', 'submissions_completed', 1)
1135
1167
  metrics.expect('dispatcher', 'files_completed', 1)
1136
1168
 
1137
- alert = core.dispatcher.postprocess_worker.alert_queue.pop(timeout=5)
1169
+ alert: dict = core.dispatcher.postprocess_worker.alert_queue.pop(timeout=5)
1138
1170
  assert alert['submission']['sid'] == sub['sid']
1139
1171
 
1140
1172
  finally:
1141
1173
  core.dispatcher.postprocess_worker.actions.pop('test_process')
1174
+
1175
+
1176
+ def test_partial(core: CoreSession, metrics):
1177
+ # Have pre produce a partial result; no other service supplies the monitored key, so the result stays partial
1178
+ sha, size = ready_body(core, {
1179
+ 'pre': {'partial': {'passwords': 'test_temp_data_monitoring'}},
1180
+ })
1181
+
1182
+ core.ingest_queue.push(SubmissionInput(dict(
1183
+ metadata={},
1184
+ params=dict(
1185
+ description="file abc123",
1186
+ services=dict(selected=[]),
1187
+ submitter='user',
1188
+ groups=['user'],
1189
+ max_extracted=10000
1190
+ ),
1191
+ notification=dict(
1192
+ queue='temp-data-monitor',
1193
+ threshold=0
1194
+ ),
1195
+ files=[dict(
1196
+ sha256=sha,
1197
+ size=size,
1198
+ name='abc123'
1199
+ )]
1200
+ )).as_primitives())
1201
+
1202
+ notification_queue = NamedQueue('nq-temp-data-monitor', core.redis)
1203
+ dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
1204
+ assert dropped_task
1205
+ dropped_task = IngestTask(dropped_task)
1206
+ sub: Submission = core.ds.submission.get(dropped_task.submission.sid)
1207
+ assert len(sub.errors) == 0
1208
+ assert len(sub.results) == 4, 'results'
1209
+ assert core.pre_service.hits[sha] == 1, 'pre_service.hits'
1210
+
1211
+ # Wait until we get feedback from the metrics channel
1212
+ metrics.expect('ingester', 'submissions_ingested', 1)
1213
+ metrics.expect('ingester', 'submissions_completed', 1)
1214
+ metrics.expect('dispatcher', 'submissions_completed', 1)
1215
+ metrics.expect('dispatcher', 'files_completed', 1)
1216
+
1217
+ partial_results = 0
1218
+ for res in sub.results:
1219
+ result = core.ds.get_single_result(res, as_obj=True)
1220
+ assert result is not None, res
1221
+ if result.partial:
1222
+ partial_results += 1
1223
+ assert partial_results == 1, 'partial_results'
1224
+
1225
+
1226
+ def test_temp_data_monitoring(core: CoreSession, metrics):
1227
+ # Have pre produce a partial result, then have core-a update a monitored key
1228
+ sha, size = ready_body(core, {
1229
+ 'pre': {'partial': {'passwords': 'test_temp_data_monitoring'}},
1230
+ 'core-a': {'temporary_data': {'passwords': ['test_temp_data_monitoring']}},
1231
+ 'final': {'temporary_data': {'passwords': ['some other password']}},
1232
+ })
1233
+
1234
+ core.ingest_queue.push(SubmissionInput(dict(
1235
+ metadata={},
1236
+ params=dict(
1237
+ description="file abc123",
1238
+ services=dict(selected=[]),
1239
+ submitter='user',
1240
+ groups=['user'],
1241
+ max_extracted=10000
1242
+ ),
1243
+ notification=dict(
1244
+ queue='temp-data-monitor',
1245
+ threshold=0
1246
+ ),
1247
+ files=[dict(
1248
+ sha256=sha,
1249
+ size=size,
1250
+ name='abc123'
1251
+ )]
1252
+ )).as_primitives())
1253
+
1254
+ notification_queue = NamedQueue('nq-temp-data-monitor', core.redis)
1255
+ dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
1256
+ assert dropped_task
1257
+ dropped_task = IngestTask(dropped_task)
1258
+ sub: Submission = core.ds.submission.get(dropped_task.submission.sid)
1259
+ assert len(sub.errors) == 0
1260
+ assert len(sub.results) == 4, 'results'
1261
+ assert core.pre_service.hits[sha] >= 2, f'pre_service.hits {core.pre_service.hits}'
1262
+
1263
+ # Wait until we get feedback from the metrics channel
1264
+ metrics.expect('ingester', 'submissions_ingested', 1)
1265
+ metrics.expect('ingester', 'submissions_completed', 1)
1266
+ metrics.expect('dispatcher', 'submissions_completed', 1)
1267
+ metrics.expect('dispatcher', 'files_completed', 1)
1268
+
1269
+ partial_results = 0
1270
+ for res in sub.results:
1271
+ result = core.ds.get_single_result(res, as_obj=True)
1272
+ assert result is not None, res
1273
+ if result.partial:
1274
+ partial_results += 1
1275
+ assert partial_results == 0, 'partial_results'
1276
+
1277
+
1278
+ def test_complex_extracted(core: CoreSession, metrics):
1279
+ # Stages of this processing when everything goes well:
1280
+ # 1. extract a file that will process to produce a partial result
1281
+ # 2. hold a few seconds on the second stage of the root file to let child start
1282
+ # 3. on the last stage of the root file produce the password
1283
+ dispatcher.TIMEOUT_EXTRA_TIME = 100
1284
+
1285
+ child_sha, _ = ready_body(core, {
1286
+ 'pre': {'partial': {'passwords': 'test_temp_data_monitoring'}},
1287
+ })
1288
+
1289
+ sha, size = ready_body(core, {
1290
+ 'pre': {
1291
+ 'response': {
1292
+ 'extracted': [{
1293
+ 'name': child_sha,
1294
+ 'sha256': child_sha,
1295
+ 'description': 'abc',
1296
+ 'classification': 'U'
1297
+ }]
1298
+ }
1299
+ },
1300
+ 'core-a': {'lock': 60},
1301
+ 'finish': {'temporary_data': {'passwords': ['test_temp_data_monitoring']}},
1302
+ })
1303
+
1304
+ core.ingest_queue.push(SubmissionInput(dict(
1305
+ metadata={},
1306
+ params=dict(
1307
+ description="file abc123",
1308
+ services=dict(selected=''),
1309
+ submitter='user',
1310
+ groups=['user'],
1311
+ max_extracted=10000
1312
+ ),
1313
+ notification=dict(
1314
+ queue='complex-extracted-file',
1315
+ threshold=0
1316
+ ),
1317
+ files=[dict(
1318
+ sha256=sha,
1319
+ size=size,
1320
+ name='abc123'
1321
+ )]
1322
+ )).as_primitives())
1323
+
1324
+ # Wait for the extract file to finish
1325
+ metrics.expect('dispatcher', 'files_completed', 1)
1326
+ # check that there is a pending result in the dispatcher
1327
+ task = next(iter(core.dispatcher.tasks.values()))
1328
+ assert 1 == sum(int(summary.partial) for summary in task.service_results.values())
1329
+ _global_semaphore.release()
1330
+
1331
+ # Wait for the entire submission to finish
1332
+ notification_queue = NamedQueue('nq-complex-extracted-file', core.redis)
1333
+ dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
1334
+ assert dropped_task
1335
+ dropped_task = IngestTask(dropped_task)
1336
+ sub: Submission = core.ds.submission.get(dropped_task.submission.sid)
1337
+ assert len(sub.errors) == 0
1338
+ assert len(sub.results) == 8, 'results'
1339
+ assert core.pre_service.hits[sha] == 1, 'pre_service.hits[root]'
1340
+ assert core.pre_service.hits[child_sha] >= 2, 'pre_service.hits[child]'
1341
+
1342
+ # Wait until we get feedback from the metrics channel
1343
+ metrics.expect('ingester', 'submissions_ingested', 1)
1344
+ metrics.expect('ingester', 'submissions_completed', 1)
1345
+ metrics.expect('dispatcher', 'submissions_completed', 1)
1346
+ metrics.expect('dispatcher', 'files_completed', 2)
1347
+
1348
+ partial_results = 0
1349
+ for res in sub.results:
1350
+ result = core.ds.get_single_result(res, as_obj=True)
1351
+ assert result is not None, res
1352
+ if result.partial:
1353
+ partial_results += 1
1354
+ assert partial_results == 0, 'partial_results'
@@ -1 +0,0 @@
1
- 4.5.0.64