PyPI - assemblyline-core - Versions diffs - 4.5.0.64__tar.gz → 4.5.0.66__tar.gz - Mend

assemblyline-core 4.5.0.64__tar.gz → 4.5.0.66__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of assemblyline-core might be problematic. Click here for more details.

Files changed (88) hide show

{assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: assemblyline-core
-Version: 4.5.0.64
+Version: 4.5.0.66
 Summary: Assemblyline 4 - Core components
 Home-page: https://github.com/CybercentreCanada/assemblyline-core/
 Author: CCCS Assemblyline development team

assemblyline-core-4.5.0.66/assemblyline_core/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 4.5.0.66

{assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/dispatching/client.py RENAMED Viewed

@@ -325,6 +325,7 @@ class DispatchClient:
                 'key': result_key,
                 'drop': result.drop_file,
                 'score': result.result.score,
+                'partial': result.partial,
                 'children': [(r.sha256, r.parent_relation) for r in result.response.extracted],
             },
             'tags': tags,

{assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/dispatching/dispatcher.py RENAMED Viewed

@@ -11,6 +11,7 @@ import json
 import enum
 from queue import PriorityQueue, Empty, Queue
 import dataclasses
+from copy import deepcopy
 import elasticapm
@@ -42,17 +43,16 @@ from assemblyline.remote.datatypes.queues.named import NamedQueue
 from assemblyline.remote.datatypes.set import ExpiringSet, Set
 from assemblyline.remote.datatypes.user_quota_tracker import UserQuotaTracker
 from assemblyline_core.server_base import ThreadedCoreBase
-from assemblyline_core.alerter.run_alerter import ALERT_QUEUE_NAME
+from .schedules import Scheduler
+from .timeout import TimeoutTable
+from ..ingester.constants import COMPLETE_QUEUE_NAME
 if TYPE_CHECKING:
     from assemblyline.odm.models.file import File
+    from redis import Redis
-from .schedules import Scheduler
-from .timeout import TimeoutTable
-from ..ingester.constants import COMPLETE_QUEUE_NAME
 APM_SPAN_TYPE = 'handle_message'
 AL_SHUTDOWN_GRACE = int(os.environ.get('AL_SHUTDOWN_GRACE', '60'))
@@ -66,6 +66,11 @@ DAY_IN_SECONDS = 24 * 60 * 60
 DYNAMIC_ANALYSIS_CATEGORY = 'Dynamic Analysis'
+class KeyType(enum.Enum):
+    OVERWRITE = 'overwrite'
+    UNION = 'union'
 class Action(enum.IntEnum):
     start = 0
     result = 1
@@ -86,6 +91,19 @@ class DispatchAction:
     event: Optional[threading.Event] = dataclasses.field(compare=False, default=None)
+@dataclasses.dataclass()
+class MonitorTask:
+    """Tracks whether a task needs to be rerun based on changes to the temporary data values it monitors."""
+    # Service name
+    service: str
+    # sha256 of file in question
+    sha: str
+    # The temporary values this task was last dispatched with
+    values: dict[str, Optional[str]]
+    # Should the service be dispatched again when possible
+    dispatch_needed: bool = dataclasses.field(default=False)
 @contextmanager
 def apm_span(client, span_name: str):
     try:
@@ -101,13 +119,83 @@ def apm_span(client, span_name: str):
 class ResultSummary:
-    def __init__(self, key, drop, score, children):
+    def __init__(self, key, drop, score, children, partial=False) -> None:
         self.key: str = key
         self.drop: bool = drop
+        self.partial: bool = partial
         self.score: int = score
         self.children: list[tuple[str, str]] = children
+class TemporaryFileData:
+    def __init__(self,
+                 sha256: str,
+                 config: dict[str, str],
+                 shared: Optional[dict[str, Any]] = None,
+                 local: Optional[dict[str, Any]] = None
+                 ) -> None:
+        self.sha256 = sha256
+        self.config = config
+        self.shared_values: dict[str, Any] = {} if shared is None else shared
+        self.local_values: dict[str, Any] = {} if local is None else local
+    def new_file(self, sha256: str) -> TemporaryFileData:
+        """Create an entry for another file with reference to the shared values."""
+        return TemporaryFileData(sha256, self.config, self.shared_values, deepcopy(self.local_values))
+    def read(self) -> dict[str, Any]:
+        """Get a copy of the current data"""
+        # Start with a shallow copy of the local data
+        data = dict(self.local_values)
+        # Mix in whatever the latest submission-wide values are
+        data.update(self.shared_values)
+        return data
+    def read_key(self, key: str) -> Any:
+        """Get the current value of a single key, preferring the shared value over the local one."""
+        try:
+            return self.shared_values[key]
+        except KeyError:
+            return self.local_values.get(key)
+    def set_value(self, key: str, value: Any) -> bool:
+        """Set the value of a temporary data key using the appropriate method for the key.
+        Return true if this change could mean partial results should be reevaluated.
+        """
+        if self.config.get(key) == KeyType.UNION.value:
+            return self._union_shared_value(key, value)
+        if self.config.get(key) == KeyType.OVERWRITE.value:
+            change = self.shared_values.get(key) != value
+            self.shared_values[key] = value
+            return change
+        self.local_values[key] = value
+        return False
+    def _union_shared_value(self, key: str, values: Any) -> bool:
+        # Make sure the existing value is the right type
+        self.shared_values.setdefault(key, [])
+        if not isinstance(self.shared_values[key], list):
+            self.shared_values[key] = []
+        # make sure the input is the right type
+        if not isinstance(values, list | tuple):
+            return False
+        # Add each value one at a time testing for new values
+        # This is slower than using set intersection, but isn't type sensitive
+        changed = False
+        for new_item in values:
+            if new_item in self.shared_values[key]:
+                continue
+            self.shared_values[key].append(new_item)
+            changed = True
+        return changed
 class SubmissionTask:
     """Dispatcher internal model for submissions"""
@@ -128,12 +216,13 @@ class SubmissionTask:
         self.file_schedules: dict[str, list[dict[str, Service]]] = {}
         self.file_tags: dict[str, dict[str, dict[str, Any]]] = defaultdict(dict)
         self.file_depth: dict[str, int] = {}
-        self.file_temporary_data: dict[str, dict] = defaultdict(dict)
+        self.temporary_data: dict[str, TemporaryFileData] = {}
         self.extra_errors: list[str] = []
         self.active_files: set[str] = set()
         self.dropped_files: set[str] = set()
         self.dynamic_recursion_bypass: set[str] = set()
         self.service_logs: dict[tuple[str, str], list[str]] = defaultdict(list)
+        self.monitoring: dict[tuple[str, str], MonitorTask] = {}
         # mapping from file hash to a set of services that shouldn't be run on
         # any children (recursively) of that file
@@ -187,15 +276,15 @@ class SubmissionTask:
                     children_detail: list[tuple[str, str]] = [(r['sha256'], r['parent_relation']) for r in extracted]
                     self.service_results[(sha256, service)] = ResultSummary(
                         key=k, drop=result['drop_file'], score=result['result']['score'],
-                        children=children_detail)
+                        children=children_detail, partial=result.get('partial', False))
                 tags = Result(result).scored_tag_dict()
-                for key in tags.keys():
+                for key, tag in tags.items():
                     if key in self.file_tags[sha256].keys():
                         # Sum score of already known tags
-                        self.file_tags[sha256][key]['score'] += tags[key]['score']
+                        self.file_tags[sha256][key]['score'] += tag['score']
                     else:
-                        self.file_tags[sha256][key] = tags[key]
+                        self.file_tags[sha256][key] = tag
         if errors is not None:
             for e in errors:
@@ -204,6 +293,7 @@ class SubmissionTask:
     @property
     def sid(self) -> str:
+        """Shortcut to read submission SID"""
         return self.submission.sid
     def forbid_for_children(self, sha256: str, service_name: str):
@@ -213,19 +303,23 @@ class SubmissionTask:
         except KeyError:
             self._forbidden_services[sha256] = {service_name}
     def register_children(self, parent: str, children: list[str]):
         """
-        Note for the purposes of dynamic recursion prevention which
-        files extracted other files.
+        Note which files extracted other files.
+        _parent_map is for dynamic recursion prevention
+        temporary_data is for cascading the temp data to children
         """
+        parent_temp = self.temporary_data[parent]
         for child in children:
+            if child not in self.temporary_data:
+                self.temporary_data[child] = parent_temp.new_file(child)
             try:
                 self._parent_map[child].add(parent)
             except KeyError:
                 self._parent_map[child] = {parent}
     def all_ancestors(self, sha256: str) -> list[str]:
+        """Collect all the known ancestors of the given file within this submission."""
         visited = set()
         to_visit = [sha256]
         while len(to_visit) > 0:
@@ -249,6 +343,72 @@ class SubmissionTask:
             for parent in self.all_ancestors(sha256)
         ]))
+    def set_monitoring_entry(self, sha256: str, service_name: str, values: dict[str, Optional[str]]):
+        """A service with monitoring has dispatched, keep track of the conditions."""
+        self.monitoring[(sha256, service_name)] = MonitorTask(
+            service=service_name,
+            sha=sha256,
+            values=values,
+        )
+    def partial_result(self, sha256, service_name):
+        """Note that a partial result has been received. If a dispatch was requested, process that now."""
+        try:
+            entry = self.monitoring[(sha256, service_name)]
+        except KeyError:
+            return
+        if entry.dispatch_needed:
+            self.redispatch_service(sha256, service_name)
+    def clear_monitoring_entry(self, sha256, service_name):
+        """A service has completed normally. If the service is monitoring clear out the record."""
+        # We have an incoming non-partial result, flush out any partial monitoring
+        self.monitoring.pop((sha256, service_name), None)
+        # If there is a partial result for this service flush that as well so we accept this new result
+        result = self.service_results.get((sha256, service_name))
+        if result and result.partial:
+            self.service_results.pop((sha256, service_name), None)
+    def temporary_data_changed(self, key: str) -> list[str]:
+        """Check all of the monitored tasks on that key for changes. Redispatch as needed."""
+        changed = []
+        for (sha256, service), entry in self.monitoring.items():
+            # Check if this key is actually being monitored by this entry
+            if key not in entry.values:
+                continue
+            # Get whatever values (if any) were provided on the previous dispatch of this service
+            value = self.temporary_data[sha256].read_key(key)
+            dispatched_value = entry.values.get(key)
+            if type(value) is not type(dispatched_value) or value != dispatched_value:
+                result = self.service_results.get((sha256, service))
+                if not result:
+                    # If the value has changed since the last dispatch but results haven't come in yet
+                    # mark this service to be dispatched later. This will only happen if the service
+                    # returns partial results, if there are full results the entry will be cleared instead.
+                    entry.dispatch_needed = True
+                else:
+                    # If there are results and there is a monitoring entry, the result was partial
+                    # so redispatch it immediately. If there are not partial results the monitoring
+                    # entry will have been cleared.
+                    self.redispatch_service(sha256, service)
+                    changed.append(sha256)
+        return changed
+    def redispatch_service(self, sha256, service_name):
+        # Clear the result if it's partial or an error
+        result = self.service_results.get((sha256, service_name))
+        if result and not result.partial:
+            return
+        self.service_results.pop((sha256, service_name), None)
+        self.service_errors.pop((sha256, service_name), None)
+        self.service_attempts[(sha256, service_name)] = 1
+        # Try to get the service to run again by resetting the schedule for that service
+        self.file_schedules.pop(sha256, None)
 DISPATCH_TASK_ASSIGNMENT = 'dispatcher-tasks-assigned-to-'
 TASK_ASSIGNMENT_PATTERN = DISPATCH_TASK_ASSIGNMENT + '*'
@@ -277,7 +437,7 @@ SUBMISSION_TOTAL_TIMEOUT = 60 * 20
 class Dispatcher(ThreadedCoreBase):
     @staticmethod
-    def all_instances(persistent_redis):
+    def all_instances(persistent_redis: Redis):
         return Hash(DISPATCH_DIRECTORY, host=persistent_redis).keys()
     @staticmethod
@@ -297,7 +457,7 @@ class Dispatcher(ThreadedCoreBase):
         }
     def __init__(self, datastore=None, redis=None, redis_persist=None, logger=None,
-                 config=None, counter_name='dispatcher'):
+                 config=None, counter_name: str = 'dispatcher'):
         super().__init__('assemblyline.dispatcher', config=config, datastore=datastore,
                          redis=redis, redis_persist=redis_persist, logger=logger)
@@ -307,10 +467,9 @@ class Dispatcher(ThreadedCoreBase):
         self.finalizing = threading.Event()
         self.finalizing_start = 0.0
-        #
-        # # Build some utility classes
+        # Build some utility classes
         self.scheduler = Scheduler(self.datastore, self.config, self.redis)
-        self.running_tasks = Hash(DISPATCH_RUNNING_TASK_HASH, host=self.redis)
+        self.running_tasks: Hash[dict] = Hash(DISPATCH_RUNNING_TASK_HASH, host=self.redis)
         self.scaler_timeout_queue = NamedQueue(SCALER_TIMEOUT_QUEUE, host=self.redis_persist)
         self.classification_engine = get_classification()
@@ -331,12 +490,12 @@ class Dispatcher(ThreadedCoreBase):
         self.ingester_scanning = Hash('m-scanning-table', self.redis_persist)
         # Communications queues
-        self.start_queue = NamedQueue(DISPATCH_START_EVENTS+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
-        self.result_queue = NamedQueue(DISPATCH_RESULT_QUEUE+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
-        self.command_queue = NamedQueue(DISPATCH_COMMAND_QUEUE+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
-        # Submissions that should have alerts generated
-        self.alert_queue = NamedQueue(ALERT_QUEUE_NAME, self.redis_persist)
+        self.start_queue: NamedQueue[tuple[str, str, str, str]] =\
+            NamedQueue(DISPATCH_START_EVENTS+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
+        self.result_queue: NamedQueue[dict] =\
+            NamedQueue(DISPATCH_RESULT_QUEUE+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
+        self.command_queue: NamedQueue[dict] =\
+            NamedQueue(DISPATCH_COMMAND_QUEUE+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
         # Publish counters to the metrics sink.
         self.counter = MetricsFactory(metrics_type='dispatcher', schema=Metrics, name=counter_name,
@@ -397,11 +556,16 @@ class Dispatcher(ThreadedCoreBase):
                 _q = self.find_process_queue(sid)
                 _q.put(DispatchAction(kind=Action.check_submission, sid=sid))
-    def _handle_service_change_event(self, data: ServiceChange):
+    def _handle_service_change_event(self, data: Optional[ServiceChange]):
+        if not data:
+            # We may have missed change messages, flush cache
+            self.scheduler.c12n_services.clear()
+            return
         if data.operation == Operation.Removed:
             # Remove all current instances of service from scheduler cache
-            [service_set.remove(data.name) for service_set in self.scheduler.c12n_services.values()
-             if data.name in service_set]
+            for service_set in self.scheduler.c12n_services.values():
+                if data.name in service_set:
+                    service_set.remove(data.name)
         else:
             # If Added/Modified, pull the service information and modify cache
             service: Service = self.datastore.get_service_with_delta(data.name)
@@ -461,7 +625,7 @@ class Dispatcher(ThreadedCoreBase):
         # If the dispatcher is exiting cleanly remove as many tasks from the service queues as we can
         service_queues = {}
         for task in self.tasks.values():
-            for (sha256, service_name), dispatch_key in task.queue_keys.items():
+            for (_sha256, service_name), dispatch_key in task.queue_keys.items():
                 try:
                     s_queue = service_queues[service_name]
                 except KeyError:
@@ -547,7 +711,7 @@ class Dispatcher(ThreadedCoreBase):
             return
         if not self.active_submissions.exists(sid):
-            self.log.info(f"[{sid}] New submission received")
+            self.log.info("[%s] New submission received", sid)
             self.active_submissions.add(sid, {
                 'completed_queue': task.completed_queue,
                 'submission': submission.as_primitives()
@@ -568,13 +732,16 @@ class Dispatcher(ThreadedCoreBase):
             self.log.info(f"[{sid}] Submission counts towards {submission.params.submitter.upper()} quota")
         # Apply initial data parameter
+        temp_key_config = dict(self.config.submission.default_temporary_keys)
+        temp_key_config.update(self.config.submission.temporary_keys)
+        temporary_data = TemporaryFileData(sha256, config=temp_key_config)
+        task.temporary_data[sha256] = temporary_data
         if submission.params.initial_data:
             try:
-                task.file_temporary_data[sha256] = {
-                    key: value
-                    for key, value in dict(json.loads(submission.params.initial_data)).items()
-                    if len(str(value)) <= self.config.submission.max_temp_data_length
-                }
+                for key, value in dict(json.loads(submission.params.initial_data)).items():
+                    if len(str(value)) > self.config.submission.max_temp_data_length:
+                        continue
+                    temporary_data.set_value(key, value)
             except (ValueError, TypeError) as err:
                 self.log.warning(f"[{sid}] could not process initialization data: {err}")
@@ -588,7 +755,7 @@ class Dispatcher(ThreadedCoreBase):
         # Initialize ancestry chain by identifying the root file
         file_info = self.get_fileinfo(task, sha256)
         file_type = file_info.type if file_info else 'NOT_FOUND'
-        task.file_temporary_data[sha256]['ancestry'] = [[dict(type=file_type, parent_relation="ROOT", sha256=sha256)]]
+        temporary_data.local_values['ancestry'] = [[dict(type=file_type, parent_relation="ROOT", sha256=sha256)]]
         # Start the file dispatching
         task.active_files.add(sha256)
@@ -597,6 +764,7 @@ class Dispatcher(ThreadedCoreBase):
     @elasticapm.capture_span(span_type='dispatcher')
     def get_fileinfo(self, task: SubmissionTask, sha256: str) -> Optional[FileInfo]:
+        """Read information about a file from the database, caching it locally."""
         # First try to get the info from local cache
         file_info = task.file_info.get(sha256, None)
         if file_info:
@@ -751,9 +919,12 @@ class Dispatcher(ThreadedCoreBase):
                         tags = list(task.file_tags.get(sha256, {}).values())
                     # Load the temp submission data we will pass
-                    temp_data = {}
+                    temp_data: dict[str, str] = {}
                     if service.uses_temp_submission_data:
-                        temp_data = task.file_temporary_data[sha256]
+                        temp_data = task.temporary_data[sha256].read()
+                        if service.monitored_keys:
+                            values = {key: temp_data.get(key) for key in service.monitored_keys}
+                            task.set_monitoring_entry(sha256, service.name, values)
                     # Load the metadata we will pass
                     metadata = {}
@@ -774,7 +945,6 @@ class Dispatcher(ThreadedCoreBase):
                     for service_name in prevented_services:
                         task.forbid_for_children(sha256, service_name)
                     # Build the actual service dispatch message
                     config = self.build_service_config(service, submission)
                     service_task = ServiceTask(dict(
@@ -929,10 +1099,10 @@ class Dispatcher(ThreadedCoreBase):
                 if self.dispatch_file(task, file_hash):
                     return True
         elif processing_files:
-            self.log.debug(f"[{task.submission.sid}] Not finished waiting on {len(processing_files)} "
-                           f"files: {list(processing_files)}")
+            self.log.debug("[%s] Not finished waiting on %d files: %s",
+                           task.submission.sid, len(processing_files), list(processing_files))
         else:
-            self.log.debug(f"[{task.submission.sid}] Finalizing submission.")
+            self.log.debug("[%s] Finalizing submission.", task.submission.sid)
             max_score = max(file_scores.values()) if file_scores else 0  # Submissions with no results have no score
             if self.tasks.pop(task.sid, None):
                 self.finalize_queue.put((task, max_score, checked))
@@ -1256,6 +1426,12 @@ class Dispatcher(ThreadedCoreBase):
         self.clear_timeout(task, sha256, service_name)
         task.service_logs.pop((sha256, service_name), None)
+        if summary.partial:
+            self.log.info("[%s/%s] %s returned partial results", sid, sha256, service_name)
+            task.partial_result(sha256, service_name)
+        else:
+            task.clear_monitoring_entry(sha256, service_name)
         # Don't process duplicates
         if (sha256, service_name) in task.service_results:
             return
@@ -1277,8 +1453,8 @@ class Dispatcher(ThreadedCoreBase):
         if isinstance(tags, list):
             self.log.warning("Deprecation: Old format of tags found. "
                              "This format changed with the release of 4.3 on 09-2022. "
-                             f"Rebuilding {service_name} may be required or the result of a cache hit. "
-                             "Proceeding with conversion to compatible format..")
+                             "Rebuilding %s may be required or the result of a cache hit. "
+                             "Proceeding with conversion to compatible format..", service_name)
             alt_tags = {}
             for t in tags:
                 key = f"{t['type']}:{t['value']}"
@@ -1293,11 +1469,6 @@ class Dispatcher(ThreadedCoreBase):
             else:
                 task.file_tags[sha256][key] = value
-        # Update the temporary data table for this file
-        for key, value in (temporary_data or {}).items():
-            if len(str(value)) <= self.config.submission.max_temp_data_length:
-                task.file_temporary_data[sha256][key] = value
         # Update children to include parent_relation, likely EXTRACTED
         if summary.children and isinstance(summary.children[0], str):
             old_children = typing.cast(list[str], summary.children)
@@ -1307,6 +1478,13 @@ class Dispatcher(ThreadedCoreBase):
         task.service_results[(sha256, service_name)] = summary
         task.register_children(sha256, [c for c, _ in summary.children])
+        # Update the temporary data table for this file
+        force_redispatch = set()
+        for key, value in (temporary_data or {}).items():
+            if len(str(value)) <= self.config.submission.max_temp_data_length:
+                if task.temporary_data[sha256].set_value(key, value):
+                    force_redispatch |= set(task.temporary_data_changed(key))
         # Set the depth of all extracted files, even if we won't be processing them
         depth_limit = self.config.submission.max_extraction_depth
         new_depth = task.file_depth[sha256] + 1
@@ -1322,7 +1500,7 @@ class Dispatcher(ThreadedCoreBase):
             if new_depth < depth_limit:
                 # Prepare the temporary data from the parent to build the temporary data table for
                 # these newly extract files
-                parent_data = task.file_temporary_data[sha256]
+                parent_data = task.temporary_data[sha256]
                 for extracted_sha256, parent_relation in summary.children:
@@ -1330,7 +1508,7 @@ class Dispatcher(ThreadedCoreBase):
                         continue
                     if len(task.active_files) > submission.params.max_extracted:
-                        self.log.info(f'[{sid}] hit extraction limit, dropping {extracted_sha256}')
+                        self.log.info('[%s] hit extraction limit, dropping %s', sid, extracted_sha256)
                         task.dropped_files.add(extracted_sha256)
                         self._dispatching_error(task, Error({
                             'archive_ts': None,
@@ -1351,21 +1529,20 @@ class Dispatcher(ThreadedCoreBase):
                     dispatched += 1
                     task.active_files.add(extracted_sha256)
-                    try:
-                        parent_ancestry = parent_data['ancestry']
-                    except KeyError:
-                        self.log.warn(f"[{sid} :: {sha256}] missing ancestry data.")
-                        parent_ancestry = []
-                    existing_ancestry = task.file_temporary_data.get(extracted_sha256, {}).get('ancestry', [])
+                    # Get the new ancestry data
                     file_info = self.get_fileinfo(task, extracted_sha256)
                     file_type = file_info.type if file_info else 'NOT_FOUND'
                     current_ancestry_node = dict(type=file_type, parent_relation=parent_relation,
                                                  sha256=extracted_sha256)
-                    task.file_temporary_data[extracted_sha256] = dict(parent_data)
-                    task.file_temporary_data[extracted_sha256]['ancestry'] = existing_ancestry
-                    [task.file_temporary_data[extracted_sha256]['ancestry'].append(ancestry + [current_ancestry_node])
-                     for ancestry in parent_ancestry]
+                    # Update ancestry data
+                    parent_ancestry = parent_data.read_key('ancestry') or []
+                    existing_ancestry = task.temporary_data[extracted_sha256].local_values.setdefault('ancestry', [])
+                    for ancestry in parent_ancestry:
+                        existing_ancestry.append(ancestry + [current_ancestry_node])
+                    # Trigger the processing of the extracted file
                     self.find_process_queue(sid).put(DispatchAction(kind=Action.dispatch_file, sid=sid,
                                                                     sha=extracted_sha256))
             else:
@@ -1388,13 +1565,15 @@ class Dispatcher(ThreadedCoreBase):
         # Check if its worth trying to run the next stage
         # Not worth running if we know we are waiting for another service
-        if any(_s == sha256 for _s, _ in task.running_services):
-            return
+        if not any(_s == sha256 for _s, _ in task.running_services):
+            force_redispatch.add(sha256)
         # Not worth running if we know we have services in queue
-        if any(_s == sha256 for _s, _ in task.queue_keys.keys()):
-            return
+        if not any(_s == sha256 for _s, _ in task.queue_keys.keys()):
+            force_redispatch.add(sha256)
         # Try to run the next stage
-        self.dispatch_file(task, sha256)
+        for sha256 in force_redispatch:
+            self.dispatch_file(task, sha256)
     @elasticapm.capture_span(span_type='dispatcher')
     def _dispatching_error(self, task: SubmissionTask, error):
@@ -1677,13 +1856,13 @@ class Dispatcher(ThreadedCoreBase):
     @elasticapm.capture_span(span_type='dispatcher')
     def list_outstanding(self, sid: str, queue_name: str):
-        response_queue = NamedQueue(queue_name, host=self.redis)
+        response_queue: NamedQueue[dict] = NamedQueue(queue_name, host=self.redis)
         outstanding: defaultdict[str, int] = defaultdict(int)
         task = self.tasks.get(sid)
         if task:
-            for sha, service_name in list(task.queue_keys.keys()):
+            for _sha, service_name in list(task.queue_keys.keys()):
                 outstanding[service_name] += 1
-            for sha, service_name in list(task.running_services):
+            for _sha, service_name in list(task.running_services):
                 outstanding[service_name] += 1
         response_queue.push(outstanding)
@@ -1698,7 +1877,7 @@ class Dispatcher(ThreadedCoreBase):
                 error_tasks = []
                 # iterate running tasks
-                for task_key, task_body in self.running_tasks:
+                for _task_key, task_body in self.running_tasks:
                     task = ServiceTask(task_body)
                     # Its a bad task if it's dispatcher isn't running
                     if task.metadata['dispatcher__'] not in dispatcher_instances:

{assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core/tasking_client.py RENAMED Viewed

@@ -159,8 +159,7 @@ class TaskingClient:
             if not self.datastore.service_delta.exists(service.name):
                 self.datastore.service_delta.save(service.name, {'version': service.version})
                 self.datastore.service_delta.commit()
-                self.log.info(f"{log_prefix}{service.name} "
-                              f"version ({service.version}) registered")
+                self.log.info(f"{log_prefix}{service.name} version ({service.version}) registered")
             new_heuristics = []
             if heuristics:

{assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/assemblyline_core.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: assemblyline-core
-Version: 4.5.0.64
+Version: 4.5.0.66
 Summary: Assemblyline 4 - Core components
 Home-page: https://github.com/CybercentreCanada/assemblyline-core/
 Author: CCCS Assemblyline development team

{assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_scheduler.py RENAMED Viewed

@@ -16,7 +16,7 @@ def redis(redis_connection):
     redis_connection.flushdb()
-def dummy_service(name, stage, category='static', accepts='', rejects=None, docid=None, extra_data=False):
+def dummy_service(name, stage, category='static', accepts='', rejects=None, docid=None, extra_data=False, monitored_keys=()):
     return Service({
         'name': name,
         'stage': stage,
@@ -28,6 +28,7 @@ def dummy_service(name, stage, category='static', accepts='', rejects=None, doci
         'version': '0',
         'enabled': True,
         'timeout': 2,
+        'monitored_keys': list(monitored_keys),
         'docker_config': {
             'image': 'somefakedockerimage:latest'
         }

{assemblyline-core-4.5.0.64 → assemblyline-core-4.5.0.66}/test/test_simulation.py RENAMED Viewed

@@ -54,6 +54,7 @@ def redis(redis_connection: Redis[Any]):
 _global_semaphore = threading.Semaphore(value=1)
+print_lock = threading.Lock()
 class MockService(ServerBase):
@@ -107,6 +108,25 @@ class MockService(ServerBase):
                 self.dispatch_client.service_failed(task.sid, error=error, error_key=get_random_id())
                 continue
+            partial = False
+            temp_data = {entry.name: entry.value for entry in task.temporary_submission_data}
+            with print_lock:
+                print(self.service_name)
+                print('instructions', instructions)
+                print('temp', temp_data)
+            if 'partial' in instructions:
+                partial = True
+                requirements = instructions['partial']
+                for key, value in requirements.items():
+                    if value in temp_data.get(key, ''):
+                        partial = False
+                    else:
+                        partial = True
+                        break
+            if partial:
+                print(self.service_name, "will produce partial results")
             result_data = {
                 'archive_ts': None,
                 'classification': 'U',
@@ -115,8 +135,8 @@ class MockService(ServerBase):
                     'service_tool_version': '0',
                     'service_name': self.service_name,
                 },
-                'result': {
-                },
+                'result': {},
+                'partial': partial,
                 'sha256': task.fileinfo.sha256,
                 'expiry_ts': time.time() + 600
             }
@@ -125,8 +145,12 @@ class MockService(ServerBase):
             result_data['response'].update(instructions.get('response', {}))
             result = Result(result_data)
-            result_key = instructions.get('result_key', get_random_id())
-            self.dispatch_client.service_finished(task.sid, result_key, result)
+            try:
+                result_key = instructions['result_key']
+            except KeyError:
+                result_key = result.build_key(get_random_id())
+            self.dispatch_client.service_finished(task.sid, result_key, result,
+                                                  temporary_data=instructions.get('temporary_data'))
 class CoreSession:
@@ -226,16 +250,24 @@ def core(request, redis, filestore, config, clean_datastore: AssemblylineDatasto
     # Register services
     stages = get_service_stage_hash(redis)
+    service_config: list[tuple[str, int, str, dict]] = [
+        ('pre', 1, 'EXTRACT', {'extra_data': True, 'monitored_keys': ['passwords']}),
+        ('core-a', 2, 'CORE', {}),
+        ('core-b', 1, 'CORE', {}),
+        ('finish', 1, 'POST', {'extra_data': True})
+    ]
     services = []
-    for svc, stage in [('pre', 'EXTRACT'), ('core-a', 'CORE'), ('core-b', 'CORE'), ('finish', 'POST')]:
-        ds.service.save(f'{svc}_0', dummy_service(svc, stage, docid=f'{svc}_0'))
+    for svc, count, stage, details in service_config:
+        ds.service.save(f'{svc}_0', dummy_service(svc, stage, docid=f'{svc}_0', **details))
         ds.service_delta.save(svc, ServiceDelta({
             'name': svc,
             'version': '0',
             'enabled': True
         }))
         stages.set(svc, ServiceStage.Running)
-        services.append(MockService(svc, ds, redis, filestore))
+        for _ in range(count):
+            services.append(MockService(svc, ds, redis, filestore))
     user = random_model_obj(User)
     user.uname = "user"
@@ -1134,8 +1166,189 @@ def test_tag_filter(core: CoreSession, metrics):
         metrics.expect('dispatcher', 'submissions_completed', 1)
         metrics.expect('dispatcher', 'files_completed', 1)
-        alert = core.dispatcher.postprocess_worker.alert_queue.pop(timeout=5)
+        alert: dict = core.dispatcher.postprocess_worker.alert_queue.pop(timeout=5)
         assert alert['submission']['sid'] == sub['sid']
     finally:
         core.dispatcher.postprocess_worker.actions.pop('test_process')
+def test_partial(core: CoreSession, metrics):
+    # Have pre produce a partial result; no service supplies the monitored key, so the result stays partial
+    sha, size = ready_body(core, {
+        'pre': {'partial': {'passwords': 'test_temp_data_monitoring'}},
+    })
+    core.ingest_queue.push(SubmissionInput(dict(
+        metadata={},
+        params=dict(
+            description="file abc123",
+            services=dict(selected=[]),
+            submitter='user',
+            groups=['user'],
+            max_extracted=10000
+        ),
+        notification=dict(
+            queue='temp-data-monitor',
+            threshold=0
+        ),
+        files=[dict(
+            sha256=sha,
+            size=size,
+            name='abc123'
+        )]
+    )).as_primitives())
+    notification_queue = NamedQueue('nq-temp-data-monitor', core.redis)
+    dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
+    assert dropped_task
+    dropped_task = IngestTask(dropped_task)
+    sub: Submission = core.ds.submission.get(dropped_task.submission.sid)
+    assert len(sub.errors) == 0
+    assert len(sub.results) == 4, 'results'
+    assert core.pre_service.hits[sha] == 1, 'pre_service.hits'
+    # Wait until we get feedback from the metrics channel
+    metrics.expect('ingester', 'submissions_ingested', 1)
+    metrics.expect('ingester', 'submissions_completed', 1)
+    metrics.expect('dispatcher', 'submissions_completed', 1)
+    metrics.expect('dispatcher', 'files_completed', 1)
+    partial_results = 0
+    for res in sub.results:
+        result = core.ds.get_single_result(res, as_obj=True)
+        assert result is not None, res
+        if result.partial:
+            partial_results += 1
+    assert partial_results == 1, 'partial_results'
+def test_temp_data_monitoring(core: CoreSession, metrics):
+    # Have pre produce a partial result, then have core-a update a monitored key
+    sha, size = ready_body(core, {
+        'pre': {'partial': {'passwords': 'test_temp_data_monitoring'}},
+        'core-a': {'temporary_data': {'passwords': ['test_temp_data_monitoring']}},
+        'final': {'temporary_data': {'passwords': ['some other password']}},
+    })
+    core.ingest_queue.push(SubmissionInput(dict(
+        metadata={},
+        params=dict(
+            description="file abc123",
+            services=dict(selected=[]),
+            submitter='user',
+            groups=['user'],
+            max_extracted=10000
+        ),
+        notification=dict(
+            queue='temp-data-monitor',
+            threshold=0
+        ),
+        files=[dict(
+            sha256=sha,
+            size=size,
+            name='abc123'
+        )]
+    )).as_primitives())
+    notification_queue = NamedQueue('nq-temp-data-monitor', core.redis)
+    dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
+    assert dropped_task
+    dropped_task = IngestTask(dropped_task)
+    sub: Submission = core.ds.submission.get(dropped_task.submission.sid)
+    assert len(sub.errors) == 0
+    assert len(sub.results) == 4, 'results'
+    assert core.pre_service.hits[sha] >= 2, f'pre_service.hits {core.pre_service.hits}'
+    # Wait until we get feedback from the metrics channel
+    metrics.expect('ingester', 'submissions_ingested', 1)
+    metrics.expect('ingester', 'submissions_completed', 1)
+    metrics.expect('dispatcher', 'submissions_completed', 1)
+    metrics.expect('dispatcher', 'files_completed', 1)
+    partial_results = 0
+    for res in sub.results:
+        result = core.ds.get_single_result(res, as_obj=True)
+        assert result is not None, res
+        if result.partial:
+            partial_results += 1
+    assert partial_results == 0, 'partial_results'
+def test_complex_extracted(core: CoreSession, metrics):
+    # Stages of this processing when everything goes well:
+    # 1. extract a file that will process to produce a partial result
+    # 2. hold a few seconds on the second stage of the root file to let child start
+    # 3. on the last stage of the root file produce the password
+    dispatcher.TIMEOUT_EXTRA_TIME = 100
+    child_sha, _ = ready_body(core, {
+        'pre': {'partial': {'passwords': 'test_temp_data_monitoring'}},
+    })
+    sha, size = ready_body(core, {
+        'pre': {
+            'response': {
+                'extracted': [{
+                    'name': child_sha,
+                    'sha256': child_sha,
+                    'description': 'abc',
+                    'classification': 'U'
+                }]
+            }
+        },
+        'core-a': {'lock': 60},
+        'finish': {'temporary_data': {'passwords': ['test_temp_data_monitoring']}},
+    })
+    core.ingest_queue.push(SubmissionInput(dict(
+        metadata={},
+        params=dict(
+            description="file abc123",
+            services=dict(selected=''),
+            submitter='user',
+            groups=['user'],
+            max_extracted=10000
+        ),
+        notification=dict(
+            queue='complex-extracted-file',
+            threshold=0
+        ),
+        files=[dict(
+            sha256=sha,
+            size=size,
+            name='abc123'
+        )]
+    )).as_primitives())
+    # Wait for the extract file to finish
+    metrics.expect('dispatcher', 'files_completed', 1)
+    # check that there is a pending result in the dispatcher
+    task = next(iter(core.dispatcher.tasks.values()))
+    assert 1 == sum(int(summary.partial) for summary in task.service_results.values())
+    _global_semaphore.release()
+    # Wait for the entire submission to finish
+    notification_queue = NamedQueue('nq-complex-extracted-file', core.redis)
+    dropped_task = notification_queue.pop(timeout=RESPONSE_TIMEOUT)
+    assert dropped_task
+    dropped_task = IngestTask(dropped_task)
+    sub: Submission = core.ds.submission.get(dropped_task.submission.sid)
+    assert len(sub.errors) == 0
+    assert len(sub.results) == 8, 'results'
+    assert core.pre_service.hits[sha] == 1, 'pre_service.hits[root]'
+    assert core.pre_service.hits[child_sha] >= 2, 'pre_service.hits[child]'
+    # Wait until we get feedback from the metrics channel
+    metrics.expect('ingester', 'submissions_ingested', 1)
+    metrics.expect('ingester', 'submissions_completed', 1)
+    metrics.expect('dispatcher', 'submissions_completed', 1)
+    metrics.expect('dispatcher', 'files_completed', 2)
+    partial_results = 0
+    for res in sub.results:
+        result = core.ds.get_single_result(res, as_obj=True)
+        assert result is not None, res
+        if result.partial:
+            partial_results += 1
+    assert partial_results == 0, 'partial_results'