assemblyline-core 4.5.1.dev426__tar.gz → 4.7.0.dev45__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/PKG-INFO +3 -2
  2. assemblyline_core-4.7.0.dev45/assemblyline_core/VERSION +1 -0
  3. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/dispatching/client.py +2 -1
  4. assemblyline_core-4.7.0.dev45/assemblyline_core/dispatching/dispatcher.py +327 -0
  5. assemblyline_core-4.7.0.dev45/assemblyline_core/ingester/ingester.py +116 -0
  6. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/metrics/heartbeat_formatter.py +1 -1
  7. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/replay/client.py +34 -15
  8. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/replay/loader/run.py +1 -1
  9. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/replay/loader/run_worker.py +7 -6
  10. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/scaler/controllers/docker_ctl.py +12 -6
  11. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/scaler/controllers/interface.py +1 -10
  12. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/scaler/controllers/kubernetes_ctl.py +102 -87
  13. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/scaler/scaler_server.py +61 -75
  14. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/submission_client.py +55 -4
  15. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/tasking_client.py +75 -34
  16. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/updater/helper.py +8 -10
  17. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/updater/run_updater.py +52 -30
  18. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/vacuum/worker.py +29 -26
  19. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/workflow/run_workflow.py +6 -1
  20. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core.egg-info/PKG-INFO +3 -2
  21. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core.egg-info/SOURCES.txt +2 -10
  22. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/test/test_replay.py +47 -5
  23. assemblyline_core-4.7.0.dev45/test/test_tasking_client.py +77 -0
  24. assemblyline_core-4.7.0.dev45/test/test_workflow.py +45 -0
  25. assemblyline_core-4.5.1.dev426/assemblyline_core/VERSION +0 -1
  26. assemblyline_core-4.5.1.dev426/assemblyline_core/dispatching/__main__.py +0 -5
  27. assemblyline_core-4.5.1.dev426/assemblyline_core/dispatching/dispatcher.py +0 -1992
  28. assemblyline_core-4.5.1.dev426/assemblyline_core/dispatching/timeout.py +0 -59
  29. assemblyline_core-4.5.1.dev426/assemblyline_core/ingester/__main__.py +0 -5
  30. assemblyline_core-4.5.1.dev426/assemblyline_core/ingester/ingester.py +0 -950
  31. assemblyline_core-4.5.1.dev426/assemblyline_core/plumber/run_plumber.py +0 -194
  32. assemblyline_core-4.5.1.dev426/assemblyline_core/workflow/__init__.py +0 -0
  33. assemblyline_core-4.5.1.dev426/test/test_dispatcher.py +0 -456
  34. assemblyline_core-4.5.1.dev426/test/test_plumber.py +0 -109
  35. assemblyline_core-4.5.1.dev426/test/test_simulation.py +0 -1354
  36. assemblyline_core-4.5.1.dev426/test/test_worker_ingest.py +0 -248
  37. assemblyline_core-4.5.1.dev426/test/test_worker_submit.py +0 -138
  38. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/LICENCE.md +0 -0
  39. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/README.md +0 -0
  40. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/__init__.py +0 -0
  41. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/alerter/__init__.py +0 -0
  42. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/alerter/processing.py +0 -0
  43. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/alerter/run_alerter.py +0 -0
  44. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/archiver/__init__.py +0 -0
  45. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/archiver/run_archiver.py +0 -0
  46. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/badlist_client.py +0 -0
  47. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/dispatching/__init__.py +0 -0
  48. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/dispatching/schedules.py +0 -0
  49. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/expiry/__init__.py +0 -0
  50. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/expiry/run_expiry.py +0 -0
  51. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/ingester/__init__.py +0 -0
  52. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/ingester/constants.py +0 -0
  53. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/metrics/__init__.py +0 -0
  54. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/metrics/es_metrics.py +0 -0
  55. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/metrics/helper.py +0 -0
  56. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/metrics/metrics_server.py +0 -0
  57. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/metrics/run_heartbeat_manager.py +0 -0
  58. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/metrics/run_metrics_aggregator.py +0 -0
  59. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/metrics/run_statistics_aggregator.py +0 -0
  60. {assemblyline_core-4.5.1.dev426/assemblyline_core/plumber → assemblyline_core-4.7.0.dev45/assemblyline_core/replay}/__init__.py +0 -0
  61. {assemblyline_core-4.5.1.dev426/assemblyline_core/replay → assemblyline_core-4.7.0.dev45/assemblyline_core/replay/creator}/__init__.py +0 -0
  62. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/replay/creator/run.py +0 -0
  63. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/replay/creator/run_worker.py +0 -0
  64. {assemblyline_core-4.5.1.dev426/assemblyline_core/replay/creator → assemblyline_core-4.7.0.dev45/assemblyline_core/replay/loader}/__init__.py +0 -0
  65. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/replay/replay.py +0 -0
  66. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/safelist_client.py +0 -0
  67. {assemblyline_core-4.5.1.dev426/assemblyline_core/replay/loader → assemblyline_core-4.7.0.dev45/assemblyline_core/scaler}/__init__.py +0 -0
  68. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/scaler/collection.py +0 -0
  69. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/scaler/controllers/__init__.py +0 -0
  70. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/scaler/run_scaler.py +0 -0
  71. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/server_base.py +0 -0
  72. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/signature_client.py +0 -0
  73. {assemblyline_core-4.5.1.dev426/assemblyline_core/scaler → assemblyline_core-4.7.0.dev45/assemblyline_core/updater}/__init__.py +0 -0
  74. {assemblyline_core-4.5.1.dev426/assemblyline_core/updater → assemblyline_core-4.7.0.dev45/assemblyline_core/vacuum}/__init__.py +0 -0
  75. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/vacuum/crawler.py +0 -0
  76. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/vacuum/department_map.py +0 -0
  77. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/vacuum/safelist.py +0 -0
  78. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core/vacuum/stream_map.py +0 -0
  79. {assemblyline_core-4.5.1.dev426/assemblyline_core/vacuum → assemblyline_core-4.7.0.dev45/assemblyline_core/workflow}/__init__.py +0 -0
  80. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core.egg-info/dependency_links.txt +0 -0
  81. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core.egg-info/requires.txt +0 -0
  82. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/assemblyline_core.egg-info/top_level.txt +0 -0
  83. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/setup.cfg +0 -0
  84. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/setup.py +0 -0
  85. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/test/test_alerter.py +0 -0
  86. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/test/test_badlist_client.py +0 -0
  87. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/test/test_expiry.py +0 -0
  88. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/test/test_safelist_client.py +0 -0
  89. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/test/test_scaler.py +0 -0
  90. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/test/test_scheduler.py +0 -0
  91. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/test/test_signature_client.py +0 -0
  92. {assemblyline_core-4.5.1.dev426 → assemblyline_core-4.7.0.dev45}/test/test_vacuum.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: assemblyline-core
3
- Version: 4.5.1.dev426
3
+ Version: 4.7.0.dev45
4
4
  Summary: Assemblyline 4 - Core components
5
5
  Home-page: https://github.com/CybercentreCanada/assemblyline-core/
6
6
  Author: CCCS Assemblyline development team
@@ -33,6 +33,7 @@ Dynamic: description-content-type
33
33
  Dynamic: home-page
34
34
  Dynamic: keywords
35
35
  Dynamic: license
36
+ Dynamic: license-file
36
37
  Dynamic: provides-extra
37
38
  Dynamic: requires-dist
38
39
  Dynamic: summary
@@ -0,0 +1 @@
1
+ 4.7.0.dev45
@@ -16,6 +16,7 @@ from assemblyline.common.constants import DISPATCH_RUNNING_TASK_HASH, SUBMISSION
16
16
  make_watcher_list_name, DISPATCH_TASK_HASH
17
17
  from assemblyline.common.forge import CachedObject, get_service_queue
18
18
  from assemblyline.common.isotime import now_as_iso
19
+ from assemblyline.common.dispatcher import Dispatcher
19
20
  from assemblyline.datastore.exceptions import VersionConflictException
20
21
  from assemblyline.odm.base import DATEFORMAT
21
22
  from assemblyline.odm.messages.dispatching import DispatcherCommandMessage, CREATE_WATCH, \
@@ -30,7 +31,7 @@ from assemblyline.remote.datatypes.hash import ExpiringHash, Hash
30
31
  from assemblyline.remote.datatypes.queues.named import NamedQueue
31
32
  from assemblyline.remote.datatypes.set import ExpiringSet, Set
32
33
  from assemblyline_core.dispatching.dispatcher import DISPATCH_START_EVENTS, DISPATCH_RESULT_QUEUE, \
33
- DISPATCH_COMMAND_QUEUE, QUEUE_EXPIRY, BAD_SID_HASH, ServiceTask, Dispatcher
34
+ DISPATCH_COMMAND_QUEUE, QUEUE_EXPIRY, BAD_SID_HASH, ServiceTask
34
35
 
35
36
 
36
37
  MAX_CANCEL_RESPONSE_WAIT = 10
@@ -0,0 +1,327 @@
1
+ from __future__ import annotations
2
+
3
+ import dataclasses
4
+ import enum
5
+ import os
6
+ import threading
7
+ import time
8
+ import uuid
9
+ from contextlib import contextmanager
10
+ from copy import deepcopy
11
+ from queue import Empty, PriorityQueue, Queue
12
+ from typing import TYPE_CHECKING, Any, Iterable, Optional
13
+
14
+ import elasticapm
15
+
16
+ from assemblyline.common.constants import (
17
+ DISPATCH_RUNNING_TASK_HASH,
18
+ DISPATCH_TASK_HASH,
19
+ SCALER_TIMEOUT_QUEUE,
20
+ SUBMISSION_QUEUE,
21
+ make_watcher_list_name,
22
+ )
23
+ from assemblyline.common.forge import (
24
+ get_apm_client,
25
+ get_classification,
26
+ get_service_queue,
27
+ )
28
+ from assemblyline.common.isotime import now_as_iso
29
+ from assemblyline.common.metrics import MetricsFactory
30
+ from assemblyline.common.postprocess import ActionWorker
31
+ from assemblyline.datastore.helper import AssemblylineDatastore
32
+ from assemblyline.odm.messages.changes import Operation, ServiceChange
33
+ from assemblyline.odm.messages.dispatcher_heartbeat import Metrics
34
+ from assemblyline.odm.messages.dispatching import (
35
+ CREATE_WATCH,
36
+ LIST_OUTSTANDING,
37
+ UPDATE_BAD_SID,
38
+ CreateWatch,
39
+ DispatcherCommandMessage,
40
+ ListOutstanding,
41
+ WatchQueueMessage,
42
+ )
43
+ from assemblyline.odm.messages.service_heartbeat import Metrics as ServiceMetrics
44
+ from assemblyline.odm.messages.submission import (
45
+ SubmissionMessage,
46
+ from_datastore_submission,
47
+ )
48
+ from assemblyline.odm.messages.task import FileInfo
49
+ from assemblyline.odm.messages.task import Task as ServiceTask
50
+ from assemblyline.odm.models.error import Error
51
+ from assemblyline.odm.models.result import Result
52
+ from assemblyline.odm.models.service import Service
53
+ from assemblyline.odm.models.submission import Submission, TraceEvent
54
+ from assemblyline.odm.models.user import User
55
+ from assemblyline.remote.datatypes.events import EventWatcher
56
+ from assemblyline.remote.datatypes.exporting_counter import export_metrics_once
57
+ from assemblyline.remote.datatypes.hash import Hash
58
+ from assemblyline.remote.datatypes.queues.comms import CommsQueue
59
+ from assemblyline.remote.datatypes.queues.named import NamedQueue
60
+ from assemblyline.remote.datatypes.set import ExpiringSet, Set
61
+ from assemblyline.remote.datatypes.user_quota_tracker import UserQuotaTracker
62
+ from assemblyline_core.server_base import ThreadedCoreBase
63
+
64
+ from ..ingester.constants import COMPLETE_QUEUE_NAME
65
+ from .schedules import Scheduler
66
+
67
+ if TYPE_CHECKING:
68
+ from redis import Redis
69
+
70
+ from assemblyline.odm.models.file import File
71
+ from assemblyline.odm.models.config import Config
72
+
73
+
74
# Transaction type handed to the APM client's begin_transaction() by apm_span().
APM_SPAN_TYPE = 'handle_message'

# Shutdown grace period in seconds, overridable through the AL_SHUTDOWN_GRACE environment variable.
AL_SHUTDOWN_GRACE = int(os.environ.get('AL_SHUTDOWN_GRACE', '60'))
AL_SHUTDOWN_QUIT = 60
# Whatever is left of the grace period once AL_SHUTDOWN_QUIT is subtracted, never negative.
# With the default grace period of 60 this evaluates to 0.
FINALIZING_WINDOW = max(AL_SHUTDOWN_GRACE - AL_SHUTDOWN_QUIT, 0)
# Batch sizes read from the environment (default 50 each).
RESULT_BATCH_SIZE = int(os.environ.get('DISPATCHER_RESULT_BATCH_SIZE', '50'))
ERROR_BATCH_SIZE = int(os.environ.get('DISPATCHER_ERROR_BATCH_SIZE', '50'))
DAY_IN_SECONDS = 24 * 60 * 60
82
+
83
+
84
class KeyType(enum.Enum):
    """String-valued enumeration with two members, ``OVERWRITE`` and ``UNION``.

    NOTE(review): presumably names how values stored under the same key are
    combined; nothing in the visible part of this module uses it - confirm.
    """

    OVERWRITE = 'overwrite'
    UNION = 'union'
87
+
88
+
89
class Action(enum.IntEnum):
    """Kinds of work item stored in ``DispatchAction.kind``.

    The members are integers, so they order by value. ``DispatchAction`` is an
    ordered dataclass that compares on ``kind`` only, which means a lower value
    here sorts ahead of a higher one.
    """

    start = 0
    result = 1
    dispatch_file = 2
    service_timeout = 3
    check_submission = 4
    bad_sid = 5
96
+
97
+
98
@dataclasses.dataclass(order=True)
class DispatchAction:
    """A unit of work for a dispatcher processing queue.

    Only ``kind`` takes part in the generated comparison methods; every other
    field is declared with ``compare=False``. Two actions of the same kind
    therefore compare equal and neither sorts before the other, whatever their
    payload.
    """

    # The single field used for ordering and equality.
    kind: Action
    # Submission id the action applies to (required, no default).
    sid: str = dataclasses.field(compare=False)
    # Optional payload fields; all default to None and are ignored when comparing.
    sha: Optional[str] = dataclasses.field(default=None, compare=False)
    service_name: Optional[str] = dataclasses.field(default=None, compare=False)
    worker_id: Optional[str] = dataclasses.field(default=None, compare=False)
    data: Any = dataclasses.field(default=None, compare=False)
    event: Optional[threading.Event] = dataclasses.field(default=None, compare=False)
107
+
108
+
109
+
110
@contextmanager
def apm_span(client, span_name: str):
    """Wrap the managed block in an APM transaction when a client is supplied.

    With a falsy ``client`` the block simply runs and any exception propagates
    untouched. Otherwise a transaction of type ``APM_SPAN_TYPE`` is begun before
    the block and ended under ``span_name`` with result ``'success'``, or with
    ``'exception'`` if an ``Exception`` escapes (it is then re-raised).

    Always yields ``None``.
    """
    if not client:
        # No APM configured: act as a transparent pass-through.
        yield None
        return

    try:
        client.begin_transaction(APM_SPAN_TYPE)
        yield None
        client.end_transaction(span_name, 'success')
    except Exception:
        client.end_transaction(span_name, 'exception')
        raise
122
+
123
+
124
# Name prefixes for the redis structures used by the dispatcher. The per-instance
# ones are completed by appending a dispatcher instance id (see Dispatcher.__init__).
DISPATCH_TASK_ASSIGNMENT = 'dispatcher-tasks-assigned-to-'
TASK_ASSIGNMENT_PATTERN = DISPATCH_TASK_ASSIGNMENT + '*'
DISPATCH_START_EVENTS = 'dispatcher-start-events-'
DISPATCH_RESULT_QUEUE = 'dispatcher-results-'
DISPATCH_COMMAND_QUEUE = 'dispatcher-commands-'
# Names of the shared hashes/sets (not suffixed with an instance id).
DISPATCH_DIRECTORY = 'dispatchers-directory'
DISPATCH_DIRECTORY_FINALIZE = 'dispatchers-directory-finalizing'
BAD_SID_HASH = 'bad-sid-hash'
# TTL passed to the per-instance NamedQueues (one hour, in seconds).
QUEUE_EXPIRY = 60*60
SERVICE_VERSION_EXPIRY_TIME = 30 * 60  # How old service version info can be before we ignore it
GUARD_TIMEOUT = 60*2
# Sleep between two passes of Dispatcher.timeout_backstop (ten minutes, in seconds).
GLOBAL_TASK_CHECK_INTERVAL = 60*10
TIMEOUT_EXTRA_TIME = 5
TIMEOUT_TEST_INTERVAL = 5
# Initial value of each per-worker "queue ready" semaphore.
MAX_RESULT_BUFFER = 64
# Worker thread counts, read from the environment and clamped to at least 1.
RESULT_THREADS = max(1, int(os.getenv('DISPATCHER_RESULT_THREADS', '2')))
FINALIZE_THREADS = max(1, int(os.getenv('DISPATCHER_FINALIZE_THREADS', '2')))

# After 20 minutes, check if a submission is still making progress.
# In the case of a crash somewhere else in the system, we may not have
# gotten a message we are expecting. This should prompt a retry in most
# cases.
SUBMISSION_TOTAL_TIMEOUT = 60 * 20
147
+
148
+
149
class Dispatcher(ThreadedCoreBase):
    """Dispatcher service process built on ``ThreadedCoreBase``.

    ``try_run`` starts the service-change watcher and keeps the worker threads
    alive; ``timeout_backstop`` periodically kills running service tasks whose
    owning dispatcher is no longer listed in the dispatcher directory.

    NOTE(review): ``TimeoutTable``, ``SubmissionTask``,
    ``service_worker_factory`` and ``_handle_service_change_event`` are used
    below but are not defined or imported in the part of this module visible
    here - confirm they are provided elsewhere before relying on this class.
    """

    @staticmethod
    def all_instances(persistent_redis: Redis):
        """Return the keys of the dispatcher directory hash.

        Restored because ``timeout_backstop`` calls ``Dispatcher.all_instances``;
        with the method commented out that call raised ``AttributeError``.
        """
        return Hash(DISPATCH_DIRECTORY, host=persistent_redis).keys()

    # @staticmethod
    # def instance_assignment_size(persistent_redis, instance_id):
    #     return Hash(DISPATCH_TASK_ASSIGNMENT + instance_id, host=persistent_redis).length()

    # @staticmethod
    # def instance_assignment(persistent_redis, instance_id) -> list[str]:
    #     return Hash(DISPATCH_TASK_ASSIGNMENT + instance_id, host=persistent_redis).keys()

    # @staticmethod
    # def all_queue_lengths(redis, instance_id):
    #     return {
    #         'start': NamedQueue(DISPATCH_START_EVENTS + instance_id, host=redis).length(),
    #         'result': NamedQueue(DISPATCH_RESULT_QUEUE + instance_id, host=redis).length(),
    #         'command': NamedQueue(DISPATCH_COMMAND_QUEUE + instance_id, host=redis).length()
    #     }

    def __init__(self, datastore=None, redis=None, redis_persist=None, logger=None,
                 config=None, counter_name: str = 'dispatcher'):
        """Create the dispatcher state, redis structures and helper objects.

        All connection arguments are forwarded to ``ThreadedCoreBase``;
        ``counter_name`` is the name given to the metrics counter.
        """
        super().__init__('assemblyline.dispatcher', config=config, datastore=datastore,
                         redis=redis, redis_persist=redis_persist, logger=logger)

        # Identity of this instance and the submissions it is tracking
        self.instance_id = uuid.uuid4().hex
        self.tasks: dict[str, SubmissionTask] = {}
        self.finalizing = threading.Event()
        self.finalizing_start = 0.0

        # Build some utility classes
        self.scheduler = Scheduler(self.datastore, self.config, self.redis)
        self.running_tasks: Hash[dict] = Hash(DISPATCH_RUNNING_TASK_HASH, host=self.redis)
        self.scaler_timeout_queue = NamedQueue(SCALER_TIMEOUT_QUEUE, host=self.redis_persist)

        self.classification_engine = get_classification()

        # Output. Duplicate our input traffic into this queue so it may be cloned by other systems
        self.traffic_queue = CommsQueue('submissions', self.redis)
        self.quota_tracker = UserQuotaTracker('submissions', timeout=60 * 60, host=self.redis_persist)
        self.submission_queue = NamedQueue(SUBMISSION_QUEUE, self.redis)

        # Table to track the running dispatchers
        self.dispatchers_directory: Hash[int] = Hash(DISPATCH_DIRECTORY, host=self.redis_persist)
        self.dispatchers_directory_finalize: Hash[int] = Hash(DISPATCH_DIRECTORY_FINALIZE, host=self.redis_persist)
        self.running_dispatchers_estimate = 1

        # Tables to track what submissions are running where
        self.active_submissions = Hash(DISPATCH_TASK_ASSIGNMENT+self.instance_id, host=self.redis_persist)
        self.submissions_assignments = Hash(DISPATCH_TASK_HASH, host=self.redis_persist)
        self.ingester_scanning = Hash('m-scanning-table', self.redis_persist)

        # Communications queues
        self.start_queue: NamedQueue[tuple[str, str, str, str]] =\
            NamedQueue(DISPATCH_START_EVENTS+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
        self.result_queue: NamedQueue[dict] =\
            NamedQueue(DISPATCH_RESULT_QUEUE+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)
        self.command_queue: NamedQueue[dict] =\
            NamedQueue(DISPATCH_COMMAND_QUEUE+self.instance_id, host=self.redis, ttl=QUEUE_EXPIRY)

        # Publish counters to the metrics sink.
        self.counter = MetricsFactory(metrics_type='dispatcher', schema=Metrics, name=counter_name,
                                      redis=self.redis, config=self.config)

        # APM is only enabled when a server url is configured
        self.apm_client = None
        if self.config.core.metrics.apm_server.server_url:
            elasticapm.instrument()
            self.apm_client = get_apm_client("dispatcher")

        # NOTE(review): TimeoutTable is not defined in the visible part of this module.
        self._service_timeouts: TimeoutTable[tuple[str, str, str], str] = TimeoutTable()
        self._submission_timeouts: TimeoutTable[str, None] = TimeoutTable()

        # Setup queues for work to be divided into
        self.process_queues: list[PriorityQueue[DispatchAction]] = [PriorityQueue() for _ in range(RESULT_THREADS)]
        self.queue_ready_signals: list[threading.Semaphore] = [threading.Semaphore(MAX_RESULT_BUFFER)
                                                               for _ in range(RESULT_THREADS)]

        # Queue of finished submissions/errors waiting to be saved into elastic
        self.finalize_queue = Queue()
        self.error_queue: Queue[tuple[str, Error]] = Queue()

        # Queue to hold service timeouts that need to be processed.
        # They will be held in this queue until results in redis are
        # already processed
        self.timeout_queue: Queue[DispatchAction] = Queue()

        # Utility object to handle post-processing actions
        self.postprocess_worker = ActionWorker(cache=False, config=self.config, datastore=self.datastore,
                                               redis_persist=self.redis_persist)

        # Update bad sid list
        self.redis_bad_sids = Set(BAD_SID_HASH, host=self.redis_persist)
        self.bad_sids: set[str] = set(self.redis_bad_sids.members())

        # Event Watchers
        self.service_change_watcher = EventWatcher(self.redis, deserializer=ServiceChange.deserialize)
        self.service_change_watcher.register('changes.services.*', self._handle_service_change_event)

    def stop(self):
        """Stop the base service, then the service-change watcher and the post-processing worker."""
        super().stop()
        self.service_change_watcher.stop()
        self.postprocess_worker.stop()

    def try_run(self):
        """Run the worker threads, then clear this instance's entries from the service queues."""
        self.log.info(f'Using dispatcher id {self.instance_id}')
        self.service_change_watcher.start()
        threads = {
            # Process to protect against old dead tasks timing out
            'Global Timeout Backstop': self.timeout_backstop,
        }

        for ii in range(RESULT_THREADS):
            # Process results
            threads[f'Service Update Worker #{ii}'] = self.service_worker_factory(ii)

        self.maintain_threads(threads)

        # If the dispatcher is exiting cleanly remove as many tasks from the service queues as we can
        service_queues = {}
        for task in self.tasks.values():
            for (_sha256, service_name), dispatch_key in task.queue_keys.items():
                # Build each service queue handle once and reuse it
                try:
                    s_queue = service_queues[service_name]
                except KeyError:
                    s_queue = get_service_queue(service_name, self.redis)
                    service_queues[service_name] = s_queue
                s_queue.remove(dispatch_key)

    def timeout_backstop(self):
        """Periodically kill running tasks that no live dispatcher will collect.

        Loops while the service is running, sleeping
        ``GLOBAL_TASK_CHECK_INTERVAL`` seconds between passes.
        """
        while self.running:
            cpu_mark = time.process_time()
            time_mark = time.time()

            # Start of process dispatcher transaction
            with apm_span(self.apm_client, 'timeout_backstop'):
                dispatcher_instances = set(Dispatcher.all_instances(persistent_redis=self.redis_persist))
                error_tasks = []

                # iterate running tasks
                for _task_key, task_body in self.running_tasks:
                    task = ServiceTask(task_body)
                    owner = task.metadata['dispatcher__']
                    if owner not in dispatcher_instances:
                        # It's a bad task if its dispatcher isn't running
                        error_tasks.append(task)
                    elif owner == self.instance_id and task.sid not in self.tasks:
                        # It's a bad task if it's OUR task, but we aren't tracking that submission anymore.
                        # (elif: a task matching both rules only needs to be queued once)
                        error_tasks.append(task)

                # Refresh our dispatcher list.
                dispatcher_instances = set(Dispatcher.all_instances(persistent_redis=self.redis_persist))
                other_dispatcher_instances = dispatcher_instances - {self.instance_id}

                # The remaining running tasks (probably) belong to dead dispatchers and can be killed
                for task in error_tasks:
                    # Check against our refreshed dispatcher list in case it changed during the previous scan
                    if task.metadata['dispatcher__'] in other_dispatcher_instances:
                        continue

                    # If its already been handled, we don't need to
                    if not self.running_tasks.pop(task.key()):
                        continue

                    # Kill the task that would report to a dead dispatcher
                    self.log.warning(f"[{task.sid}]Task killed by backstop {task.service_name} {task.fileinfo.sha256}")
                    self.scaler_timeout_queue.push({
                        'service': task.service_name,
                        'container': task.metadata['worker__']
                    })

                    # Report to the metrics system that a recoverable error has occurred for that service
                    export_metrics_once(task.service_name, ServiceMetrics, dict(fail_recoverable=1),
                                        host=task.metadata['worker__'], counter_type='service', redis=self.redis)

            self.counter.increment_execution_time('cpu_seconds', time.process_time() - cpu_mark)
            self.counter.increment_execution_time('busy_seconds', time.time() - time_mark)
            self.sleep(GLOBAL_TASK_CHECK_INTERVAL)
@@ -0,0 +1,116 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Ingester
4
+
5
+ Ingester is responsible for monitoring for incoming submission requests,
6
+ sending submissions, waiting for submissions to complete, sending a message
7
+ to a notification queue as specified by the submission and, based on the
8
+ score received, possibly sending a message to indicate that an alert should
9
+ be created.
10
+ """
11
+
12
+ import logging
13
+ import threading
14
+ import time
15
+ from os import environ
16
+ from random import random
17
+ from typing import Any, Iterable, List, Optional, Tuple
18
+
19
+ import elasticapm
20
+
21
+ from assemblyline import odm
22
+ from assemblyline.common import exceptions, forge, isotime
23
+ from assemblyline.common.constants import DROP_PRIORITY
24
+ from assemblyline.common.exceptions import get_stacktrace_info
25
+ from assemblyline.common.importing import load_module_by_path
26
+ from assemblyline.common.isotime import now, now_as_iso
27
+ from assemblyline.common.metrics import MetricsFactory
28
+ from assemblyline.common.postprocess import ActionWorker
29
+ from assemblyline.common.str_utils import dotdump, safe_str
30
+ from assemblyline.datastore.exceptions import DataStoreException
31
+ from assemblyline.filestore import CorruptedFileStoreException, FileStoreException
32
+ from assemblyline.odm.messages.ingest_heartbeat import Metrics
33
+ from assemblyline.odm.messages.submission import Submission as MessageSubmission
34
+ from assemblyline.odm.messages.submission import SubmissionMessage
35
+ from assemblyline.odm.models.alert import EXTENDED_SCAN_VALUES
36
+ from assemblyline.odm.models.filescore import FileScore
37
+ from assemblyline.odm.models.submission import Submission as DatabaseSubmission
38
+ from assemblyline.odm.models.submission import SubmissionParams
39
+ from assemblyline.odm.models.user import User
40
+ from assemblyline.remote.datatypes.events import EventWatcher
41
+ from assemblyline.remote.datatypes.hash import Hash
42
+ from assemblyline.remote.datatypes.queues.comms import CommsQueue
43
+ from assemblyline.remote.datatypes.queues.multi import MultiQueue
44
+ from assemblyline.remote.datatypes.queues.named import NamedQueue
45
+ from assemblyline.remote.datatypes.queues.priority import PriorityQueue
46
+ from assemblyline.remote.datatypes.user_quota_tracker import UserQuotaTracker
47
+ from assemblyline_core.dispatching.dispatcher import Dispatcher
48
+ from assemblyline_core.server_base import ThreadedCoreBase
49
+ from assemblyline_core.submission_client import SubmissionClient
50
+
51
+ from .constants import COMPLETE_QUEUE_NAME, INGEST_QUEUE_NAME, drop_chance
52
+
53
# Key/queue name prefixes used by the ingester.
_dup_prefix = 'w-m-'
_notification_queue_prefix = 'nq-'
_max_retries = 10
_retry_delay = 60 * 4  # Wait 4 minutes to retry
_max_time = 2 * 24 * 60 * 60  # Wait 2 days for responses.
HOUR_IN_SECONDS = 60 * 60
# Thread counts, overridable through the environment (defaults: 4 / 1 / 4).
COMPLETE_THREADS = int(environ.get('INGESTER_COMPLETE_THREADS', 4))
INGEST_THREADS = int(environ.get('INGESTER_INGEST_THREADS', 1))
SUBMIT_THREADS = int(environ.get('INGESTER_SUBMIT_THREADS', 4))
62
+
63
+
64
def must_drop(length: int, maximum: int) -> bool:
    """
    Decide at random whether an incoming submission should be dropped.

    A number drawn from random(), which lies in the range [0,1), is compared
    with the value returned by drop_chance(length, maximum). The submission is
    dropped (True is returned) when the random number is the smaller one.

    If the drop chance is zero or negative, must_drop always returns False
    since the value returned by random() cannot be less than 0. According to
    the table below that is the case whenever length is at most maximum.

    If length is greater than maximum, must_drop returns True with a
    probability that increases as the distance between maximum and length
    increases:

        Length           Chance of Dropping

        <= maximum       0
        1.5 * maximum    0.76
        2 * maximum      0.96
        3 * maximum      0.999

    NOTE(review): the figures above are carried over from the previous
    docstring (which described a tanh-based curve); drop_chance itself is
    imported from .constants and is not visible here - confirm.
    """
    # Draw first, then evaluate the chance: same evaluation order as before.
    roll = random()
    return roll < drop_chance(length, maximum)
85
+
86
+
87
@odm.model()
class IngestTask(odm.Model):
    """A submission travelling through the ingester, plus ingestion bookkeeping."""

    # Submission Parameters
    submission: MessageSubmission = odm.compound(MessageSubmission)

    # Information about the ingestion itself, parameters irrelevant
    retries = odm.Integer(default=0)

    # Fields added after a submission is complete for notification/bookkeeping processes
    failure = odm.Text(default='')  # If the ingestion has failed for some reason, what is it?
    score = odm.Optional(odm.Integer())  # Score from previous processing of this file
    extended_scan = odm.Enum(EXTENDED_SCAN_VALUES, default="skipped")  # Status of the extended scan
    ingest_id = odm.UUID()  # Ingestion Identifier
    ingest_time = odm.Date(default="NOW")  # Time at which the file was ingested
    notify_time = odm.Optional(odm.Date())  # Time at which the user is notified the submission is finished
    to_ingest = odm.Boolean(default=False)

    # Shortcuts for properties of the submission
    @property
    def file_size(self) -> int:
        """Combined size of every file attached to the submission."""
        total = 0
        for entry in self.submission.files:
            total += entry.size
        return total

    @property
    def params(self) -> SubmissionParams:
        """The submission's parameters."""
        return self.submission.params

    @property
    def sha256(self) -> str:
        """SHA256 of the first file in the submission."""
        first_file = self.submission.files[0]
        return first_file.sha256
116
+
@@ -6,6 +6,7 @@ from assemblyline.common import forge, metrics
6
6
  from assemblyline.common.archiving import ARCHIVE_QUEUE_NAME
7
7
  from assemblyline.common.constants import DISPATCH_TASK_HASH, SUBMISSION_QUEUE, \
8
8
  SERVICE_STATE_HASH, ServiceStatus
9
+ from assemblyline.common.dispatcher import Dispatcher
9
10
  from assemblyline.datastore.exceptions import SearchException
10
11
  from assemblyline.odm.messages.retrohunt_heartbeat import RetrohuntMessage
11
12
  from assemblyline.odm.messages.scaler_heartbeat import ScalerMessage
@@ -25,7 +26,6 @@ from assemblyline.remote.datatypes.queues.named import NamedQueue
25
26
  from assemblyline.remote.datatypes.queues.priority import PriorityQueue
26
27
 
27
28
  from assemblyline_core.alerter.run_alerter import ALERT_QUEUE_NAME, ALERT_RETRY_QUEUE_NAME
28
- from assemblyline_core.dispatching.dispatcher import Dispatcher
29
29
  from assemblyline_core.ingester import INGEST_QUEUE_NAME, drop_chance
30
30
  from assemblyline_core.ingester.constants import COMPLETE_QUEUE_NAME
31
31
 
@@ -4,13 +4,13 @@ import time
4
4
 
5
5
  from assemblyline.common import forge
6
6
  from assemblyline.common.bundling import create_bundle, import_bundle
7
+ from assemblyline.common.classification import InvalidClassification
7
8
  from assemblyline.odm import Model
8
- from assemblyline.remote.datatypes.queues.named import NamedQueue
9
9
  from assemblyline.remote.datatypes.hash import Hash
10
- from assemblyline_core.replay.replay import INPUT_TYPES
10
+ from assemblyline.remote.datatypes.queues.named import NamedQueue
11
11
  from assemblyline_core.badlist_client import BadlistClient
12
+ from assemblyline_core.replay.replay import INPUT_TYPES
12
13
  from assemblyline_core.safelist_client import SafelistClient
13
- from assemblyline_core.signature_client import SignatureClient
14
14
 
15
15
  EMPTY_WAIT_TIME = int(os.environ.get('EMPTY_WAIT_TIME', '30'))
16
16
  REPLAY_REQUESTED = 'requested'
@@ -296,13 +296,17 @@ class APIClient(ClientBase):
296
296
  def create_al_bundle(self, id, bundle_path, use_alert=False):
297
297
  self.al_client.bundle.create(id, output=bundle_path, use_alert=use_alert)
298
298
 
299
- def load_bundle(self, bundle_path, min_classification, rescan_services, exist_ok=True):
300
- self.al_client.bundle.import_bundle(bundle_path,
301
- min_classification=min_classification,
302
- rescan_services=rescan_services,
303
- exist_ok=exist_ok)
299
+ def load_bundle(self, bundle_path, min_classification, rescan_services, exist_ok=True, reclassification=None):
300
+ self.al_client.bundle.import_bundle(  # Delegate the import to the AL client's bundle API
301
+ bundle_path,
302
+ min_classification=min_classification,
303
+ rescan_services=rescan_services,
304
+ exist_ok=exist_ok,
305
+ reclassification=reclassification,  # NOTE(review): presumably a replacement for classifications the system rejects - confirm
306
+ to_ingest=True, # send submissions to ingester
307
+ )
304
308
 
305
- def load_json(self, file_path):
309
+ def load_json(self, file_path, reclassification=None):
306
310
  from assemblyline_client import ClientError
307
311
 
308
312
  # We're assuming all JSON that loaded has an "enabled" field
@@ -374,6 +378,7 @@ class DirectClient(ClientBase):
374
378
  # Initialize connection to redis-persistent for checkpointing
375
379
  redis_persist = get_client(config.core.redis.persistent.host,
376
380
  config.core.redis.persistent.port, False)
381
+ self.classification = forge.get_classification()
377
382
  self.datastore = forge.get_datastore(config=config)
378
383
  self.queues = {
379
384
  queue_type: NamedQueue(f"replay_{queue_type}", host=redis)
@@ -409,13 +414,17 @@ class DirectClient(ClientBase):
409
414
  temp_bundle_file = create_bundle(id, working_dir=os.path.dirname(bundle_path), use_alert=use_alert)
410
415
  os.rename(temp_bundle_file, bundle_path)
411
416
 
412
- def load_bundle(self, bundle_path, min_classification, rescan_services, exist_ok=True):
413
- import_bundle(bundle_path,
414
- min_classification=min_classification,
415
- rescan_services=rescan_services,
416
- exist_ok=exist_ok)
417
+ def load_bundle(self, bundle_path, min_classification, rescan_services, exist_ok=True, reclassification=None):
418
+ import_bundle(  # Import directly through assemblyline.common.bundling (no API client involved)
419
+ bundle_path,
420
+ min_classification=min_classification,
421
+ rescan_services=rescan_services,
422
+ exist_ok=exist_ok,
423
+ reclassification=reclassification,  # NOTE(review): presumably a replacement for classifications the system rejects - confirm
424
+ to_ingest=True, # send submissions to ingester
425
+ )
417
426
 
418
- def load_json(self, file_path):
427
+ def load_json(self, file_path, reclassification=None):
419
428
  # We're assuming all JSON that loaded has an "enabled" field
420
429
  collection = os.path.basename(file_path).split('_', 1)[0]
421
430
  with open(file_path) as fp:
@@ -428,6 +437,16 @@ class DirectClient(ClientBase):
428
437
 
429
438
  # Let's see if there's an existing document with the same ID in the collection
430
439
  obj = es_collection.get_if_exists(id, as_obj=False)
440
+ if obj:
441
+ # Check if the classification of the object is compatible with the system's classification
442
+ try:
443
+ self.classification.normalize_classification(obj['classification'])
444
+ except InvalidClassification:
445
+ if reclassification:
446
+ # If reclassification is requested, then we can change the classification
447
+ obj['classification'] = reclassification
448
+ else:
449
+ raise
431
450
 
432
451
  if collection == "workflow":
433
452
  # If there has been any edits by another user, then preserve the enabled state
@@ -86,8 +86,8 @@ class ReplayLoader(ReplayBase):
86
86
  self.maintain_threads(threads)
87
87
 
88
88
  def stop(self):
89
- super().stop()
90
89
  self.cache.close()  # Close the cache before delegating to the parent class
90
+ return super().stop()  # Propagate whatever the parent's stop() returns
91
91
 
92
92
 
93
93
  if __name__ == '__main__':
@@ -1,5 +1,5 @@
1
- import shutil
2
1
  import os
2
+ import shutil
3
3
 
4
4
  from cart import unpack_file
5
5
 
@@ -32,15 +32,16 @@ class ReplayLoaderWorker(ReplayBase):
32
32
  if file_path.endswith(".al_bundle"):
33
33
  self.client.load_bundle(file_path,
34
34
  min_classification=self.replay_config.loader.min_classification,
35
- rescan_services=self.replay_config.loader.rescan)
35
+ rescan_services=self.replay_config.loader.rescan,
36
+ reclassification=self.replay_config.loader.reclassification)
36
37
  elif file_path.endswith(".al_json"):
37
- self.client.load_json(file_path)
38
+ self.client.load_json(file_path, reclassification=self.replay_config.loader.reclassification)
38
39
 
39
40
  elif file_path.endswith(".al_json.cart"):
40
41
  cart_path = file_path
41
42
  file_path = file_path[:-5]
42
43
  unpack_file(cart_path, file_path)
43
- self.client.load_json(file_path)
44
+ self.client.load_json(file_path, reclassification=self.replay_config.loader.reclassification)
44
45
  os.unlink(cart_path)
45
46
 
46
47
  if os.path.exists(file_path):
@@ -55,11 +56,11 @@ class ReplayLoaderWorker(ReplayBase):
55
56
  # Terminate on NFS-related error
56
57
  self.log.warning("'Invalid cross-device link' exception detected. Terminating..")
57
58
  self.stop()
58
- except Exception:
59
+ except Exception as e:
59
60
  # Make sure failed directory exists
60
61
  os.makedirs(self.replay_config.loader.failed_directory, exist_ok=True)
61
62
 
62
- self.log.error(f"Failed to load the bundle file {file_path}, moving it to the failed directory.")
63
+ self.log.error(f"Failed to load the bundle file {file_path}, moving it to the failed directory. Reason: {e}")
63
64
  failed_path = os.path.join(self.replay_config.loader.failed_directory, os.path.basename(file_path))
64
65
  shutil.move(file_path, failed_path)
65
66