sl-shared-assets 5.0.1__py3-none-any.whl → 5.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sl_shared_assets/__init__.py +4 -0
- sl_shared_assets/command_line_interfaces/manage.py +53 -0
- sl_shared_assets/data_classes/__init__.py +3 -3
- sl_shared_assets/data_classes/configuration_data.py +105 -138
- sl_shared_assets/data_classes/runtime_data.py +2 -4
- sl_shared_assets/data_classes/session_data.py +116 -86
- sl_shared_assets/data_classes/surgery_data.py +44 -44
- sl_shared_assets/server/__init__.py +1 -2
- sl_shared_assets/server/job.py +43 -50
- sl_shared_assets/server/pipeline.py +108 -119
- sl_shared_assets/server/server.py +45 -104
- sl_shared_assets/tools/__init__.py +4 -0
- sl_shared_assets/tools/packaging_tools.py +1 -1
- sl_shared_assets/tools/project_management_tools.py +67 -12
- {sl_shared_assets-5.0.1.dist-info → sl_shared_assets-5.1.0.dist-info}/METADATA +1 -1
- sl_shared_assets-5.1.0.dist-info/RECORD +23 -0
- sl_shared_assets-5.0.1.dist-info/RECORD +0 -23
- {sl_shared_assets-5.0.1.dist-info → sl_shared_assets-5.1.0.dist-info}/WHEEL +0 -0
- {sl_shared_assets-5.0.1.dist-info → sl_shared_assets-5.1.0.dist-info}/entry_points.txt +0 -0
- {sl_shared_assets-5.0.1.dist-info → sl_shared_assets-5.1.0.dist-info}/licenses/LICENSE +0 -0
sl_shared_assets/server/pipeline.py

@@ -1,6 +1,6 @@
-"""This module provides tools
-
-parallel
+"""This module provides tools used to run complex data processing pipelines on remote compute servers. A processing
+pipeline represents a higher unit of abstraction relative to the Job class, often leveraging multiple sequential or
+parallel jobs to process the data."""
 
 import copy
 from enum import IntEnum, StrEnum
@@ -20,11 +20,8 @@ from .server import Server
 
 
 class TrackerFileNames(StrEnum):
-    """
-    formation pipelines to track the progress
-
-    This enumeration standardizes the names for all processing tracker files used in the lab. It is designed to be used
-    via the get_processing_tracker() function to generate ProcessingTracker instances.
+    """Stores the names of the processing tracker .yaml files used by the Sun lab data preprocessing, processing, and
+    dataset formation pipelines to track the pipeline's progress.
 
     Notes:
         The elements in this enumeration match the elements in the ProcessingPipelines enumeration, since each valid
@@ -52,18 +49,14 @@ class TrackerFileNames(StrEnum):
 
 
 class ProcessingPipelines(StrEnum):
-    """
-
-    All processing pipelines currently supported by the lab codebase are defined in this enumeration. Primarily,
-    the elements from this enumeration are used in terminal messages and data logging entries to identify the pipelines
-    to the user.
+    """Stores the names of the data processing pipelines currently used in the lab.
 
     Notes:
-        The elements in this enumeration match the elements in the
+        The elements in this enumeration match the elements in the TrackerFileNames enumeration, since each valid
        ProcessingPipeline instance has an associated ProcessingTracker file instance.
 
         The order of pipelines in this enumeration loosely follows the sequence in which they are executed during the
-
+        Sun lab data workflow.
     """
 
     MANIFEST = "manifest generation"
@@ -72,8 +65,8 @@ class ProcessingPipelines(StrEnum):
     pipeline automatically conduct the manifest (re)generation at the end of their runtime."""
     CHECKSUM = "checksum resolution"
     """Checksum resolution pipeline. Primarily, it is used to verify that the raw data has been transferred to the
-    remote storage server from the main acquisition system PC intact. This pipeline is
-
+    remote storage server from the main acquisition system PC intact. This pipeline is also used to regenerate
+    (re-checksum) the data stored on the remote compute server."""
     PREPARATION = "processing preparation"
     """Data processing preparation pipeline. Since the compute server uses a two-volume design with a slow (HDD) storage
     volume and a fast (NVME) working volume, to optimize data processing performance, the data needs to be transferred
@@ -81,8 +74,7 @@ class ProcessingPipelines(StrEnum):
     volume to the working volume."""
     BEHAVIOR = "behavior processing"
     """Behavior processing pipeline. This pipeline is used to process .npz log files to extract animal behavior data
-    acquired during a single session (day).
-    to video and mesoscope frame data, and experiment configuration and task information."""
+    acquired during a single session (day)."""
     SUITE2P = "single-day suite2p processing"
     """Single-day suite2p pipeline. This pipeline is used to extract the cell activity data from 2-photon imaging data
     acquired during a single session (day)."""
@@ -91,24 +83,22 @@ class ProcessingPipelines(StrEnum):
     behavior video frames acquired during a single session (day)."""
     MULTIDAY = "multi-day suite2p processing"
     """Multi-day suite2p processing (cell tracking) pipeline. This pipeline is used to track cells processed with the
-    single-day suite2p pipelines across multiple days.
-    same dataset as the first step of dataset creation."""
+    single-day suite2p pipelines across multiple days."""
     FORGING = "dataset forging"
     """Dataset creation (forging) pipeline. This pipeline typically runs after the multi-day pipeline. It extracts and
-    integrates the processed data from
-    dataset."""
+    integrates the processed data from all sources into a unified dataset."""
     ARCHIVING = "data archiving"
-    """Data archiving pipeline. To conserve the (limited) space on the fast working volume,
-    processed and integrated into a stable dataset, the processed data folder is moved to the
-    folders under the root session folder on the processed data volume are
+    """Data archiving pipeline. To conserve the (limited) space on the remote compute server's fast working volume,
+    once the data has been processed and integrated into a stable dataset, the processed data folder is moved to the
+    storage volume. After the data is moved, all folders under the root session folder on the processed data volume are
+    deleted to free up the processing volume space."""
 
 
 class ProcessingStatus(IntEnum):
     """Maps integer-based processing pipeline status (state) codes to human-readable names.
 
-
-
-    class to communicate the status of the managed pipelines to external processes.
+    The codes from this enumeration are used by the ProcessingPipeline class to communicate the status of the managed
+    pipelines to manager processes that oversee the execution of each pipeline.
 
     Notes:
         The status codes from this enumeration track the state of the pipeline as a whole, instead of tracking the
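The hunks above tighten the docstrings of TrackerFileNames and ProcessingPipelines, which the new text says pair one-to-one by member name. A minimal sketch of how that pairing can be exploited; the TrackerFileNames values shown here are hypothetical, since the real members are not visible in this diff:

from enum import StrEnum

class ProcessingPipelines(StrEnum):
    BEHAVIOR = "behavior processing"
    SUITE2P = "single-day suite2p processing"

class TrackerFileNames(StrEnum):
    # Hypothetical file names; the actual values live in pipeline.py but are elided here.
    BEHAVIOR = "behavior_processing_tracker.yaml"
    SUITE2P = "suite2p_processing_tracker.yaml"

def tracker_file_for(pipeline: ProcessingPipelines) -> TrackerFileNames:
    # Lookup by member name is safe because the docstrings guarantee that the
    # elements of the two enumerations match one-to-one.
    return TrackerFileNames[pipeline.name]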
@@ -129,41 +119,40 @@ class ProcessingStatus(IntEnum):
 
 @dataclass()
 class ProcessingTracker(YamlConfig):
-    """Wraps the .yaml file that tracks the state of a data processing pipeline and provides tools for communicating
-    state between multiple processes in a thread-safe manner.
+    """Wraps the .yaml file that tracks the state of a data processing pipeline and provides tools for communicating
+    this state between multiple processes in a thread-safe manner.
 
     This class is used by all data processing pipelines running on the remote compute server(s) to prevent race
-    conditions
-
+    conditions. It is also used to evaluate the status (success / failure) of each pipeline as they are executed by the
+    remote server.
 
     Note:
-
-        when a pipeline starts running on the remote server, its tracker is switched into the 'running' (locked) state
-        until the pipeline completes, aborts, or encounters an error. When the tracker is locked, all modifications to
-        the tracker or processed data have to originate from the same process that started the pipeline that locked the
-        tracker file. This feature supports running complex processing pipelines that use multiple concurrent and / or
-        sequential processing jobs on the remote server.
-
-        This instance frequently refers to a 'manager process' in method documentation. A 'manager process' is the
+        This instance frequently refers to the 'manager process' in method documentation. A 'manager process' is the
         highest-level process that manages the tracked pipeline. When a pipeline runs on remote compute servers, the
         manager process is typically the process running on the non-server machine (user PC) that submits the remote
-        processing jobs to the compute server
-
+        processing jobs to the compute server. The worker process(es) that run the processing job(s) on the remote
+        compute servers are not considered manager processes.
+
+        The processing trackers work similarly to 'lock' files. When a pipeline starts running on the remote server, its
+        tracker is switched into the 'running' (locked) state until the pipeline completes, aborts, or encounters an
+        error. When the tracker is locked, all modifications to the tracker have to originate from the same manager
+        process that started the pipeline. This feature supports running complex processing pipelines that use multiple
+        concurrent and / or sequential processing jobs on the remote server.
     """
 
     file_path: Path
     """Stores the path to the .yaml file used to cache the tracker data on disk. The class instance functions as a
     wrapper around the data stored inside the specified .yaml file."""
     _complete: bool = False
-    """Tracks whether the processing
+    """Tracks whether the processing pipeline managed by this tracker has finished successfully."""
     _encountered_error: bool = False
-    """Tracks whether the processing
+    """Tracks whether the processing pipeline managed by this tracker has encountered an error and has finished
     unsuccessfully."""
     _running: bool = False
-    """Tracks whether the processing
+    """Tracks whether the processing pipeline managed by this tracker is currently running."""
     _manager_id: int = -1
     """Stores the xxHash3-64 hash value that represents the unique identifier of the manager process that started the
-
+    pipeline. The manager process is typically running on a remote control machine (computer) and is used to
     support processing runtimes that are distributed over multiple separate batch jobs on the compute server. This
     ID should be generated using the 'generate_manager_id()' function exposed by this library."""
     _lock_path: str = field(init=False)
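Every tracker operation in the hunks below wraps its read-modify-write of the .yaml file in a FileLock acquired with a 10-second timeout. A minimal sketch of that pattern, assuming the filelock package (whose Timeout exception subclasses the TimeoutError named in the Raises sections); the tracker file name is hypothetical:

from pathlib import Path

from filelock import FileLock, Timeout

tracker_path = Path("behavior_processing_tracker.yaml")  # hypothetical tracker file
lock = FileLock(str(tracker_path) + ".lock")
try:
    with lock.acquire(timeout=10.0):
        # Load the .yaml state, mutate it, and save it back while the .lock is held.
        # Holding the lock for the entire read-modify-write is what makes the tracker
        # safe to share between concurrent processes.
        ...
except Timeout:
    # filelock.Timeout subclasses TimeoutError, matching the documented Raises clause.
    raise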
@@ -218,20 +207,17 @@ class ProcessingTracker(YamlConfig):
 
     def start(self, manager_id: int, job_count: int = 1) -> None:
         """Configures the tracker file to indicate that a manager process is currently executing the tracked processing
-
+        pipeline.
 
-        Calling this method
-
-
-
+        Calling this method locks the tracked session and processing pipeline combination to only be accessible from the
+        manager process that calls this method. Calling this method for an already running pipeline managed by the same
+        process does not have any effect, so it is safe to call this method at the beginning of each processing job that
+        makes up the pipeline.
 
         Args:
-            manager_id: The unique
-
-            job_count: The total number of jobs to be executed as part of the tracked pipeline.
-                the stop() method properly track the end of the pipeline as a whole, rather than the end of intermediate
-                jobs. Primarily, this is used by multi-job pipelines where all jobs are submitted as part of a single
-                phase and the job completion order cannot be known in-advance.
+            manager_id: The unique identifier of the manager process which attempts to start the pipeline tracked by
+                this tracker file.
+            job_count: The total number of jobs to be executed as part of the tracked pipeline.
 
         Raises:
             TimeoutError: If the .lock file for the target .YAML file cannot be acquired within the timeout period.
@@ -242,23 +228,23 @@ class ProcessingTracker(YamlConfig):
             # Loads tracker state from the .yaml file
             self._load_state()
 
-            # If the
+            # If the pipeline is already running from a different process, aborts with an error.
             if self._running and manager_id != self._manager_id:
                 message = (
-                    f"Unable to start the processing
+                    f"Unable to start the processing pipeline from the manager process with id {manager_id}. The "
                     f"{self.file_path.name} tracker file indicates that the manager process with id {self._manager_id} "
-                    f"is currently executing the tracked
-                    f"the
+                    f"is currently executing the tracked pipeline. Only a single manager process is allowed to execute "
+                    f"the pipeline at the same time."
                 )
                 console.error(message=message, error=RuntimeError)
                 raise RuntimeError(message)  # Fallback to appease mypy, should not be reachable
 
-            # Otherwise, if the
+            # Otherwise, if the pipeline is already running for the current manager process, returns without modifying
             # the tracker data.
             elif self._running and manager_id == self._manager_id:
                 return
 
-            # Otherwise, locks the
+            # Otherwise, locks the pipeline for the current manager process and updates the cached tracker data
             self._running = True
             self._manager_id = manager_id
             self._complete = False
@@ -267,16 +253,16 @@ class ProcessingTracker(YamlConfig):
             self._save_state()
 
     def error(self, manager_id: int) -> None:
-        """Configures the tracker file to indicate that the tracked processing
+        """Configures the tracker file to indicate that the tracked processing pipeline encountered an error and failed
         to complete.
 
-        This method
-
-
+        This method unlocks the pipeline, allowing other manager processes to interface with the tracked pipeline. It
+        also updates the tracker file to reflect that the pipeline was interrupted due to an error, which is used by the
+        manager processes to detect and handle processing failures.
 
         Args:
-            manager_id: The unique
-
+            manager_id: The unique identifier of the manager process which attempts to report that the pipeline tracked
+                by this tracker file has encountered an error.
 
         Raises:
             TimeoutError: If the .lock file for the target .YAML file cannot be acquired within the timeout period.
@@ -286,22 +272,22 @@ class ProcessingTracker(YamlConfig):
             # Loads tracker state from the .yaml file
             self._load_state()
 
-            # If the
+            # If the pipeline is not running, returns without doing anything
             if not self._running:
                 return
 
-            # Ensures that only the active manager process can report
+            # Ensures that only the active manager process can report pipeline errors using the tracker file
             if manager_id != self._manager_id:
                 message = (
-                    f"Unable to report that the processing
-                    f"with id {manager_id}. The {self.file_path.name} tracker file indicates that the
+                    f"Unable to report that the processing pipeline has encountered an error from the manager process "
+                    f"with id {manager_id}. The {self.file_path.name} tracker file indicates that the pipeline is "
                     f"managed by the process with id {self._manager_id}, preventing other processes from interfacing "
-                    f"with the
+                    f"with the pipeline."
                 )
                 console.error(message=message, error=RuntimeError)
                 raise RuntimeError(message)  # Fallback to appease mypy, should not be reachable
 
-            # Indicates that the
+            # Indicates that the pipeline aborted with an error
             self._running = False
             self._manager_id = -1
             self._complete = False
@@ -309,15 +295,19 @@ class ProcessingTracker(YamlConfig):
             self._save_state()
 
     def stop(self, manager_id: int) -> None:
-        """Configures the tracker file to indicate that the tracked processing
+        """Configures the tracker file to indicate that the tracked processing pipeline has been completed successfully.
 
-        This method
-        also configures the tracker file to indicate that the
+        This method unlocks the pipeline, allowing other manager processes to interface with the tracked pipeline. It
+        also configures the tracker file to indicate that the pipeline has been completed successfully, which is used
         by the manager processes to detect and handle processing completion.
 
+        Notes:
+            This method tracks how many jobs executed as part of the tracked pipeline have been completed and only
+            marks the pipeline as complete if all its processing jobs have been completed.
+
         Args:
-            manager_id: The unique
-
+            manager_id: The unique identifier of the manager process which attempts to report that the pipeline tracked
+                by this tracker file has been completed successfully.
 
         Raises:
             TimeoutError: If the .lock file for the target .YAML file cannot be acquired within the timeout period.
@@ -327,17 +317,17 @@ class ProcessingTracker(YamlConfig):
             # Loads tracker state from the .yaml file
             self._load_state()
 
-            # If the
+            # If the pipeline is not running, does not do anything
             if not self._running:
                 return
 
-            # Ensures that only the active manager process can report
+            # Ensures that only the active manager process can report pipeline completion using the tracker file
             if manager_id != self._manager_id:
                 message = (
-                    f"Unable to report that the processing
-                    f"with id {manager_id}. The {self.file_path.name} tracker file indicates that the
-                    f"managed by the process with id {self._manager_id}, preventing other processes from
-                    f"with the
+                    f"Unable to report that the processing pipeline has completed successfully from the manager "
+                    f"process with id {manager_id}. The {self.file_path.name} tracker file indicates that the pipeline "
+                    f"is managed by the process with id {self._manager_id}, preventing other processes from "
+                    f"interfacing with the pipeline."
                 )
                 console.error(message=message, error=RuntimeError)
                 raise RuntimeError(message)  # Fallback to appease mypy, should not be reachable
@@ -345,7 +335,7 @@ class ProcessingTracker(YamlConfig):
             # Increments completed job tracker
             self._completed_jobs += 1
 
-            # If the pipeline has completed all required jobs, marks the
+            # If the pipeline has completed all required jobs, marks the pipeline as complete (stopped)
             if self._completed_jobs >= self._job_count:
                 self._running = False
                 self._manager_id = -1
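Taken together, start(), stop(), and error() suggest the following manager-side call pattern: stop() is called once per finished job and only unlocks the tracker after the final one. This is a hypothetical sketch based on the docstrings above; run_job() and the jobs collection are stand-ins, not package APIs:

from pathlib import Path

manager_id = generate_manager_id()
tracker = ProcessingTracker(file_path=Path("suite2p_processing_tracker.yaml"))  # hypothetical path

tracker.start(manager_id=manager_id, job_count=3)  # locks the pipeline for this manager process
try:
    for job in jobs:  # hypothetical collection of three processing jobs
        run_job(job)  # hypothetical job execution helper
        tracker.stop(manager_id)  # counts one finished job; unlocks after the third call
except Exception:
    tracker.error(manager_id)  # records the failure and unlocks the pipeline
    raise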
@@ -354,30 +344,32 @@ class ProcessingTracker(YamlConfig):
             self._save_state()
 
     def abort(self) -> None:
-        """Resets the
+        """Resets the pipeline tracker file to the default state.
 
-        This method can be used to reset the
-        instance methods, this method can be called from any manager process, even if the
-        another process. This method is only intended to be used in the case of emergency to
-
+        This method can be used to reset the pipeline tracker file, regardless of the current pipeline state. Unlike
+        other instance methods, this method can be called from any manager process, even if the pipeline is already
+        locked by another process. This method is only intended to be used in the case of emergency to unlock a
+        deadlocked pipeline.
         """
         lock = FileLock(self._lock_path)
         with lock.acquire(timeout=10.0):
             # Loads tracker state from the .yaml file.
             self._load_state()
 
-            # Resets the tracker file to the default state. Note, does not indicate that the
+            # Resets the tracker file to the default state. Note, does not indicate that the pipeline completed nor
             # that it has encountered an error.
             self._running = False
             self._manager_id = -1
+            self._completed_jobs = 0
+            self._job_count = 1
             self._complete = False
             self._encountered_error = False
             self._save_state()
 
     @property
     def is_complete(self) -> bool:
-        """Returns True if the tracker wrapped by the instance indicates that the processing
-        successfully and that the
+        """Returns True if the tracker wrapped by the instance indicates that the processing pipeline has been completed
+        successfully and that the pipeline is not currently ongoing."""
         lock = FileLock(self._lock_path)
         with lock.acquire(timeout=10.0):
             # Loads tracker state from the .yaml file
@@ -386,7 +378,7 @@ class ProcessingTracker(YamlConfig):
 
     @property
     def encountered_error(self) -> bool:
-        """Returns True if the tracker wrapped by the instance indicates that the processing
+        """Returns True if the tracker wrapped by the instance indicates that the processing pipeline has aborted due
         to encountering an error."""
         lock = FileLock(self._lock_path)
         with lock.acquire(timeout=10.0):
@@ -396,7 +388,7 @@ class ProcessingTracker(YamlConfig):
 
     @property
     def is_running(self) -> bool:
-        """Returns True if the tracker wrapped by the instance indicates that the processing
+        """Returns True if the tracker wrapped by the instance indicates that the processing pipeline is currently
         ongoing."""
         lock = FileLock(self._lock_path)
         with lock.acquire(timeout=10.0):
@@ -407,19 +399,19 @@ class ProcessingTracker(YamlConfig):
 
 @dataclass()
 class ProcessingPipeline:
-    """
+    """Provides an interface to construct and execute data processing pipelines on the target remote compute server.
 
     This class functions as an interface for all data processing pipelines running on Sun lab compute servers. It is
-    pipeline-type-agnostic and works for all data processing pipelines
-
+    pipeline-type-agnostic and works for all data processing pipelines used in the lab. After instantiation, the class
+    automatically handles all interactions with the server necessary to run the remote processing pipeline and
     verify the runtime outcome via the runtime_cycle() method that has to be called cyclically until the pipeline is
     complete.
 
     Notes:
-        Each pipeline
-        pipeline can be seen as an execution graph that sequentially submits batches of jobs to the
-        processing graph for each pipeline is fully resolved at the instantiation of this class
-        instance contains the necessary data to run the entire processing pipeline.
+        Each pipeline is executed as a series of one or more stages with each stage using one or more parallel jobs.
+        Therefore, each pipeline can be seen as an execution graph that sequentially submits batches of jobs to the
+        remote server. The processing graph for each pipeline is fully resolved at the instantiation of this class, so
+        each instance contains the necessary data to run the entire processing pipeline.
 
         The minimum self-contained unit of the processing pipeline is a single job. Since jobs can depend on the output
         of other jobs, they are organized into stages based on the dependency graph between jobs. Combined with cluster
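The runtime_cycle() contract described above implies a polling loop on the manager side. A hedged sketch; the diff only states that runtime_cycle() must be called cyclically until the pipeline completes, so the build_pipeline() factory, the completion check, and the polling interval are all assumptions, not package APIs:

import time

pipeline = build_pipeline()  # hypothetical factory that fills in the dataclass fields shown below

while True:
    pipeline.runtime_cycle()  # advances the pipeline: submits stage jobs and checks tracker state
    if pipeline_is_complete(pipeline):  # hypothetical completion check, not a package API
        break
    time.sleep(30.0)  # polling interval is an assumption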
@@ -431,27 +423,24 @@ class ProcessingPipeline:
     """Stores the name of the processing pipeline managed by this instance. Primarily, this is used to identify the
     pipeline to the user in terminal messages and logs."""
     server: Server
-    """
-    running the pipeline."""
+    """Stores the reference to the Server object used to interface with the remote server running the pipeline."""
     manager_id: int
-    """The unique identifier for the manager process that constructs and manages the runtime of the tracked pipeline.
-    This is used to ensure that only a single pipeline instance can work with each session's data at the same time on
-    the remote server."""
+    """The unique identifier for the manager process that constructs and manages the runtime of the tracked pipeline."""
     jobs: dict[int, tuple[tuple[Job, Path], ...]]
     """Stores the dictionary that maps the pipeline processing stage integer-codes to two-element tuples. Each tuple
-    stores the Job
-    stage."""
+    stores the Job object and the path to its remote working directory, to be submitted to the server as part of
+    executing that stage."""
     remote_tracker_path: Path
-    """
+    """Stores the path to the pipeline's processing tracker .yaml file stored on the remote compute server."""
     local_tracker_path: Path
-    """
-    this location when the instance verifies the outcome of
+    """Stores the path to the pipeline's processing tracker .yaml file on the local machine. The remote file is
+    pulled to this location when the instance verifies the outcome of the tracked processing pipeline."""
     session: str
-    """
+    """Stores the ID of the session whose data is being processed by the tracked pipeline."""
     animal: str
-    """
+    """Stores the ID of the animal whose data is being processed by the tracked pipeline."""
     project: str
-    """
+    """Stores the name of the project whose data is being processed by the tracked pipeline."""
     keep_job_logs: bool = False
     """Determines whether to keep the logs for the jobs making up the pipeline execution graph or (default) to remove
     them after pipeline successfully ends its runtime. If the pipeline fails to complete its runtime, the logs are kept
@@ -566,7 +555,7 @@ class ProcessingPipeline:
 
 
 def generate_manager_id() -> int:
-    """Generates and returns a unique integer
+    """Generates and returns a unique integer value that can be used to identify the manager process that calls
     this function.
 
     The identifier is generated based on the current timestamp, accurate to microseconds, and a random number between 1
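For illustration, a plausible reconstruction of generate_manager_id() based only on what this diff states: a microsecond-accurate timestamp combined with a random number, hashed with xxHash3-64 (per the _manager_id field docs). The docstring is cut off above, so the random-number upper bound and the exact combination format are assumptions; this sketch uses the python-xxhash package:

import random
from datetime import datetime, timezone

import xxhash  # provides the xxHash3-64 digest named in the _manager_id field documentation

def generate_manager_id() -> int:
    # Microsecond-accurate timestamp, per the docstring.
    timestamp = datetime.now(tz=timezone.utc).isoformat(timespec="microseconds")
    # The docstring says 'a random number between 1 and ...'; the upper bound is an assumption.
    salt = random.randint(1, 1_000_000)
    return xxhash.xxh3_64_intdigest(f"{timestamp}-{salt}".encode())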
|