sl-shared-assets 4.0.1__py3-none-any.whl → 5.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of sl-shared-assets has been flagged as potentially problematic.
- sl_shared_assets/__init__.py +45 -42
- sl_shared_assets/command_line_interfaces/__init__.py +3 -0
- sl_shared_assets/command_line_interfaces/configure.py +173 -0
- sl_shared_assets/command_line_interfaces/manage.py +226 -0
- sl_shared_assets/data_classes/__init__.py +33 -32
- sl_shared_assets/data_classes/configuration_data.py +267 -79
- sl_shared_assets/data_classes/session_data.py +226 -289
- sl_shared_assets/server/__init__.py +24 -4
- sl_shared_assets/server/job.py +6 -7
- sl_shared_assets/server/pipeline.py +570 -0
- sl_shared_assets/server/server.py +57 -25
- sl_shared_assets/tools/__init__.py +9 -8
- sl_shared_assets/tools/packaging_tools.py +14 -25
- sl_shared_assets/tools/project_management_tools.py +602 -523
- sl_shared_assets/tools/transfer_tools.py +88 -23
- {sl_shared_assets-4.0.1.dist-info → sl_shared_assets-5.0.0.dist-info}/METADATA +46 -203
- sl_shared_assets-5.0.0.dist-info/RECORD +23 -0
- sl_shared_assets-5.0.0.dist-info/entry_points.txt +3 -0
- sl_shared_assets/__init__.pyi +0 -91
- sl_shared_assets/cli.py +0 -501
- sl_shared_assets/cli.pyi +0 -106
- sl_shared_assets/data_classes/__init__.pyi +0 -75
- sl_shared_assets/data_classes/configuration_data.pyi +0 -235
- sl_shared_assets/data_classes/runtime_data.pyi +0 -157
- sl_shared_assets/data_classes/session_data.pyi +0 -379
- sl_shared_assets/data_classes/surgery_data.pyi +0 -89
- sl_shared_assets/server/__init__.pyi +0 -11
- sl_shared_assets/server/job.pyi +0 -205
- sl_shared_assets/server/server.pyi +0 -298
- sl_shared_assets/tools/__init__.pyi +0 -19
- sl_shared_assets/tools/ascension_tools.py +0 -265
- sl_shared_assets/tools/ascension_tools.pyi +0 -68
- sl_shared_assets/tools/packaging_tools.pyi +0 -58
- sl_shared_assets/tools/project_management_tools.pyi +0 -239
- sl_shared_assets/tools/transfer_tools.pyi +0 -53
- sl_shared_assets-4.0.1.dist-info/RECORD +0 -36
- sl_shared_assets-4.0.1.dist-info/entry_points.txt +0 -7
- {sl_shared_assets-4.0.1.dist-info → sl_shared_assets-5.0.0.dist-info}/WHEEL +0 -0
- {sl_shared_assets-4.0.1.dist-info → sl_shared_assets-5.0.0.dist-info}/licenses/LICENSE +0 -0

sl_shared_assets/server/__init__.py
CHANGED
@@ -1,8 +1,28 @@
-"""This package provides the classes and methods used by all Sun lab libraries to
-
-
+"""This package provides the classes and methods used by all Sun lab libraries to work with the data stored on remote
+compute servers, such as the BioHPC server. It provides tools for submitting and monitoring jobs, running complex
+processing pipelines and interactively working with the data via a Jupyter lab server."""
 
 from .job import Job, JupyterJob
 from .server import Server, ServerCredentials, generate_server_credentials
+from .pipeline import (
+    ProcessingStatus,
+    TrackerFileNames,
+    ProcessingTracker,
+    ProcessingPipeline,
+    ProcessingPipelines,
+    generate_manager_id,
+)
 
-__all__ = [
+__all__ = [
+    "Job",
+    "JupyterJob",
+    "ProcessingPipeline",
+    "ProcessingPipelines",
+    "ProcessingStatus",
+    "ProcessingTracker",
+    "Server",
+    "ServerCredentials",
+    "TrackerFileNames",
+    "generate_manager_id",
+    "generate_server_credentials",
+]
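
As a hedged illustration of the expanded public surface of the server subpackage after this change (names taken from the __all__ list above), downstream code can now import the pipeline tooling directly:

    from sl_shared_assets.server import (
        Job,
        ProcessingPipeline,
        ProcessingPipelines,
        ProcessingStatus,
        ProcessingTracker,
        TrackerFileNames,
        generate_manager_id,
    )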
sl_shared_assets/server/job.py
CHANGED
@@ -7,7 +7,6 @@ Since version 3.0.0, this module also provides the specialized JupyterJob class
 notebook servers.
 """
 
-# noinspection PyProtectedMember
 import re
 from pathlib import Path
 import datetime
@@ -49,7 +48,7 @@ class _JupyterConnectionInfo:
 
 
 class Job:
-    """Aggregates the data of a single SLURM-managed job to be executed on the Sun lab
+    """Aggregates the data of a single SLURM-managed job to be executed on the Sun lab's remote compute server.
 
     This class provides the API for constructing any server-side job in the Sun lab. Internally, it wraps an instance
     of a Slurm class to package the job data into the format expected by the SLURM job manager. All jobs managed by this
@@ -222,7 +221,7 @@ class JupyterJob(Job):
         connection_info: Stores the JupyterConnectionInfo instance after the Jupyter server is instantiated.
         host: Stores the hostname of the remote server.
         user: Stores the username used to connect with the remote server.
-        connection_info_file: The absolute path to the file that stores connection information
+        connection_info_file: The absolute path to the file that stores connection information relative to the remote
             server root.
         _command: Stores the shell command for launching the Jupyter server.
     """
@@ -273,12 +272,12 @@ class JupyterJob(Job):
         """Builds the command to launch the Jupyter notebook server on the remote Sun lab server."""
 
         # Gets the hostname of the compute node and caches it in the connection data file. Also caches the port name.
-        self.add_command('echo "COMPUTE_NODE: $(hostname)" > {
-        self.add_command('echo "PORT: {}" >> {
+        self.add_command(f'echo "COMPUTE_NODE: $(hostname)" > {self.connection_info_file}')
+        self.add_command(f'echo "PORT: {self.port}" >> {self.connection_info_file}')
 
         # Generates a random access token for security and caches it in the connection data file.
         self.add_command("TOKEN=$(openssl rand -hex 24)")
-        self.add_command('echo "TOKEN: $TOKEN" >> {
+        self.add_command(f'echo "TOKEN: $TOKEN" >> {self.connection_info_file}')
 
         # Builds Jupyter startup command.
         jupyter_cmd = [
@@ -312,7 +311,7 @@
         information to be parsed.
         """
 
-        with open(
+        with info_file.open() as f:
             content = f.read()
 
         # Extracts information using regex
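
The refactored commands above write one "KEY: value" pair per line into the connection info file, and the updated parsing code reads the whole file before applying regular expressions. A minimal sketch of that parsing step, assuming only the three keys written by the echo commands (the helper name and return type below are illustrative, not the library's actual private API):

    import re
    from pathlib import Path

    def parse_connection_info(info_file: Path) -> dict[str, str]:
        # Mirrors the 'COMPUTE_NODE', 'PORT', and 'TOKEN' lines produced by the echo commands above.
        with info_file.open() as f:
            content = f.read()
        info: dict[str, str] = {}
        for key in ("COMPUTE_NODE", "PORT", "TOKEN"):
            match = re.search(rf"^{key}:\s*(\S+)", content, flags=re.MULTILINE)
            if match:
                info[key] = match.group(1)
        return info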

sl_shared_assets/server/pipeline.py
ADDED
@@ -0,0 +1,570 @@
+"""This module provides tools and classes for running complex data processing pipelines on remote compute servers.
+A Pipeline represents a higher unit of abstraction relative to the Job class, often leveraging multiple sequential or
+parallel processing jobs to conduct the required processing."""
+
+import copy
+from enum import IntEnum, StrEnum
+from random import randint
+import shutil as sh
+from pathlib import Path
+from dataclasses import field, dataclass
+
+from xxhash import xxh3_64
+from filelock import FileLock
+from ataraxis_base_utilities import console, ensure_directory_exists
+from ataraxis_data_structures import YamlConfig
+from ataraxis_time.time_helpers import get_timestamp
+
+from .job import Job
+from .server import Server
+
+
+class TrackerFileNames(StrEnum):
+    """Defines a set of processing tracker .yaml files used by the Sun lab data preprocessing, processing, and dataset
+    formation pipelines to track the progress of the remotely executed pipelines.
+
+    This enumeration standardizes the names for all processing tracker files used in the lab. It is designed to be used
+    via the get_processing_tracker() function to generate ProcessingTracker instances.
+
+    Notes:
+        The elements in this enumeration match the elements in the ProcessingPipelines enumeration, since each valid
+        ProcessingPipeline instance has an associated ProcessingTracker file instance.
+    """
+
+    MANIFEST = "manifest_generation_tracker.yaml"
+    """This file is used to track the state of the project manifest generation pipeline."""
+    CHECKSUM = "checksum_resolution_tracker.yaml"
+    """This file is used to track the state of the checksum resolution pipeline."""
+    PREPARATION = "processing_preparation_tracker.yaml"
+    """This file is used to track the state of the data processing preparation pipeline."""
+    BEHAVIOR = "behavior_processing_tracker.yaml"
+    """This file is used to track the state of the behavior log processing pipeline."""
+    SUITE2P = "suite2p_processing_tracker.yaml"
+    """This file is used to track the state of the single-day suite2p processing pipeline."""
+    VIDEO = "video_processing_tracker.yaml"
+    """This file is used to track the state of the video (DeepLabCut) processing pipeline."""
+    FORGING = "dataset_forging_tracker.yaml"
+    """This file is used to track the state of the dataset creation (forging) pipeline."""
+    MULTIDAY = "multiday_processing_tracker.yaml"
+    """This file is used to track the state of the multiday suite2p processing pipeline."""
+    ARCHIVING = "data_archiving_tracker.yaml"
+    """This file is used to track the state of the data archiving pipeline."""
+
+
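
Because TrackerFileNames derives from StrEnum, each member is itself a string, which is what the membership check and the ', '.join(...) call in ProcessingTracker.__post_init__ further down rely on. A small illustrative sketch (the session directory is hypothetical):

    from pathlib import Path

    session_root = Path("/working/example_project/mouse_01/2024-01-01-session")  # hypothetical layout
    tracker_file = session_root / TrackerFileNames.SUITE2P

    # StrEnum members compare equal to, and join like, their plain string values.
    assert TrackerFileNames.SUITE2P == "suite2p_processing_tracker.yaml"
    assert tracker_file.name in tuple(TrackerFileNames)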
+class ProcessingPipelines(StrEnum):
+    """Defines the set of processing pipelines currently supported in the Sun lab.
+
+    All processing pipelines currently supported by the lab codebase are defined in this enumeration. Primarily,
+    the elements from this enumeration are used in terminal messages and data logging entries to identify the pipelines
+    to the user.
+
+    Notes:
+        The elements in this enumeration match the elements in the TrackerFileNames enumeration, since each valid
+        ProcessingPipeline instance has an associated ProcessingTracker file instance.
+
+        The order of pipelines in this enumeration loosely follows the sequence in which they are executed during the
+        lifetime of the Sun lab data on the remote compute server.
+    """
+
+    MANIFEST = "manifest generation"
+    """Project manifest generation pipeline. This pipeline is generally not used in most runtime contexts. It allows
+    manually regenerating the project manifest .feather file, which is typically only used during testing. All other
+    pipelines automatically conduct the manifest (re)generation at the end of their runtime."""
+    CHECKSUM = "checksum resolution"
+    """Checksum resolution pipeline. Primarily, it is used to verify that the raw data has been transferred to the
+    remote storage server from the main acquisition system PC intact. This pipeline is sometimes also used to
+    regenerate (re-checksum) the data stored on the remote compute server."""
+    PREPARATION = "processing preparation"
+    """Data processing preparation pipeline. Since the compute server uses a two-volume design with a slow (HDD) storage
+    volume and a fast (NVME) working volume, to optimize data processing performance, the data needs to be transferred
+    to the working volume before processing. This pipeline copies the raw data for the target session from the storage
+    volume to the working volume."""
+    BEHAVIOR = "behavior processing"
+    """Behavior processing pipeline. This pipeline is used to process .npz log files to extract animal behavior data
+    acquired during a single session (day). The processed logs also contain the timestamps used to synchronize behavior
+    to video and mesoscope frame data, and experiment configuration and task information."""
+    SUITE2P = "single-day suite2p processing"
+    """Single-day suite2p pipeline. This pipeline is used to extract the cell activity data from 2-photon imaging data
+    acquired during a single session (day)."""
+    VIDEO = "video processing"
+    """DeepLabCut (Video) processing pipeline. This pipeline is used to extract animal pose estimation data from the
+    behavior video frames acquired during a single session (day)."""
+    MULTIDAY = "multi-day suite2p processing"
+    """Multi-day suite2p processing (cell tracking) pipeline. This pipeline is used to track cells processed with the
+    single-day suite2p pipelines across multiple days. It is executed for all sessions marked for integration into the
+    same dataset as the first step of dataset creation."""
+    FORGING = "dataset forging"
+    """Dataset creation (forging) pipeline. This pipeline typically runs after the multi-day pipeline. It extracts and
+    integrates the processed data from various sources such as brain activity, behavior, videos, etc., into a unified
+    dataset."""
+    ARCHIVING = "data archiving"
+    """Data archiving pipeline. To conserve the (limited) space on the fast working volume, once the data has been
+    processed and integrated into a stable dataset, the processed data folder is moved to the storage volume and all
+    folders under the root session folder on the processed data volume are deleted."""
+
+
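
The matching-elements note above can be read literally: both enumerations define the same member names (MANIFEST, CHECKSUM, PREPARATION, BEHAVIOR, SUITE2P, VIDEO, MULTIDAY, FORGING, ARCHIVING), so the tracker file that corresponds to a given pipeline can be looked up by member name, for example:

    pipeline = ProcessingPipelines.BEHAVIOR
    tracker_name = TrackerFileNames[pipeline.name]  # "behavior_processing_tracker.yaml"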
+class ProcessingStatus(IntEnum):
+    """Maps integer-based processing pipeline status (state) codes to human-readable names.
+
+    This enumeration is used to track and communicate the progress of Sun lab processing pipelines as they are executed
+    by the remote compute server. Specifically, the codes from this enumeration are used by the ProcessingPipeline
+    class to communicate the status of the managed pipelines to external processes.
+
+    Notes:
+        The status codes from this enumeration track the state of the pipeline as a whole, instead of tracking the
+        state of each job that comprises the pipeline.
+    """
+
+    RUNNING = 0
+    """The pipeline is currently running on the remote server. It may be executing (in progress) or waiting for
+    the required resources to become available (queued)."""
+    SUCCEEDED = 1
+    """The server has successfully completed the processing pipeline."""
+    FAILED = 2
+    """The server has failed to complete the pipeline due to a runtime error."""
+    ABORTED = 3
+    """The pipeline execution has been aborted prematurely, either by the manager process or due to an overriding
+    request from another user."""
+
+
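
Because ProcessingStatus is an IntEnum, the stored integer codes round-trip cleanly through the enumeration, which is what the ProcessingPipeline.status property further down relies on:

    assert ProcessingStatus(2) is ProcessingStatus.FAILED
    assert int(ProcessingStatus.RUNNING) == 0
    print(ProcessingStatus(1).name)  # SUCCEEDED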
+@dataclass()
+class ProcessingTracker(YamlConfig):
+    """Wraps the .yaml file that tracks the state of a data processing pipeline and provides tools for communicating the
+    state between multiple processes in a thread-safe manner.
+
+    This class is used by all data processing pipelines running on the remote compute server(s) to prevent race
+    conditions and ensure that pipelines have exclusive access to the processed data. It is also used to evaluate the
+    status (success / failure) of each pipeline as they are executed by the remote server.
+
+    Note:
+        In library version 4.0.0 the processing trackers have been refactored to work similarly to 'lock' files. That is,
+        when a pipeline starts running on the remote server, its tracker is switched into the 'running' (locked) state
+        until the pipeline completes, aborts, or encounters an error. When the tracker is locked, all modifications to
+        the tracker or processed data have to originate from the same process that started the pipeline that locked the
+        tracker file. This feature supports running complex processing pipelines that use multiple concurrent and / or
+        sequential processing jobs on the remote server.
+
+        This instance frequently refers to a 'manager process' in method documentation. A 'manager process' is the
+        highest-level process that manages the tracked pipeline. When a pipeline runs on remote compute servers, the
+        manager process is typically the process running on the non-server machine (user PC) that submits the remote
+        processing jobs to the compute server (via SSH or similar protocol). The worker process(es) that run the
+        processing job(s) on the remote compute servers are NOT considered manager processes.
+    """
+
+    file_path: Path
+    """Stores the path to the .yaml file used to cache the tracker data on disk. The class instance functions as a
+    wrapper around the data stored inside the specified .yaml file."""
+    _complete: bool = False
+    """Tracks whether the processing runtime managed by this tracker has finished successfully."""
+    _encountered_error: bool = False
+    """Tracks whether the processing runtime managed by this tracker has encountered an error and has finished
+    unsuccessfully."""
+    _running: bool = False
+    """Tracks whether the processing runtime managed by this tracker is currently running."""
+    _manager_id: int = -1
+    """Stores the xxHash3-64 hash value that represents the unique identifier of the manager process that started the
+    runtime. The manager process is typically running on a remote control machine (computer) and is used to
+    support processing runtimes that are distributed over multiple separate batch jobs on the compute server. This
+    ID should be generated using the 'generate_manager_id()' function exposed by this library."""
+    _lock_path: str = field(init=False)
+    """Stores the path to the .lock file used to ensure that only a single process can simultaneously access the data
+    stored inside the tracker file."""
+
+    def __post_init__(self) -> None:
+        # Generates the .lock file path for the target tracker .yaml file.
+        if self.file_path is not None:
+            self._lock_path = str(self.file_path.with_suffix(self.file_path.suffix + ".lock"))
+
+            # Ensures that the input processing tracker file name is supported.
+            if self.file_path.name not in tuple(TrackerFileNames):
+                message = (
+                    f"Unsupported processing tracker file encountered when instantiating a ProcessingTracker "
+                    f"instance: {self.file_path}. Currently, only the following tracker file names are "
+                    f"supported: {', '.join(tuple(TrackerFileNames))}."
+                )
+                console.error(message=message, error=ValueError)
+
+        else:
+            self._lock_path = ""
+
+    def _load_state(self) -> None:
+        """Reads the current processing state from the wrapped .YAML file."""
+        if self.file_path.exists():
+            # Loads the data for the state values but does not replace the file path or lock attributes.
+            instance: ProcessingTracker = self.from_yaml(self.file_path)  # type: ignore
+            self._complete = copy.copy(instance._complete)
+            self._encountered_error = copy.copy(instance._encountered_error)
+            self._running = copy.copy(instance._running)
+            self._manager_id = copy.copy(instance._manager_id)
+        else:
+            # Otherwise, if the tracker file does not exist, generates a new .yaml file using default instance values
+            # and saves it to disk using the specified tracker file path.
+            self._save_state()
+
+    def _save_state(self) -> None:
+        """Saves the current processing state stored inside instance attributes to the specified .YAML file."""
+        # Resets the _lock_path and file_path to None before dumping the data to .YAML to avoid issues with loading it
+        # back.
+        original = copy.deepcopy(self)
+        original.file_path = None  # type: ignore
+        original._lock_path = None  # type: ignore
+        original.to_yaml(file_path=self.file_path)
+
+    def start(self, manager_id: int) -> None:
+        """Configures the tracker file to indicate that a manager process is currently executing the tracked processing
+        runtime.
+
+        Calling this method effectively 'locks' the tracked session and processing runtime combination to only be
+        accessible from the manager process that calls this method. Calling this method for an already running runtime
+        managed by the same process does not have any effect, so it is safe to call this method at the beginning of
+        each processing job that makes up the runtime.
+
+        Args:
+            manager_id: The unique xxHash-64 hash identifier of the manager process which attempts to start the runtime
+                tracked by this tracker file.
+
+        Raises:
+            TimeoutError: If the .lock file for the target .YAML file cannot be acquired within the timeout period.
+        """
+        # Acquires the lock
+        lock = FileLock(self._lock_path)
+        with lock.acquire(timeout=10.0):
+            # Loads tracker state from the .yaml file
+            self._load_state()
+
+            # If the runtime is already running from a different process, aborts with an error.
+            if self._running and manager_id != self._manager_id:
+                message = (
+                    f"Unable to start the processing runtime from the manager process with id {manager_id}. The "
+                    f"{self.file_path.name} tracker file indicates that the manager process with id {self._manager_id} "
+                    f"is currently executing the tracked runtime. Only a single manager process is allowed to execute "
+                    f"the runtime at the same time."
+                )
+                console.error(message=message, error=RuntimeError)
+                raise RuntimeError(message)  # Fallback to appease mypy, should not be reachable
+
+            # Otherwise, if the runtime is already running for the current manager process, returns without modifying
+            # the tracker data.
+            elif self._running and manager_id == self._manager_id:
+                return
+
+            # Otherwise, locks the runtime for the current manager process and updates the cached tracker data
+            self._running = True
+            self._manager_id = manager_id
+            self._complete = False
+            self._encountered_error = False
+            self._save_state()
+
+    def error(self, manager_id: int) -> None:
+        """Configures the tracker file to indicate that the tracked processing runtime encountered an error and failed
+        to complete.
+
+        This method fulfills two main purposes. First, it 'unlocks' the runtime, allowing other manager processes to
+        interface with the tracked runtime. Second, it updates the tracker file to reflect that the runtime was
+        interrupted due to an error, which is used by the manager processes to detect and handle processing failures.
+
+        Args:
+            manager_id: The unique xxHash-64 hash identifier of the manager process which attempts to report that the
+                runtime tracked by this tracker file has encountered an error.
+
+        Raises:
+            TimeoutError: If the .lock file for the target .YAML file cannot be acquired within the timeout period.
+        """
+        lock = FileLock(self._lock_path)
+        with lock.acquire(timeout=10.0):
+            # Loads tracker state from the .yaml file
+            self._load_state()
+
+            # If the runtime is not running, returns without doing anything
+            if not self._running:
+                return
+
+            # Ensures that only the active manager process can report runtime errors using the tracker file
+            if manager_id != self._manager_id:
+                message = (
+                    f"Unable to report that the processing runtime has encountered an error from the manager process "
+                    f"with id {manager_id}. The {self.file_path.name} tracker file indicates that the runtime is "
+                    f"managed by the process with id {self._manager_id}, preventing other processes from interfacing "
+                    f"with the runtime."
+                )
+                console.error(message=message, error=RuntimeError)
+                raise RuntimeError(message)  # Fallback to appease mypy, should not be reachable
+
+            # Indicates that the runtime aborted with an error
+            self._running = False
+            self._manager_id = -1
+            self._complete = False
+            self._encountered_error = True
+            self._save_state()
+
+    def stop(self, manager_id: int) -> None:
+        """Configures the tracker file to indicate that the tracked processing runtime has been completed successfully.
+
+        This method 'unlocks' the runtime, allowing other manager processes to interface with the tracked runtime. It
+        also configures the tracker file to indicate that the runtime has been completed successfully, which is used
+        by the manager processes to detect and handle processing completion.
+
+        Args:
+            manager_id: The unique xxHash-64 hash identifier of the manager process which attempts to report that the
+                runtime tracked by this tracker file has been completed successfully.
+
+        Raises:
+            TimeoutError: If the .lock file for the target .YAML file cannot be acquired within the timeout period.
+        """
+        lock = FileLock(self._lock_path)
+        with lock.acquire(timeout=10.0):
+            # Loads tracker state from the .yaml file
+            self._load_state()
+
+            # If the runtime is not running, does not do anything
+            if not self._running:
+                return
+
+            # Ensures that only the active manager process can report runtime completion using the tracker file
+            if manager_id != self._manager_id:
+                message = (
+                    f"Unable to report that the processing runtime has completed successfully from the manager process "
+                    f"with id {manager_id}. The {self.file_path.name} tracker file indicates that the runtime is "
+                    f"managed by the process with id {self._manager_id}, preventing other processes from interfacing "
+                    f"with the runtime."
+                )
+                console.error(message=message, error=RuntimeError)
+                raise RuntimeError(message)  # Fallback to appease mypy, should not be reachable
+
+            # Otherwise, marks the runtime as complete (stopped)
+            self._running = False
+            self._manager_id = -1
+            self._complete = True
+            self._encountered_error = False
+            self._save_state()
+
+    def abort(self) -> None:
+        """Resets the runtime tracker file to the default state.
+
+        This method can be used to reset the runtime tracker file, regardless of the current runtime state. Unlike other
+        instance methods, this method can be called from any manager process, even if the runtime is already locked by
+        another process. This method is only intended to be used in the case of emergency to 'unlock' a deadlocked
+        runtime.
+        """
+        lock = FileLock(self._lock_path)
+        with lock.acquire(timeout=10.0):
+            # Loads tracker state from the .yaml file.
+            self._load_state()
+
+            # Resets the tracker file to the default state. Note, does not indicate that the runtime completed nor
+            # that it has encountered an error.
+            self._running = False
+            self._manager_id = -1
+            self._complete = False
+            self._encountered_error = False
+            self._save_state()
+
+    @property
+    def is_complete(self) -> bool:
+        """Returns True if the tracker wrapped by the instance indicates that the processing runtime has been completed
+        successfully and that the runtime is not currently ongoing."""
+        lock = FileLock(self._lock_path)
+        with lock.acquire(timeout=10.0):
+            # Loads tracker state from the .yaml file
+            self._load_state()
+            return self._complete
+
+    @property
+    def encountered_error(self) -> bool:
+        """Returns True if the tracker wrapped by the instance indicates that the processing runtime has aborted due
+        to encountering an error."""
+        lock = FileLock(self._lock_path)
+        with lock.acquire(timeout=10.0):
+            # Loads tracker state from the .yaml file
+            self._load_state()
+            return self._encountered_error
+
+    @property
+    def is_running(self) -> bool:
+        """Returns True if the tracker wrapped by the instance indicates that the processing runtime is currently
+        ongoing."""
+        lock = FileLock(self._lock_path)
+        with lock.acquire(timeout=10.0):
+            # Loads tracker state from the .yaml file
+            self._load_state()
+            return self._running
+
+
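
A minimal usage sketch of the lock-style protocol described in the class docstring, as a manager process might apply it; the tracker location is hypothetical and process_session() stands in for the actual work:

    from pathlib import Path

    manager_id = generate_manager_id()  # defined at the end of this module
    tracker = ProcessingTracker(file_path=Path("/working/session_001") / TrackerFileNames.BEHAVIOR)

    tracker.start(manager_id)           # locks the runtime for this manager process
    try:
        process_session()               # hypothetical processing work
    except Exception:
        tracker.error(manager_id)       # unlocks the runtime and records the failure
        raise
    else:
        tracker.stop(manager_id)        # unlocks the runtime and records success

    print(tracker.is_complete, tracker.encountered_error)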
+@dataclass()
+class ProcessingPipeline:
+    """Encapsulates access to a processing pipeline running on the remote compute server.
+
+    This class functions as an interface for all data processing pipelines running on Sun lab compute servers. It is
+    pipeline-type-agnostic and works for all data processing pipelines supported by this library. After instantiation,
+    the class automatically handles all interactions with the server necessary to run the remote processing pipeline and
+    verify the runtime outcome via the runtime_cycle() method that has to be called cyclically until the pipeline is
+    complete.
+
+    Notes:
+        Each pipeline may be executed in one or more stages, each stage using one or more parallel jobs. As such, each
+        pipeline can be seen as an execution graph that sequentially submits batches of jobs to the remote server. The
+        processing graph for each pipeline is fully resolved at the instantiation of this class instance, so each
+        instance contains the necessary data to run the entire processing pipeline.
+
+        The minimum self-contained unit of the processing pipeline is a single job. Since jobs can depend on the output
+        of other jobs, they are organized into stages based on the dependency graph between jobs. Combined with cluster
+        management software, such as SLURM, this class can efficiently execute processing pipelines on scalable compute
+        clusters.
+    """
+
+    pipeline_type: ProcessingPipelines
+    """Stores the name of the processing pipeline managed by this instance. Primarily, this is used to identify the
+    pipeline to the user in terminal messages and logs."""
+    server: Server
+    """Stores the reference to the Server object that maintains bidirectional communication with the remote server
+    running the pipeline."""
+    manager_id: int
+    """The unique identifier for the manager process that constructs and manages the runtime of the tracked pipeline.
+    This is used to ensure that only a single pipeline instance can work with each session's data at the same time on
+    the remote server."""
+    jobs: dict[int, tuple[tuple[Job, Path], ...]]
+    """Stores the dictionary that maps the pipeline processing stage integer-codes to tuples of (Job, Path) pairs. Each
+    pair stores a Job object and the path to its remote working directory; these are submitted to the server at the
+    corresponding stage."""
+    remote_tracker_path: Path
+    """The path to the pipeline's processing tracker .yaml file stored on the remote compute server."""
+    local_tracker_path: Path
+    """The path to the pipeline's processing tracker .yaml file on the local machine. The remote file is pulled to
+    this location when the instance verifies the outcome of each tracked pipeline's processing stage."""
+    session: str
+    """The ID of the session whose data is being processed by the tracked pipeline."""
+    animal: str
+    """The ID of the animal whose data is being processed by the tracked pipeline."""
+    project: str
+    """The name of the project whose data is being processed by the tracked pipeline."""
+    keep_job_logs: bool = False
+    """Determines whether to keep the logs for the jobs making up the pipeline execution graph or (default) to remove
+    them after the pipeline successfully ends its runtime. If the pipeline fails to complete its runtime, the logs are
+    kept regardless of this setting."""
+    pipeline_status: ProcessingStatus | int = ProcessingStatus.RUNNING
+    """Stores the current status of the tracked remote pipeline. This field is updated each time the runtime_cycle()
+    instance method is called."""
+    _pipeline_stage: int = 0
+    """Stores the current stage of the tracked pipeline. This field is monotonically incremented by the runtime_cycle()
+    method to sequentially submit batches of jobs to the server in a processing-stage-driven fashion."""
+
+    def __post_init__(self) -> None:
+        """Carries out the necessary filesystem setup tasks to support pipeline execution."""
+
+        # Ensures that the input processing pipeline type is supported.
+        if self.pipeline_type not in tuple(ProcessingPipelines):
+            message = (
+                f"Unsupported processing pipeline type encountered when instantiating a ProcessingPipeline "
+                f"instance: {self.pipeline_type}. Currently, only the following pipeline types are "
+                f"supported: {', '.join(tuple(ProcessingPipelines))}."
+            )
+            console.error(message=message, error=ValueError)
+
+        ensure_directory_exists(self.local_tracker_path)  # Ensures that the local temporary directory exists
+
+    def runtime_cycle(self) -> None:
+        """Checks the current status of the tracked pipeline and, if necessary, submits additional batches of jobs to
+        the remote server to progress the pipeline.
+
+        This method is the main entry point for all interactions with the processing pipeline managed by this instance.
+        It checks the current state of the pipeline, advances the pipeline's processing stage, and submits the necessary
+        jobs to the remote server. The runtime manager process should call this method repeatedly (cyclically) to run
+        the pipeline until the 'is_running' property of the instance returns False.
+
+        Notes:
+            While the 'is_running' property can be used to determine whether the pipeline is still running, to resolve
+            the final status of the pipeline (success or failure), the manager process should access the
+            'status' instance property.
+        """
+
+        # This clause is executed the first time the method is called for the newly initialized pipeline tracker
+        # instance. It submits the first batch of processing jobs (first stage) to the remote server. For one-stage
+        # pipelines, this is the only time when pipeline jobs are submitted to the server.
+        if self._pipeline_stage == 0:
+            self._pipeline_stage += 1
+            self._submit_jobs()
+
+        # Waits until all jobs submitted to the server as part of the current processing stage are completed before
+        # advancing further.
+        for job, _ in self.jobs[self._pipeline_stage]:  # Ignores working directories as part of this iteration.
+            if not self.server.job_complete(job=job):
+                return
+
+        # If all jobs for the current processing stage have completed, checks the pipeline's processing tracker file to
+        # determine if all jobs completed successfully.
+        self.server.pull_file(remote_file_path=self.remote_tracker_path, local_file_path=self.local_tracker_path)
+        tracker = ProcessingTracker(self.local_tracker_path)
+
+        # If the stage failed due to encountering an error, removes the local tracker copy and marks the pipeline
+        # as 'failed'. It is expected that the pipeline state is then handled by the manager process to notify the
+        # user about the runtime failure.
+        if tracker.encountered_error:
+            sh.rmtree(self.local_tracker_path.parent)  # Removes local temporary data
+            self.pipeline_status = ProcessingStatus.FAILED  # Updates the processing status to 'failed'
+
+        # If this was the last processing stage, the tracker indicates that the processing has been completed. In this
+        # case, initializes the shutdown sequence:
+        elif tracker.is_complete:
+            sh.rmtree(self.local_tracker_path.parent)  # Removes local temporary data
+            self.pipeline_status = ProcessingStatus.SUCCEEDED  # Updates the job status to 'succeeded'
+
+            # If the pipeline was configured to remove logs after completing successfully, removes the runtime log for
+            # each job submitted as part of this pipeline from the remote server.
+            if not self.keep_job_logs:
+                for stage_jobs in self.jobs.values():
+                    for _, directory in stage_jobs:  # Ignores job objects as part of this iteration.
+                        self.server.remove(remote_path=directory, recursive=True, is_dir=True)
+
+        # If the processing is not complete (according to the tracker), this indicates that the pipeline has more
+        # stages to execute. In this case, increments the processing stage tracker and submits the next batch of jobs
+        # to the server.
+        elif tracker.is_running:
+            self._pipeline_stage += 1
+            self._submit_jobs()
+
+        # The final and the rarest state: the pipeline was aborted before it finished the runtime. Generally, this state
+        # should not be encountered during most runtimes.
+        else:
+            self.pipeline_status = ProcessingStatus.ABORTED
+
+    def _submit_jobs(self) -> None:
+        """This worker method submits the processing jobs for the currently active processing stage to the remote
+        server.
+
+        It is used internally by the runtime_cycle() method to iteratively execute all stages of the managed processing
+        pipeline on the remote server.
+        """
+        for job, _ in self.jobs[self._pipeline_stage]:
+            self.server.submit_job(job=job, verbose=False)  # Silences terminal printouts
+
+    @property
+    def is_running(self) -> bool:
+        """Returns True if the pipeline is currently running, False otherwise."""
+        if self.pipeline_status == ProcessingStatus.RUNNING:
+            return True
+        return False
+
+    @property
+    def status(self) -> ProcessingStatus:
+        """Returns the current status of the pipeline packaged into a ProcessingStatus instance."""
+        return ProcessingStatus(self.pipeline_status)
+
+
+def generate_manager_id() -> int:
+    """Generates and returns a unique integer identifier that can be used to identify the manager process that calls
+    this function.
+
+    The identifier is generated based on the current timestamp, accurate to microseconds, and a random number between 1
+    and 9999999999999. This ensures that the identifier is unique for each function call. The generated identifier
+    string is converted to a unique integer value using the xxHash-64 algorithm before it is returned to the caller.
+
+    Notes:
+        This function should be used to generate manager process identifiers for working with ProcessingTracker
+        instances from sl-shared-assets version 4.0.0 and above.
+    """
+    timestamp = get_timestamp()
+    random_number = randint(1, 9999999999999)
+    manager_id = f"{timestamp}_{random_number}"
+    id_hash = xxh3_64()
+    id_hash.update(manager_id)
+    return id_hash.intdigest()
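
Putting the pieces together, a manager process would drive a pipeline roughly as sketched below; the Server instance, job graph, and tracker paths are placeholders prepared elsewhere rather than API calls documented in this diff:

    import time

    # Assumes `server`, `stage_jobs`, `remote_tracker`, and `local_tracker` were prepared beforehand.
    pipeline = ProcessingPipeline(
        pipeline_type=ProcessingPipelines.SUITE2P,
        server=server,
        manager_id=generate_manager_id(),
        jobs=stage_jobs,                  # dict[int, tuple[tuple[Job, Path], ...]] keyed by stage number, starting at 1
        remote_tracker_path=remote_tracker,
        local_tracker_path=local_tracker,
        session="session_001",
        animal="mouse_01",
        project="example_project",
    )

    # runtime_cycle() submits each stage's jobs and advances the pipeline; poll it until the pipeline stops running.
    while pipeline.is_running:
        pipeline.runtime_cycle()
        time.sleep(30)

    if pipeline.status is ProcessingStatus.SUCCEEDED:
        print("Pipeline finished successfully.")
    else:
        print(f"Pipeline ended with status: {pipeline.status.name}")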