sl-shared-assets 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sl-shared-assets might be problematic.
- sl_shared_assets/__init__.py +96 -0
- sl_shared_assets/__init__.pyi +87 -0
- sl_shared_assets/cli.py +72 -0
- sl_shared_assets/cli.pyi +17 -0
- sl_shared_assets/data_classes.py +1435 -0
- sl_shared_assets/data_classes.pyi +646 -0
- sl_shared_assets/packaging_tools.py +133 -0
- sl_shared_assets/packaging_tools.pyi +52 -0
- sl_shared_assets/py.typed +0 -0
- sl_shared_assets/server.py +293 -0
- sl_shared_assets/server.pyi +112 -0
- sl_shared_assets/suite2p.py +449 -0
- sl_shared_assets/suite2p.pyi +188 -0
- sl_shared_assets/transfer_tools.py +119 -0
- sl_shared_assets/transfer_tools.pyi +53 -0
- sl_shared_assets-1.0.0rc1.dist-info/METADATA +849 -0
- sl_shared_assets-1.0.0rc1.dist-info/RECORD +20 -0
- sl_shared_assets-1.0.0rc1.dist-info/WHEEL +4 -0
- sl_shared_assets-1.0.0rc1.dist-info/entry_points.txt +3 -0
- sl_shared_assets-1.0.0rc1.dist-info/licenses/LICENSE +674 -0
sl_shared_assets/packaging_tools.py
@@ -0,0 +1,133 @@
+"""This module provides methods for packaging session runtime data for transmission over the network. The methods from
+this module work in tandem with methods offered by transfer_tools.py to ensure the integrity of the transferred data.
+"""
+
+import os
+from pathlib import Path
+from functools import partial
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+from tqdm import tqdm
+import xxhash
+
+
+def _calculate_file_checksum(base_directory: Path, file_path: Path) -> tuple[str, bytes]:
+    """Calculates xxHash3-128 checksum for a single file and its path relative to the base directory.
+
+    This function is passed to parallel workers used by the calculate_directory_checksum() function that iteratively
+    calculates the checksum for all files inside a directory. Each call to this function returns the checksum for the
+    target file, which includes both the contents of the file and its path relative to the base directory.
+
+    Args:
+        base_directory: The path to the base (root) directory which is being checksummed by the main
+            'calculate_directory_checksum' function.
+        file_path: The absolute path to the target file.
+
+    Returns:
+        A tuple with two elements. The first element is the path to the file relative to the base directory. The second
+        element is the xxHash3-128 checksum that covers the relative path and the contents of the file.
+    """
+    # Initializes the hashsum object.
+    checksum = xxhash.xxh3_128()
+
+    # Encodes the relative path and appends it to the checksum. This ensures that the hashsum reflects both the state
+    # of individual files and the layout of the overall encoded directory structure.
+    relative_path = str(file_path.relative_to(base_directory))
+    checksum.update(relative_path.encode())
+
+    # Extends the checksum to reflect the file data state. Uses 8 MB chunks to avoid excessive RAM hogging at the cost
+    # of slightly reduced throughput.
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(1024 * 1024 * 8), b""):
+            checksum.update(chunk)
+
+    # Returns both the path and the file checksum. Although the relative path information is already encoded in the
+    # hashsum, it is re-encoded at the directory level to protect against future changes to the per-file hashsum
+    # calculation logic. It is extra work, but it improves the overall checksum security.
+    return relative_path, checksum.digest()
+
+
+def calculate_directory_checksum(
+    directory: Path, num_processes: int | None = None, batch: bool = False, save_checksum: bool = True
+) -> str:
+    """Calculates xxHash3-128 checksum for the input directory, which includes the data of all contained files and
+    the directory structure information.
+
+    This function is used to generate a checksum for the raw_data directory of each experiment or training session.
+    Checksums are used to verify the session data integrity during transmission between the PC that acquired the data
+    and long-term storage locations, such as the Synology NAS or the BioHPC server. The function can be configured to
+    write the generated checksum as a hexadecimal string to the ax_checksum.txt file stored at the highest level of the
+    input directory.
+
+    Note:
+        This function uses multiprocessing to efficiently parallelize checksum calculation for multiple files. In
+        combination with xxHash3, this achieves a significant speedup over more common checksums, such as MD5 and
+        SHA256. Note that xxHash3 is not suitable for security purposes and is only used to ensure data integrity.
+
+        The function notifies the user about the checksum calculation process via the terminal.
+
+        The returned checksum accounts for both the contents of each file and the layout of the input directory
+        structure.
+
+    Args:
+        directory: The Path to the directory to be checksummed.
+        num_processes: The number of CPU processes to use for parallelizing checksum calculation. If set to None, the
+            function defaults to using (logical CPU count - 4).
+        batch: Determines whether the function is called as part of batch-processing multiple directories. This is used
+            to optimize progress reporting and avoid cluttering the terminal.
+        save_checksum: Determines whether the checksum should be saved (written) to a .txt file.
+
+    Returns:
+        The xxHash3-128 checksum for the input directory as a hexadecimal string.
+    """
+    # Determines the number of parallel processes to use.
+    if num_processes is None:
+        num_processes = max(1, os.cpu_count() - 4)  # type: ignore
+
+    # Determines the path to each file inside the input directory structure and sorts them for consistency
+    path: Path
+    files = sorted(
+        path
+        for path in directory.rglob("*")
+        if path.is_file() and path.stem != "ax_checksum" and path.suffix != ".txt"  # Excludes checksum and .txt files
+    )
+
+    # Precreates the directory checksum
+    checksum = xxhash.xxh3_128()
+
+    # Process files in parallel
+    with ProcessPoolExecutor(max_workers=num_processes) as executor:
+        # Creates the partial function with fixed base_directory (the first argument of _calculate_file_checksum())
+        process_file = partial(_calculate_file_checksum, directory)
+
+        # Submits all tasks to be executed in parallel
+        future_to_path = {executor.submit(process_file, file): file for file in files}
+
+        # Collects results as they complete
+        results = []
+        if not batch:
+            with tqdm(
+                total=len(files), desc=f"Calculating checksum for {Path(*directory.parts[-6:])}", unit="files"
+            ) as pbar:
+                for future in as_completed(future_to_path):
+                    results.append(future.result())
+                    pbar.update(1)
+        else:
+            # For batch mode, uses a direct list comprehension with as_completed. This avoids the overhead of progress
+            # tracking while maintaining parallel processing, avoiding terminal clutter in batched contexts.
+            results = [future.result() for future in as_completed(future_to_path)]
+
+    # Sorts results for consistency and combines them into the final checksum
+    for file_path, file_checksum in sorted(results):
+        checksum.update(file_path.encode())
+        checksum.update(file_checksum)
+
+    checksum_hexstr = checksum.hexdigest()
+
+    # Writes the hash to ax_checksum.txt in the root directory
+    if save_checksum:
+        checksum_path = directory / "ax_checksum.txt"
+        with open(checksum_path, "w") as f:
+            f.write(checksum_hexstr)
+
+    return checksum_hexstr
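A minimal usage sketch for the packaging_tools module shown above (not part of the package diff). The session path and process count are hypothetical placeholders.

from pathlib import Path

from sl_shared_assets.packaging_tools import calculate_directory_checksum

# Hypothetical raw_data directory of a single session; substitute a real path.
session_raw_data = Path("/data/mouse_001/2025_01_01/raw_data")

# Computes the directory checksum with 4 worker processes. Because save_checksum defaults to True,
# the resulting hexadecimal string is also written to raw_data/ax_checksum.txt.
checksum = calculate_directory_checksum(directory=session_raw_data, num_processes=4)
print(f"xxHash3-128 checksum: {checksum}")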
sl_shared_assets/packaging_tools.pyi
@@ -0,0 +1,52 @@
+from pathlib import Path
+
+def _calculate_file_checksum(base_directory: Path, file_path: Path) -> tuple[str, bytes]:
+    """Calculates xxHash3-128 checksum for a single file and its path relative to the base directory.
+
+    This function is passed to parallel workers used by the calculate_directory_checksum() function that iteratively
+    calculates the checksum for all files inside a directory. Each call to this function returns the checksum for the
+    target file, which includes both the contents of the file and its path relative to the base directory.
+
+    Args:
+        base_directory: The path to the base (root) directory which is being checksummed by the main
+            'calculate_directory_checksum' function.
+        file_path: The absolute path to the target file.
+
+    Returns:
+        A tuple with two elements. The first element is the path to the file relative to the base directory. The second
+        element is the xxHash3-128 checksum that covers the relative path and the contents of the file.
+    """
+
+def calculate_directory_checksum(
+    directory: Path, num_processes: int | None = None, batch: bool = False, save_checksum: bool = True
+) -> str:
+    """Calculates xxHash3-128 checksum for the input directory, which includes the data of all contained files and
+    the directory structure information.
+
+    This function is used to generate a checksum for the raw_data directory of each experiment or training session.
+    Checksums are used to verify the session data integrity during transmission between the PC that acquired the data
+    and long-term storage locations, such as the Synology NAS or the BioHPC server. The function can be configured to
+    write the generated checksum as a hexadecimal string to the ax_checksum.txt file stored at the highest level of the
+    input directory.
+
+    Note:
+        This function uses multiprocessing to efficiently parallelize checksum calculation for multiple files. In
+        combination with xxHash3, this achieves a significant speedup over more common checksums, such as MD5 and
+        SHA256. Note that xxHash3 is not suitable for security purposes and is only used to ensure data integrity.
+
+        The function notifies the user about the checksum calculation process via the terminal.
+
+        The returned checksum accounts for both the contents of each file and the layout of the input directory
+        structure.
+
+    Args:
+        directory: The Path to the directory to be checksummed.
+        num_processes: The number of CPU processes to use for parallelizing checksum calculation. If set to None, the
+            function defaults to using (logical CPU count - 4).
+        batch: Determines whether the function is called as part of batch-processing multiple directories. This is used
+            to optimize progress reporting and avoid cluttering the terminal.
+        save_checksum: Determines whether the checksum should be saved (written) to a .txt file.
+
+    Returns:
+        The xxHash3-128 checksum for the input directory as a hexadecimal string.
+    """
sl_shared_assets/py.typed: File without changes
sl_shared_assets/server.py
@@ -0,0 +1,293 @@
+"""This module provides the tools for working with the Sun lab BioHPC cluster. Specifically, the classes from this
+module establish an API for submitting jobs to the shared data processing cluster (managed via SLURM) and monitoring
+the running job status. All lab processing and analysis pipelines use this interface for accessing shared compute
+resources.
+"""
+
+import re
+import time
+from pathlib import Path
+import datetime
+from dataclasses import dataclass
+
+import paramiko
+from simple_slurm import Slurm  # type: ignore
+from paramiko.client import SSHClient
+from ataraxis_base_utilities import LogLevel, console
+from ataraxis_data_structures import YamlConfig
+
+
+def generate_server_credentials(
+    output_directory: Path, username: str, password: str, host: str = "cbsuwsun.biohpc.cornell.edu"
+) -> None:
+    """Generates a new server_credentials.yaml file under the specified directory, using input information.
+
+    This function provides a convenience interface for generating new BioHPC server credential files. Generally, this is
+    only used when setting up new host-computers in the lab.
+    """
+    ServerCredentials(username=username, password=password, host=host).to_yaml(
+        file_path=output_directory.joinpath("server_credentials.yaml")
+    )
+
+
+@dataclass()
+class ServerCredentials(YamlConfig):
+    """This class stores the hostname and credentials used to log into the BioHPC cluster to run Sun lab processing
+    pipelines.
+
+    Primarily, this is used as part of the sl-experiment library runtime to start data processing once it is
+    transferred to the BioHPC server during preprocessing.
+    """
+
+    username: str = "YourNetID"
+    """The username to use for server authentication."""
+    password: str = "YourPassword"
+    """The password to use for server authentication."""
+    host: str = "cbsuwsun.biohpc.cornell.edu"
+    """The hostname or IP address of the server to connect to."""
+
+
+class Server:
+    """Encapsulates access to the Sun lab BioHPC processing server.
+
+    This class provides the API that allows accessing the BioHPC server and creating and submitting various
+    SLURM-managed jobs to the server. It functions as the central interface used by all processing pipelines in the
+    lab to execute costly data processing on the server.
+
+    Notes:
+        All lab processing pipelines expect the data to be stored on the server and all processing logic to be packaged
+        and installed into dedicated conda environments on the server.
+
+    Args:
+        credentials_path: The path to the .yaml file containing the server hostname and access credentials.
+
+    Attributes:
+        _open: Tracks whether the connection to the server is open or not.
+        _client: Stores the initialized SSHClient instance used to interface with the server.
+    """
+
+    def __init__(self, credentials_path: Path) -> None:
+        # Tracker used to prevent __del__ from calling close() for a partially initialized class.
+        self._open: bool = False
+
+        # Loads the credentials from the provided .yaml file
+        self._credentials: ServerCredentials = ServerCredentials.from_yaml(credentials_path)  # type: ignore
+
+        # Establishes the SSH connection to the specified processing server. At most, attempts to connect to the server
+        # 30 times before terminating with an error
+        attempt = 0
+        while True:
+            console.echo(
+                f"Trying to connect to {self._credentials.host} (attempt {attempt}/30)...", level=LogLevel.INFO
+            )
+            try:
+                self._client: SSHClient = paramiko.SSHClient()
+                self._client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+                self._client.connect(
+                    self._credentials.host, username=self._credentials.username, password=self._credentials.password
+                )
+                console.echo(f"Connected to {self._credentials.host}", level=LogLevel.SUCCESS)
+                break
+            except paramiko.AuthenticationException:
+                message = (
+                    f"Authentication failed when connecting to {self._credentials.host} using "
+                    f"{self._credentials.username} user."
+                )
+                console.error(message, RuntimeError)
+                raise RuntimeError
+            except:
+                if attempt == 30:
+                    message = f"Could not connect to {self._credentials.host} after 30 attempts. Aborting runtime."
+                    console.error(message, RuntimeError)
+                    raise RuntimeError
+
+                console.echo(
+                    f"Could not SSH to {self._credentials.host}, retrying after a 2-second delay...",
+                    level=LogLevel.WARNING,
+                )
+                attempt += 1
+                time.sleep(2)
+
+    def __del__(self) -> None:
+        """If the instance is connected to the server, terminates the connection before the instance is destroyed."""
+        self.close()
+
+    @staticmethod
+    def generate_slurm_header(
+        job_name: str, output_log: Path, error_log: Path, cpus_to_use: int = 20, ram_gb: int = 4, time_limit: int = 60
+    ) -> Slurm:
+        """Creates a SLURM command object and fills it with initial job configuration data.
+
+        This method is used to generate the initial SLURM command object and fill it with job (SLURM) configuration and
+        (general!) conda initialization data. It is used by all processing pipelines in the lab as the initial
+        configuration point when writing job shell scripts.
+
+        Notes:
+            The command header generated by this method does not contain the command to initialize the specific conda
+            environment to be used during processing. This has to be provided as part of the additional command
+            configuration, typically by adding the "source activate {ENV_NAME}" subcommand to the end of the header
+            returned by this method.
+
+        Args:
+            job_name: The descriptive name of the SLURM job to be created.
+            output_log: The path to the .txt file on the processing server, where to store the standard output of the
+                job.
+            error_log: The path to the .txt file on the processing server, where to store the standard error of the
+                job.
+            cpus_to_use: The number of CPUs to use for the job.
+            ram_gb: The amount of RAM to allocate for the job in Gigabytes.
+            time_limit: The maximum time limit for the job, in minutes. It is highly advised to set an adequate maximum
+                runtime limit to prevent jobs from hogging the server for a long period of time.
+        """
+
+        # Builds the slurm command object filled with configuration information
+        slurm_command = Slurm(
+            cpus_per_task=cpus_to_use,
+            job_name=job_name,
+            output=str(output_log),
+            error=str(error_log),
+            mem=f"{ram_gb}G",
+            time=datetime.timedelta(minutes=time_limit),
+        )
+
+        # Adds commands to initialize conda as part of the job runtime
+        slurm_command.add_cmd("eval $(conda shell.bash hook)")
+        slurm_command.add_cmd("conda init bash")
+
+        return slurm_command
+
+    def submit_job(self, slurm_command: Slurm, working_directory: Path) -> str:
+        """Submits the input SLURM command to the managed BioHPC server via a shell script.
+
+        This method submits various commands for execution via the SLURM-managed BioHPC cluster. As part of its runtime,
+        the method translates the Slurm object into a shell script, moves the script to the target working directory on
+        the server, and instructs the server to execute the shell script (via SLURM).
+
+        Args:
+            slurm_command: The Slurm (command) object containing the job configuration and individual commands to run
+                as part of the processing pipeline.
+            working_directory: The path to the working directory on the server where the shell script is moved
+                and executed.
+
+        Returns:
+            The job ID assigned to the job by the SLURM manager if the command submission is successful.
+
+        Raises:
+            RuntimeError: If the command submission to the server fails.
+        """
+
+        # Extracts the job name from the slurm command text and uses it to generate the name for the remote script
+        job_name_pattern = r"#SBATCH\s+--job-name\s+(\S+)"
+        match = re.search(job_name_pattern, str(slurm_command))
+        if match is None:
+            message = (
+                f"Failed to submit the job to the BioHPC cluster. It appears that the job does not contain the "
+                f"expected SLURM job header. All jobs submitted via this method have to be initialized using the "
+                f"generate_slurm_header() Server class method."
+            )
+            console.error(message, RuntimeError)
+            raise RuntimeError(message)  # This is a fallback to appease mypy, it should not be reachable.
+        job_name = match.group(1)
+
+        # Resolves the paths to the local and remote (server-side) .sh script files.
+        local_script_path = Path("temp_script.sh")
+        remote_script_path = str(working_directory.joinpath(f"{job_name}.sh"))
+
+        # Appends the command to clean up (remove) the temporary script file after processing runtime is over
+        slurm_command.add_cmd(f"rm -f {remote_script_path}")
+
+        # Translates the command to string format
+        script_content = str(slurm_command)
+
+        # Replaces escaped $ (\$) with $. This is essential, as without this correction things like conda
+        # initialization would not work as expected.
+        fixed_script_content = script_content.replace("\\$", "$")
+
+        # Creates a temporary script file locally and dumps translated command data into the file
+        with open(local_script_path, "w") as f:
+            f.write(fixed_script_content)
+
+        # Uploads the command script to the server
+        sftp = self._client.open_sftp()
+        sftp.put(localpath=local_script_path, remotepath=remote_script_path)
+        sftp.close()
+
+        # Removes the temporary local .sh file
+        local_script_path.unlink()
+
+        # Makes the server-side script executable
+        self._client.exec_command(f"chmod +x {remote_script_path}")
+
+        # Submits the job to SLURM with sbatch and verifies the submission state by returning the ID of the job or
+        # raising an error if no job has been submitted.
+        job_output = self._client.exec_command(f"sbatch {remote_script_path}")[1].read().strip().decode()
+        if "Submitted batch job" in job_output:
+            return job_output.split()[-1]
+        else:
+            message = f"Failed to submit the {job_name} job to the BioHPC cluster."
+            console.error(message, RuntimeError)
+
+            # Fallback to appease mypy, should not be reachable
+            raise RuntimeError(message)
+
+    def job_complete(self, job_id: str) -> bool:
+        """Returns True if the job with the given ID has been completed or terminated its runtime due to an error.
+
+        If the job is still running or is waiting inside the execution queue, returns False.
+
+        Args:
+            job_id: The numeric ID of the job to check, assigned by SLURM.
+        """
+        if job_id not in self._client.exec_command(f"squeue -j {job_id}")[1].read().decode().strip():
+            return True
+        else:
+            return False
+
+    def close(self) -> None:
+        """Closes the SSH connection to the server.
+
+        This method has to be called before destroying the class instance to ensure proper resource cleanup.
+        """
+        # Prevents closing already closed connections
+        if self._open:
+            self._client.close()
+
+
+if __name__ == "__main__":
+    # Creates SSHClient for server access
+    console.enable()
+    cred_path = Path("/home/cyberaxolotl/Desktop/test/server_credentials.yaml")
+    server = Server(credentials_path=cred_path)
+
+    # Generates SLURM job header
+    slurm = server.generate_slurm_header(
+        job_name="test_job",
+        output_log=Path("/workdir/cbsuwsun/test_job_stdout.txt"),
+        error_log=Path("/workdir/cbsuwsun/test_job_stderr.txt"),
+        cpus_to_use=1,
+    )
+
+    # Adds test runtime command
+    slurm.add_cmd("python --version > /workdir/cbsuwsun/mamba_version.txt")
+
+    # Submits the job to the server
+    j_id = server.submit_job(slurm_command=slurm, working_directory=Path("/workdir/cbsuwsun/"))
+
+    if j_id:
+        console.echo(f"Successfully submitted job with ID {j_id} to the server.", level=LogLevel.SUCCESS)
+
+    max_wait_time = 60  # Maximum wait time in seconds
+    wait_interval = 1  # Check every 1 second
+    elapsed_time = 0
+
+    while elapsed_time < max_wait_time:
+        if server.job_complete(job_id=j_id):
+            console.echo("Job completed", level=LogLevel.SUCCESS)
+            break
+
+        console.echo(f"Job still running. Waiting {wait_interval} seconds...", level=LogLevel.INFO)
+        time.sleep(wait_interval)
+        elapsed_time += wait_interval
+
+    # Close the connection
+    server.close()
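A hedged usage sketch for the Server API above (not part of the package). It illustrates the conda environment activation step that the generate_slurm_header() notes require; the credentials path, environment name, pipeline module, and work directory are placeholders.

import time
from pathlib import Path

from sl_shared_assets.server import Server

# Placeholder credentials file, assumed to have been generated with generate_server_credentials().
server = Server(credentials_path=Path("/home/labuser/server_credentials.yaml"))

# Builds the SLURM header and appends the environment activation and pipeline commands.
job = server.generate_slurm_header(
    job_name="example_processing_job",
    output_log=Path("/workdir/labuser/example_stdout.txt"),
    error_log=Path("/workdir/labuser/example_stderr.txt"),
    cpus_to_use=8,
    ram_gb=32,
    time_limit=120,
)
job.add_cmd("source activate processing_env")  # Hypothetical conda environment name.
job.add_cmd("python -m example_pipeline --session /workdir/labuser/session_001")  # Hypothetical pipeline module.

# Submits the job and polls SLURM until it no longer lists the job in the queue.
job_id = server.submit_job(slurm_command=job, working_directory=Path("/workdir/labuser"))
while not server.job_complete(job_id=job_id):
    time.sleep(10)  # Polls the queue every 10 seconds.

server.close()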
sl_shared_assets/server.pyi
@@ -0,0 +1,112 @@
+from pathlib import Path
+from dataclasses import dataclass
+
+from simple_slurm import Slurm
+from paramiko.client import SSHClient as SSHClient
+from ataraxis_data_structures import YamlConfig
+
+def generate_server_credentials(
+    output_directory: Path, username: str, password: str, host: str = "cbsuwsun.biohpc.cornell.edu"
+) -> None:
+    """Generates a new server_credentials.yaml file under the specified directory, using input information.
+
+    This function provides a convenience interface for generating new BioHPC server credential files. Generally, this is
+    only used when setting up new host-computers in the lab.
+    """
+@dataclass()
+class ServerCredentials(YamlConfig):
+    """This class stores the hostname and credentials used to log into the BioHPC cluster to run Sun lab processing
+    pipelines.
+
+    Primarily, this is used as part of the sl-experiment library runtime to start data processing once it is
+    transferred to the BioHPC server during preprocessing.
+    """
+
+    username: str = ...
+    password: str = ...
+    host: str = ...
+
+class Server:
+    """Encapsulates access to the Sun lab BioHPC processing server.
+
+    This class provides the API that allows accessing the BioHPC server and creating and submitting various
+    SLURM-managed jobs to the server. It functions as the central interface used by all processing pipelines in the
+    lab to execute costly data processing on the server.
+
+    Notes:
+        All lab processing pipelines expect the data to be stored on the server and all processing logic to be packaged
+        and installed into dedicated conda environments on the server.
+
+    Args:
+        credentials_path: The path to the .yaml file containing the server hostname and access credentials.
+
+    Attributes:
+        _open: Tracks whether the connection to the server is open or not.
+        _client: Stores the initialized SSHClient instance used to interface with the server.
+    """
+
+    _open: bool
+    _credentials: ServerCredentials
+    _client: SSHClient
+    def __init__(self, credentials_path: Path) -> None: ...
+    def __del__(self) -> None:
+        """If the instance is connected to the server, terminates the connection before the instance is destroyed."""
+    @staticmethod
+    def generate_slurm_header(
+        job_name: str, output_log: Path, error_log: Path, cpus_to_use: int = 20, ram_gb: int = 4, time_limit: int = 60
+    ) -> Slurm:
+        """Creates a SLURM command object and fills it with initial job configuration data.
+
+        This method is used to generate the initial SLURM command object and fill it with job (SLURM) configuration and
+        (general!) conda initialization data. It is used by all processing pipelines in the lab as the initial
+        configuration point when writing job shell scripts.
+
+        Notes:
+            The command header generated by this method does not contain the command to initialize the specific conda
+            environment to be used during processing. This has to be provided as part of the additional command
+            configuration, typically by adding the "source activate {ENV_NAME}" subcommand to the end of the header
+            returned by this method.
+
+        Args:
+            job_name: The descriptive name of the SLURM job to be created.
+            output_log: The path to the .txt file on the processing server, where to store the standard output of the
+                job.
+            error_log: The path to the .txt file on the processing server, where to store the standard error of the
+                job.
+            cpus_to_use: The number of CPUs to use for the job.
+            ram_gb: The amount of RAM to allocate for the job in Gigabytes.
+            time_limit: The maximum time limit for the job, in minutes. It is highly advised to set an adequate maximum
+                runtime limit to prevent jobs from hogging the server for a long period of time.
+        """
+    def submit_job(self, slurm_command: Slurm, working_directory: Path) -> str:
+        """Submits the input SLURM command to the managed BioHPC server via a shell script.
+
+        This method submits various commands for execution via the SLURM-managed BioHPC cluster. As part of its runtime,
+        the method translates the Slurm object into a shell script, moves the script to the target working directory on
+        the server, and instructs the server to execute the shell script (via SLURM).
+
+        Args:
+            slurm_command: The Slurm (command) object containing the job configuration and individual commands to run
+                as part of the processing pipeline.
+            working_directory: The path to the working directory on the server where the shell script is moved
+                and executed.
+
+        Returns:
+            The job ID assigned to the job by the SLURM manager if the command submission is successful.
+
+        Raises:
+            RuntimeError: If the command submission to the server fails.
+        """
+    def job_complete(self, job_id: str) -> bool:
+        """Returns True if the job with the given ID has been completed or terminated its runtime due to an error.
+
+        If the job is still running or is waiting inside the execution queue, returns False.
+
+        Args:
+            job_id: The numeric ID of the job to check, assigned by SLURM.
+        """
+    def close(self) -> None:
+        """Closes the SSH connection to the server.
+
+        This method has to be called before destroying the class instance to ensure proper resource cleanup.
+        """
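Finally, a small sketch (not part of the package) of the one-time credential setup that the server module expects on each host computer. The output directory and account values are placeholders.

from pathlib import Path

from sl_shared_assets.server import generate_server_credentials

# Writes server_credentials.yaml into the chosen directory. The values below are placeholders,
# not real lab credentials; the default host argument is used.
generate_server_credentials(
    output_directory=Path("/home/labuser/.sl-credentials"),
    username="netid123",
    password="replace-with-real-password",
)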