sl_shared_assets-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of sl-shared-assets might be problematic.
- sl_shared_assets/__init__.py +80 -0
- sl_shared_assets/__init__.pyi +73 -0
- sl_shared_assets/cli.py +384 -0
- sl_shared_assets/cli.pyi +94 -0
- sl_shared_assets/data_classes/__init__.py +66 -0
- sl_shared_assets/data_classes/__init__.pyi +61 -0
- sl_shared_assets/data_classes/configuration_data.py +479 -0
- sl_shared_assets/data_classes/configuration_data.pyi +199 -0
- sl_shared_assets/data_classes/runtime_data.py +251 -0
- sl_shared_assets/data_classes/runtime_data.pyi +145 -0
- sl_shared_assets/data_classes/session_data.py +625 -0
- sl_shared_assets/data_classes/session_data.pyi +252 -0
- sl_shared_assets/data_classes/surgery_data.py +152 -0
- sl_shared_assets/data_classes/surgery_data.pyi +89 -0
- sl_shared_assets/py.typed +0 -0
- sl_shared_assets/server/__init__.py +8 -0
- sl_shared_assets/server/__init__.pyi +8 -0
- sl_shared_assets/server/job.py +140 -0
- sl_shared_assets/server/job.pyi +94 -0
- sl_shared_assets/server/server.py +214 -0
- sl_shared_assets/server/server.pyi +95 -0
- sl_shared_assets/tools/__init__.py +15 -0
- sl_shared_assets/tools/__init__.pyi +15 -0
- sl_shared_assets/tools/ascension_tools.py +277 -0
- sl_shared_assets/tools/ascension_tools.pyi +68 -0
- sl_shared_assets/tools/packaging_tools.py +148 -0
- sl_shared_assets/tools/packaging_tools.pyi +56 -0
- sl_shared_assets/tools/project_management_tools.py +201 -0
- sl_shared_assets/tools/project_management_tools.pyi +54 -0
- sl_shared_assets/tools/transfer_tools.py +119 -0
- sl_shared_assets/tools/transfer_tools.pyi +53 -0
- sl_shared_assets-1.0.0.dist-info/METADATA +869 -0
- sl_shared_assets-1.0.0.dist-info/RECORD +36 -0
- sl_shared_assets-1.0.0.dist-info/WHEEL +4 -0
- sl_shared_assets-1.0.0.dist-info/entry_points.txt +8 -0
- sl_shared_assets-1.0.0.dist-info/licenses/LICENSE +674 -0
sl_shared_assets/tools/project_management_tools.py
@@ -0,0 +1,201 @@
"""This module provides tools for managing the data of any Sun lab project. Tools from this module extend the
functionality of the SessionData class via a convenient API that allows working with the data of multiple sessions
making up a given project."""

from pathlib import Path

import polars as pl
from ataraxis_base_utilities import console

from ..data_classes import SessionData
from .packaging_tools import calculate_directory_checksum


def generate_project_manifest(
    raw_project_directory: Path, output_directory: Path, processed_project_directory: Path | None = None
) -> None:
    """Builds and saves the project manifest .feather file under the specified output directory.

    This function evaluates the input project directory and builds the 'manifest' file for the project. The file
    includes descriptive information about every session stored inside the input project folder and the state of each
    session's data processing (which processing pipelines have been applied to the session). The file will be created
    under the 'output_directory' and use the following name pattern: {ProjectName}_manifest.feather.

    Notes:
        The manifest file is primarily used to capture and move project state information between machines, typically
        in the context of working with data stored on a remote compute server or cluster. However, it can also be used
        on a local machine, since an up-to-date manifest file is required to run most data processing pipelines in the
        lab regardless of the runtime context.

    Args:
        raw_project_directory: The path to the root project directory used to store raw session data.
        output_directory: The path to the directory where to save the generated manifest file.
        processed_project_directory: The path to the root project directory used to store processed session data if it
            is different from the 'raw_project_directory'. Typically, this would be the case on remote compute
            server(s) and not on local machines.
    """

    if not raw_project_directory.exists():
        message = (
            f"Unable to generate the project manifest file for the requested project {raw_project_directory.stem}. "
            f"The specified project directory does not exist."
        )
        console.error(message=message, error=FileNotFoundError)

    # Finds all raw data directories
    session_directories = [directory.parent for directory in raw_project_directory.rglob("raw_data")]

    if len(session_directories) == 0:
        message = (
            f"Unable to generate the project manifest file for the requested project {raw_project_directory.stem}. "
            f"The project does not contain any raw session data. To generate the manifest file, the project must "
            f"contain at least one valid experiment or training session."
        )
        console.error(message=message, error=FileNotFoundError)

    # Precreates the 'manifest' dictionary structure
    manifest: dict[str, list[str | bool]] = {
        "animal": [],  # Animal IDs.
        "session": [],  # Session names.
        "type": [],  # Type of the session (e.g., Experiment, Training, etc.).
        "raw_data": [],  # Server-side raw_data folder path.
        "processed_data": [],  # Server-side processed_data folder path.
        "complete": [],  # Determines if the session data is complete. Incomplete sessions are excluded from processing.
        "verified": [],  # Determines if the session data integrity has been verified upon transfer to the storage machine.
        "single_day_suite2p": [],  # Determines whether the session has been processed with the single-day s2p pipeline.
        "multi_day_suite2p": [],  # Determines whether the session has been processed with the multi-day s2p pipeline.
        "behavior": [],  # Determines whether the session has been processed with the behavior extraction pipeline.
        "dlc": [],  # Determines whether the session has been processed with the DeepLabCut pipeline.
    }

    # Loops over each session of every animal in the project and extracts session ID information and information
    # about which processing steps have been successfully applied to the session.
    for directory in session_directories:
        # Instantiates the SessionData instance to resolve the paths to all of the session's data files and locations.
        session_data = SessionData.load(
            session_path=directory, processed_data_root=processed_project_directory, make_processed_data_directory=False
        )

        # Fills the manifest dictionary with data for the processed session:

        # Extracts ID and data path information from the SessionData instance
        manifest["animal"].append(session_data.animal_id)
        manifest["session"].append(session_data.session_name)
        manifest["type"].append(session_data.session_type)
        manifest["raw_data"].append(str(session_data.raw_data.raw_data_path))
        manifest["processed_data"].append(str(session_data.processed_data.processed_data_path))

        # If the session raw_data folder contains the telomere.bin file, marks the session as complete.
        manifest["complete"].append(session_data.raw_data.telomere_path.exists())

        # If the session raw_data folder contains the verified.bin file, marks the session as verified.
        manifest["verified"].append(session_data.raw_data.verified_bin_path.exists())

        # If the session is incomplete or unverified, marks all processing steps as False, as automatic processing is
        # disabled for incomplete sessions. If the session is unverified, the case is even more severe, as its data
        # may be corrupted.
        if not manifest["complete"][-1] or not manifest["verified"][-1]:
            manifest["single_day_suite2p"].append(False)
            manifest["multi_day_suite2p"].append(False)
            manifest["behavior"].append(False)
            manifest["dlc"].append(False)
            continue  # Cycles to the next session

        # If the session processed_data folder contains the single-day suite2p.bin file, marks the single-day suite2p
        # processing step as complete.
        manifest["single_day_suite2p"].append(session_data.processed_data.single_day_suite2p_bin_path.exists())

        # If the session processed_data folder contains the multi-day suite2p.bin file, marks the multi-day suite2p
        # processing step as complete.
        manifest["multi_day_suite2p"].append(session_data.processed_data.multi_day_suite2p_bin_path.exists())

        # If the session processed_data folder contains the behavior.bin file, marks the behavior processing step as
        # complete.
        manifest["behavior"].append(session_data.processed_data.behavior_data_path.exists())

        # If the session processed_data folder contains the dlc.bin file, marks the dlc processing step as complete.
        manifest["dlc"].append(session_data.processed_data.dlc_bin_path.exists())

    # Converts the manifest dictionary to a Polars DataFrame
    schema = {
        "animal": pl.String,
        "session": pl.String,
        "raw_data": pl.String,
        "processed_data": pl.String,
        "type": pl.String,
        "complete": pl.Boolean,
        "verified": pl.Boolean,
        "single_day_suite2p": pl.Boolean,
        "multi_day_suite2p": pl.Boolean,
        "behavior": pl.Boolean,
        "dlc": pl.Boolean,
    }
    df = pl.DataFrame(manifest, schema=schema)

    # Sorts the DataFrame by animal and then session. Since we assign animal IDs sequentially and 'name' sessions
    # based on acquisition timestamps, the sort order is chronological.
    sorted_df = df.sort(["animal", "session"])

    # Saves the generated manifest to the project-specific manifest .feather file for further processing.
    sorted_df.write_ipc(
        file=output_directory.joinpath(f"{raw_project_directory.stem}_manifest.feather"), compression="lz4"
    )

def verify_session_checksum(
    session_path: Path, create_processed_data_directory: bool = True, processed_data_root: None | Path = None
) -> None:
    """Verifies the integrity of the session's raw data by generating the checksum of the raw_data directory and
    comparing it against the checksum stored in the ax_checksum.txt file.

    Primarily, this function is used to verify data integrity after transferring it from a local PC to the remote
    server for long-term storage. This function is designed to create the 'verified.bin' marker file if the checksum
    matches and to remove the 'telomere.bin' and 'verified.bin' marker files if it does not.

    Notes:
        Removing the telomere.bin marker file from the session's raw_data folder marks the session as incomplete,
        excluding it from all further automatic processing.

        This function is also used to create the processed data hierarchy on the BioHPC server, when it is called as
        part of the data preprocessing runtime performed by a data acquisition system.

    Args:
        session_path: The path to the session directory to be verified. Note, the input session directory must contain
            the 'raw_data' subdirectory.
        create_processed_data_directory: Determines whether to create the processed data hierarchy during runtime.
        processed_data_root: The root directory where to store the processed data hierarchy. This path has to point to
            the root directory where to store the processed data from all projects, and it will be automatically
            modified to include the project name, the animal name, and the session ID.
    """

    # Loads the session data layout. If configured to do so, also creates the processed data hierarchy.
    session_data = SessionData.load(
        session_path=session_path,
        processed_data_root=processed_data_root,
        make_processed_data_directory=create_processed_data_directory,
    )

    # Unlinks the verified.bin marker if it exists. The presence or absence of the marker is used as the primary
    # heuristic for determining whether the session data passed verification. Unlinking it early helps in the case
    # where the verification procedure aborts unexpectedly for any reason.
    session_data.raw_data.verified_bin_path.unlink(missing_ok=True)

    # Re-calculates the checksum for the raw_data directory
    calculated_checksum = calculate_directory_checksum(
        directory=session_data.raw_data.raw_data_path, batch=False, save_checksum=False
    )

    # Loads the checksum stored inside the ax_checksum.txt file
    with open(session_data.raw_data.checksum_path, "r") as f:
        stored_checksum = f.read().strip()

    # If the two checksums do not match, this likely indicates data corruption.
    if stored_checksum != calculated_checksum:
        # If the telomere.bin file exists, removes this file. This automatically marks the session as incomplete for
        # all other Sun lab runtimes.
        session_data.raw_data.telomere_path.unlink(missing_ok=True)

    # Otherwise, ensures that the session is marked with the verified.bin marker file.
    else:
        session_data.raw_data.verified_bin_path.touch(exist_ok=True)
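
For orientation, here is a minimal usage sketch for the two functions above. The project paths are hypothetical placeholders, and the query at the end simply illustrates that the manifest is a standard Arrow IPC (.feather) file that polars can read back:

# Hypothetical usage sketch; substitute real project roots for your deployment.
from pathlib import Path

import polars as pl

from sl_shared_assets.tools.project_management_tools import generate_project_manifest

raw_root = Path("/server/raw/my_project")  # hypothetical raw data root
output_root = Path("/server/manifests")    # hypothetical manifest output directory

generate_project_manifest(raw_project_directory=raw_root, output_directory=output_root)

# Reads the manifest back and finds complete, verified sessions that still need
# single-day suite2p processing.
manifest = pl.read_ipc(output_root.joinpath(f"{raw_root.stem}_manifest.feather"))
pending = manifest.filter(
    pl.col("complete") & pl.col("verified") & ~pl.col("single_day_suite2p")
)
print(pending.select(["animal", "session", "raw_data"]))
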
sl_shared_assets/tools/project_management_tools.pyi
@@ -0,0 +1,54 @@
from pathlib import Path

from ..data_classes import SessionData as SessionData
from .packaging_tools import calculate_directory_checksum as calculate_directory_checksum

def generate_project_manifest(
    raw_project_directory: Path, output_directory: Path, processed_project_directory: Path | None = None
) -> None:
    """Builds and saves the project manifest .feather file under the specified output directory.

    This function evaluates the input project directory and builds the 'manifest' file for the project. The file
    includes descriptive information about every session stored inside the input project folder and the state of each
    session's data processing (which processing pipelines have been applied to the session). The file will be created
    under the 'output_directory' and use the following name pattern: {ProjectName}_manifest.feather.

    Notes:
        The manifest file is primarily used to capture and move project state information between machines, typically
        in the context of working with data stored on a remote compute server or cluster. However, it can also be used
        on a local machine, since an up-to-date manifest file is required to run most data processing pipelines in the
        lab regardless of the runtime context.

    Args:
        raw_project_directory: The path to the root project directory used to store raw session data.
        output_directory: The path to the directory where to save the generated manifest file.
        processed_project_directory: The path to the root project directory used to store processed session data if it
            is different from the 'raw_project_directory'. Typically, this would be the case on remote compute
            server(s) and not on local machines.
    """

def verify_session_checksum(
    session_path: Path, create_processed_data_directory: bool = True, processed_data_root: None | Path = None
) -> None:
    """Verifies the integrity of the session's raw data by generating the checksum of the raw_data directory and
    comparing it against the checksum stored in the ax_checksum.txt file.

    Primarily, this function is used to verify data integrity after transferring it from a local PC to the remote
    server for long-term storage. This function is designed to create the 'verified.bin' marker file if the checksum
    matches and to remove the 'telomere.bin' and 'verified.bin' marker files if it does not.

    Notes:
        Removing the telomere.bin marker file from the session's raw_data folder marks the session as incomplete,
        excluding it from all further automatic processing.

        This function is also used to create the processed data hierarchy on the BioHPC server, when it is called as
        part of the data preprocessing runtime performed by a data acquisition system.

    Args:
        session_path: The path to the session directory to be verified. Note, the input session directory must contain
            the 'raw_data' subdirectory.
        create_processed_data_directory: Determines whether to create the processed data hierarchy during runtime.
        processed_data_root: The root directory where to store the processed data hierarchy. This path has to point to
            the root directory where to store the processed data from all projects, and it will be automatically
            modified to include the project name, the animal name, and the session ID.
    """
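
The stubs above mirror the implementations. A minimal, hypothetical invocation of verify_session_checksum after a server-side transfer might look like the following (the session and processed-data paths are placeholders):

from pathlib import Path

from sl_shared_assets.tools.project_management_tools import verify_session_checksum

# Hypothetical session directory; it must contain a 'raw_data' subdirectory.
session = Path("/server/raw/my_project/animal_1/2024-01-01-12-00-00")

# Recomputes the raw_data checksum and creates / removes the marker files
# accordingly. On a storage server, processed data typically lives under a
# separate root, which the function expands with project / animal / session IDs.
verify_session_checksum(
    session_path=session,
    create_processed_data_directory=True,
    processed_data_root=Path("/server/processed"),
)
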
sl_shared_assets/tools/transfer_tools.py
@@ -0,0 +1,119 @@
"""This module provides methods for moving session runtime data between the local machine, the ScanImage (Mesoscope)
PC, the Synology NAS drive, and the lab BioHPC server. All methods in this module expect that the destinations and
sources are mounted on the host file-system via SMB or an equivalent protocol.
"""

import shutil
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm
from ataraxis_base_utilities import console, ensure_directory_exists

from .packaging_tools import calculate_directory_checksum


def _transfer_file(source_file: Path, source_directory: Path, destination_directory: Path) -> None:
    """Copies the input file from the source directory to the destination directory while preserving the file
    metadata.

    This is a worker method used by the transfer_directory() method to move multiple files in parallel.

    Notes:
        If the file is found under a hierarchy of subdirectories inside the input source_directory, that hierarchy
        will be preserved in the destination directory.

    Args:
        source_file: The file to be copied.
        source_directory: The root directory where the file is located.
        destination_directory: The destination directory where to move the file.
    """
    relative = source_file.relative_to(source_directory)
    dest_file = destination_directory / relative
    shutil.copy2(source_file, dest_file)


def transfer_directory(source: Path, destination: Path, num_threads: int = 1, verify_integrity: bool = True) -> None:
    """Copies the contents of the input directory tree from source to destination while preserving the folder
    structure.

    This function is used to assemble the experimental data from all remote machines used in the acquisition process
    on the VRPC before the data is preprocessed. It is also used to transfer the preprocessed data from the VRPC to
    the Synology NAS and the Sun lab BioHPC server.

    Notes:
        This method recreates the moved directory hierarchy on the destination if the hierarchy does not exist. This
        is done before copying the files.

        The method executes a multithreaded copy operation. It does not clean up the source files. That job is handed
        to the specific preprocessing function from the sl_experiment or sl-forgery libraries that calls this
        function.

        If the method is configured to verify transferred file integrity, it reruns the xxHash3-128 checksum
        calculation and compares the returned checksum to the one stored in the source directory. The method assumes
        that all input directories contain the 'ax_checksum.txt' file that stores the 'source' directory checksum at
        the highest level of the input directory tree.

    Args:
        source: The path to the directory that needs to be moved.
        destination: The path to the destination directory where to move the contents of the source directory.
        num_threads: The number of threads to use for parallel file transfer. This number should be set depending on
            the type of transfer (local or remote) and is not guaranteed to provide improved transfer performance.
            For local transfers, setting this number above 1 will likely provide a performance boost. For remote
            transfers using a single TCP / IP socket (such as the non-multichannel SMB protocol), the number should
            be set to 1.
        verify_integrity: Determines whether to perform integrity verification for the transferred files. Note,
            integrity verification is a time-consuming process and is generally not a concern for most runtimes.
            Therefore, it is often fine to disable this option to optimize the method's runtime speed.

    Raises:
        RuntimeError: If the transferred files do not pass the xxHash3-128 checksum integrity verification.
    """
    if not source.exists():
        message = f"Unable to move the directory {source}, as it does not exist."
        console.error(message=message, error=FileNotFoundError)

    # Ensures the destination root directory exists.
    ensure_directory_exists(destination)

    # Collects all items (files and directories) in the source directory.
    all_items = tuple(source.rglob("*"))

    # Loops over all items (files and directories). Adds files to the file_list variable. Uses directories to
    # reinstate the source subdirectory hierarchy in the destination directory.
    file_list = []
    for item in sorted(all_items, key=lambda x: len(x.relative_to(source).parts)):
        # Recreates the directory structure on the destination
        if item.is_dir():
            dest_dir = destination / item.relative_to(source)
            dest_dir.mkdir(parents=True, exist_ok=True)
        # Also builds the list of files to be moved
        else:  # is_file()
            file_list.append(item)

    # Copies the data to the destination. For parallel workflows, the method uses a ThreadPoolExecutor to move
    # multiple files at the same time. Since I/O operations release the GIL, there is no need to parallelize with
    # processes here.
    if num_threads > 1:
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = {executor.submit(_transfer_file, file, source, destination): file for file in file_list}
            for future in tqdm(
                as_completed(futures),
                total=len(file_list),
                desc=f"Transferring files to {Path(*destination.parts[-6:])}",
                unit="file",
            ):
                # Propagates any exceptions from the file transfer.
                future.result()
    else:
        for file in tqdm(file_list, desc=f"Transferring files to {Path(*destination.parts[-6:])}", unit="file"):
            _transfer_file(file, source, destination)

    # Verifies the integrity of the transferred directory by rerunning the xxHash3-128 calculation.
    if verify_integrity:
        destination_checksum = calculate_directory_checksum(directory=destination, batch=False, save_checksum=False)
        with open(file=source.joinpath("ax_checksum.txt"), mode="r") as local_checksum:
            message = (
                f"Checksum mismatch detected when transferring {Path(*source.parts[-6:])} to "
                f"{Path(*destination.parts[-6:])}! The data was likely corrupted in transmission. User intervention "
                f"required."
            )
            if destination_checksum != local_checksum.readline().strip():
                console.error(message=message, error=RuntimeError)
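
The calculate_directory_checksum() helper comes from packaging_tools, whose source is not shown in this hunk. Based on the docstrings above, it produces an xxHash3-128 digest over a directory tree. A minimal sketch of that idea, using the third-party xxhash package and assuming the digest covers file contents and relative paths in deterministic sorted order, is shown below; this is an illustration only, and the packaged implementation may differ:

# Illustrative sketch only: an xxHash3-128 digest over a directory tree. The
# function name, chunk size, and exclusion of 'ax_checksum.txt' are assumptions.
from pathlib import Path

import xxhash


def directory_checksum_sketch(directory: Path) -> str:
    digest = xxhash.xxh3_128()
    for file in sorted(directory.rglob("*")):
        if not file.is_file() or file.name == "ax_checksum.txt":
            continue  # The checksum file itself cannot be part of its own digest.
        # Hashes the relative path so that renames and moves change the checksum.
        digest.update(str(file.relative_to(directory)).encode("utf-8"))
        # Streams the file contents in 1 MiB chunks to bound memory use.
        with open(file, "rb") as handle:
            for chunk in iter(lambda: handle.read(1024 * 1024), b""):
                digest.update(chunk)
    return digest.hexdigest()
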
sl_shared_assets/tools/transfer_tools.pyi
@@ -0,0 +1,53 @@
from pathlib import Path

from .packaging_tools import calculate_directory_checksum as calculate_directory_checksum

def _transfer_file(source_file: Path, source_directory: Path, destination_directory: Path) -> None:
    """Copies the input file from the source directory to the destination directory while preserving the file
    metadata.

    This is a worker method used by the transfer_directory() method to move multiple files in parallel.

    Notes:
        If the file is found under a hierarchy of subdirectories inside the input source_directory, that hierarchy
        will be preserved in the destination directory.

    Args:
        source_file: The file to be copied.
        source_directory: The root directory where the file is located.
        destination_directory: The destination directory where to move the file.
    """

def transfer_directory(source: Path, destination: Path, num_threads: int = 1, verify_integrity: bool = True) -> None:
    """Copies the contents of the input directory tree from source to destination while preserving the folder
    structure.

    This function is used to assemble the experimental data from all remote machines used in the acquisition process
    on the VRPC before the data is preprocessed. It is also used to transfer the preprocessed data from the VRPC to
    the Synology NAS and the Sun lab BioHPC server.

    Notes:
        This method recreates the moved directory hierarchy on the destination if the hierarchy does not exist. This
        is done before copying the files.

        The method executes a multithreaded copy operation. It does not clean up the source files. That job is handed
        to the specific preprocessing function from the sl_experiment or sl-forgery libraries that calls this
        function.

        If the method is configured to verify transferred file integrity, it reruns the xxHash3-128 checksum
        calculation and compares the returned checksum to the one stored in the source directory. The method assumes
        that all input directories contain the 'ax_checksum.txt' file that stores the 'source' directory checksum at
        the highest level of the input directory tree.

    Args:
        source: The path to the directory that needs to be moved.
        destination: The path to the destination directory where to move the contents of the source directory.
        num_threads: The number of threads to use for parallel file transfer. This number should be set depending on
            the type of transfer (local or remote) and is not guaranteed to provide improved transfer performance.
            For local transfers, setting this number above 1 will likely provide a performance boost. For remote
            transfers using a single TCP / IP socket (such as the non-multichannel SMB protocol), the number should
            be set to 1.
        verify_integrity: Determines whether to perform integrity verification for the transferred files. Note,
            integrity verification is a time-consuming process and is generally not a concern for most runtimes.
            Therefore, it is often fine to disable this option to optimize the method's runtime speed.

    Raises:
        RuntimeError: If the transferred files do not pass the xxHash3-128 checksum integrity verification.
    """
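
To close, a hypothetical call that follows the docstring guidance above on thread counts (all paths are placeholders): multiple threads for a local copy, a single thread for a non-multichannel SMB transfer.

from pathlib import Path

from sl_shared_assets.tools.transfer_tools import transfer_directory

# Local copy: multiple threads likely improve throughput.
transfer_directory(
    source=Path("/local/session/2024-01-01-12-00-00"),
    destination=Path("/mnt/nas/my_project/animal_1/2024-01-01-12-00-00"),
    num_threads=8,
    verify_integrity=True,
)

# Remote transfer over a single-socket SMB mount: keep num_threads at 1.
transfer_directory(
    source=Path("/local/session/2024-01-01-12-00-00"),
    destination=Path("/mnt/server/my_project/animal_1/2024-01-01-12-00-00"),
    num_threads=1,
    verify_integrity=True,
)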