sl-shared-assets 1.0.0rc19__py3-none-any.whl → 1.0.0rc21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sl_shared_assets/__init__.py +27 -27
- sl_shared_assets/__init__.pyi +73 -0
- sl_shared_assets/cli.py +266 -40
- sl_shared_assets/cli.pyi +87 -0
- sl_shared_assets/data_classes/__init__.py +23 -20
- sl_shared_assets/data_classes/__init__.pyi +61 -0
- sl_shared_assets/data_classes/configuration_data.py +407 -26
- sl_shared_assets/data_classes/configuration_data.pyi +194 -0
- sl_shared_assets/data_classes/runtime_data.py +59 -41
- sl_shared_assets/data_classes/runtime_data.pyi +145 -0
- sl_shared_assets/data_classes/session_data.py +168 -914
- sl_shared_assets/data_classes/session_data.pyi +249 -0
- sl_shared_assets/data_classes/surgery_data.py +3 -3
- sl_shared_assets/data_classes/surgery_data.pyi +89 -0
- sl_shared_assets/server/__init__.pyi +8 -0
- sl_shared_assets/server/job.pyi +94 -0
- sl_shared_assets/server/server.pyi +95 -0
- sl_shared_assets/tools/__init__.py +8 -1
- sl_shared_assets/tools/__init__.pyi +15 -0
- sl_shared_assets/tools/ascension_tools.py +27 -26
- sl_shared_assets/tools/ascension_tools.pyi +68 -0
- sl_shared_assets/tools/packaging_tools.py +14 -1
- sl_shared_assets/tools/packaging_tools.pyi +56 -0
- sl_shared_assets/tools/project_management_tools.py +164 -0
- sl_shared_assets/tools/project_management_tools.pyi +48 -0
- sl_shared_assets/tools/transfer_tools.pyi +53 -0
- {sl_shared_assets-1.0.0rc19.dist-info → sl_shared_assets-1.0.0rc21.dist-info}/METADATA +21 -4
- sl_shared_assets-1.0.0rc21.dist-info/RECORD +36 -0
- sl_shared_assets-1.0.0rc21.dist-info/entry_points.txt +8 -0
- sl_shared_assets/suite2p/__init__.py +0 -8
- sl_shared_assets/suite2p/multi_day.py +0 -225
- sl_shared_assets/suite2p/single_day.py +0 -563
- sl_shared_assets-1.0.0rc19.dist-info/RECORD +0 -23
- sl_shared_assets-1.0.0rc19.dist-info/entry_points.txt +0 -4
- {sl_shared_assets-1.0.0rc19.dist-info → sl_shared_assets-1.0.0rc21.dist-info}/WHEEL +0 -0
- {sl_shared_assets-1.0.0rc19.dist-info → sl_shared_assets-1.0.0rc21.dist-info}/licenses/LICENSE +0 -0
sl_shared_assets/tools/ascension_tools.py

@@ -5,13 +5,12 @@ an example for how to convert other data formats to match use the Sun lab data s
 
 from pathlib import Path
 import datetime
-import tempfile
 
 import numpy as np
-from ataraxis_base_utilities import LogLevel, console
+from ataraxis_base_utilities import LogLevel, console, ensure_directory_exists
 from ataraxis_time.time_helpers import extract_timestamp_from_bytes
 
-from ..data_classes import SessionData, ProjectConfiguration
+from ..data_classes import SessionData, ProjectConfiguration, get_system_configuration_data
 from .transfer_tools import transfer_directory
 from .packaging_tools import calculate_directory_checksum
 

@@ -170,7 +169,7 @@ def _reorganize_data(session_data: SessionData, source_root: Path) -> bool:
     return True
 
 
-def ascend_tyche_data(root_directory: Path
+def ascend_tyche_data(root_directory: Path) -> None:
     """Reformats the old Tyche data to use the modern Sun lab layout and metadata files.
 
     This function is used to convert old Tyche data to the modern data management standard. This is used to make the

@@ -188,30 +187,24 @@ def ascend_tyche_data(root_directory: Path, output_root_directory: Path, server_
         this function for a large number of sessions will result in a long processing time due to the network data
         transfer.
 
+        Since SessionData can only be created on a PC that has a valid acquisition system config, this function will
+        only work on a machine that is part of an active Sun lab acquisition system.
+
     Args:
         root_directory: The directory that stores one or more Tyche animal folders. This can be conceptualized as the
            root directory for the Tyche project.
-        output_root_directory: The path to the local directory where to generate the converted Tyche project hierarchy.
-            Typically, this is the 'root' directory where all other Sun lab projects are stored.
-        server_root_directory: The path to the local filesystem-mounted BioHPC server storage directory. Note, this
-            directory hs to be mapped to the local filesystem via the SMB or equivalent protocol.
     """
     # Generates a (shared) project configuration file.
    project_configuration = ProjectConfiguration()
 
-    #
-
-
-
-    temp_nas_dir = Path(tempfile.mkdtemp(prefix="nas_temp_"))
-    temp_mesoscope_dir = Path(tempfile.mkdtemp(prefix="mesoscope_temp_"))
+    # The acquisition system config resolves most paths and filesystem configuration arguments
+    acquisition_system = get_system_configuration_data()
+    output_root_directory = acquisition_system.paths.root_directory
+    server_root_directory = acquisition_system.paths.server_storage_directory
 
     # Statically defines project name and local root paths
-
-    project_configuration.
-    project_configuration.local_server_directory = server_root_directory
-    project_configuration.local_nas_directory = temp_nas_dir
-    project_configuration.local_mesoscope_directory = temp_mesoscope_dir
+    project_name = "Tyche"
+    project_configuration.project_name = project_name
 
     # Uses nonsensical google sheet IDs. Tyche project did not use Google Sheet processing like our modern projects do.
     project_configuration.water_log_sheet_id = "1xFh9Q2zT7pL3mVkJdR8bN6yXoE4wS5aG0cHu2Kf7D3v"

@@ -219,13 +212,14 @@ def ascend_tyche_data(root_directory: Path, output_root_directory: Path, server_
 
     # Dumps project configuration into the 'configuration' subfolder of the Tyche project.
     configuration_path = output_root_directory.joinpath("Tyche", "configuration", "project_configuration.yaml")
+    ensure_directory_exists(configuration_path)
     project_configuration.save(path=configuration_path)
 
     # Assumes that root directory stores all animal folders to be processed
     for animal_folder in root_directory.iterdir():
         # Each animal folder is named to include project name and a static animal ID, e.g.: Tyche-A7. This extracts each
         # animal ID.
-        animal_name = animal_folder.
+        animal_name = animal_folder.stem.split(sep="-")[1]
 
         # Under each animal root folder, there are day folders that use YYYY-MM-DD timestamps
         for session_folder in animal_folder.iterdir():

@@ -240,11 +234,11 @@ def ascend_tyche_data(root_directory: Path, output_root_directory: Path, server_
                 # session data hierarchy using the output root. This generates a 'standard' Sun lab directory structure
                 # for the Tyche data.
                 session_data = SessionData.create(
+                    project_name=project_configuration.project_name,
                     session_name=session_name,
                     animal_id=animal_name,
-
-
-                    experiment_name=None,  # Has to be none, otherwise the system tries to copy a configuration file.
+                    session_type="mesoscope experiment",
+                    experiment_name=None,
                 )
 
                 # Moves the data from the old hierarchy to the new hierarchy. If the process runs as expected, and

@@ -259,15 +253,22 @@ def ascend_tyche_data(root_directory: Path, output_root_directory: Path, server_
                     # noinspection PyTypeChecker
                     console.echo(message=message, level=LogLevel.WARNING)
                 else:
-                    #
+                    # Generates the telomere.bin file to mark the session as 'complete'
+                    session_data.raw_data.telomere_path.touch()
+
+                    # If the local transfer process was successful, generates a new checksum for the moved data
                     calculate_directory_checksum(directory=Path(session_data.raw_data.raw_data_path))
+
                     # Next, copies the data to the BioHPC server for further processing
                     transfer_directory(
                         source=Path(session_data.raw_data.raw_data_path),
-                        destination=Path(
+                        destination=Path(
+                            server_root_directory.joinpath(project_name, animal_name, session_name, "raw_data")
+                        ),
                         verify_integrity=False,
                     )
-
+
+                # Removes the now-empty old session data directory.
                 acquisition_folder.rmdir()
 
                 # If the loop above removed all acquisition folders, all data for that day has been successfully converted
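Reviewer note: the rewrite replaces the two explicit path arguments with paths resolved from the acquisition system configuration. A minimal sketch of the new resolution pattern, assuming the machine has a valid Sun lab acquisition system config (attribute names are taken from the added lines above; the print is illustrative only):

from sl_shared_assets.data_classes import get_system_configuration_data

# Resolves filesystem roots from the local acquisition system config, as the
# rewritten ascend_tyche_data() now does internally instead of taking arguments.
acquisition_system = get_system_configuration_data()
output_root = acquisition_system.paths.root_directory
server_root = acquisition_system.paths.server_storage_directory
print(output_root, server_root)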
sl_shared_assets/tools/ascension_tools.pyi

@@ -0,0 +1,68 @@
+from pathlib import Path
+
+from ..data_classes import (
+    SessionData as SessionData,
+    ProjectConfiguration as ProjectConfiguration,
+    get_system_configuration_data as get_system_configuration_data,
+)
+from .transfer_tools import transfer_directory as transfer_directory
+from .packaging_tools import calculate_directory_checksum as calculate_directory_checksum
+
+def _generate_session_name(acquisition_path: Path) -> str:
+    """Generates a session name using the last modification time of a zstack.mat or MotionEstimator.me file.
+
+    This worker function uses one of the motion estimation files stored in each Tyche 'acquisition' subfolder to
+    generate a modern Sun lab timestamp-based session name. This is used to translate the original Tyche session naming
+    pattern into the pattern used by all modern Sun lab projects and pipelines.
+
+    Args:
+        acquisition_path: The absolute path to the target acquisition folder. These folders are found under the 'day'
+            folders for each animal, e.g.: Tyche-A7/2022_01_03/1.
+
+    Returns:
+        The modernized session name.
+    """
+
+def _reorganize_data(session_data: SessionData, source_root: Path) -> bool:
+    """Reorganizes and moves the session's data from the source folder in the old Tyche data hierarchy to the raw_data
+    folder in the newly created modern hierarchy.
+
+    This worker function is used to physically rearrange the data from the original Tyche data structure to the
+    new data structure. It both moves the existing files to their new destinations and renames certain files to match
+    the modern naming convention used in the Sun lab.
+
+    Args:
+        session_data: The initialized SessionData instance managing the 'ascended' (modernized) session data hierarchy.
+        source_root: The absolute path to the old Tyche data hierarchy folder that stores session's data.
+
+    Returns:
+        True if the ascension process was successfully completed. False if the process encountered missing data or
+        otherwise did not go as expected. When the method returns False, the runtime function requests user intervention
+        to finalize the process manually.
+    """
+
+def ascend_tyche_data(root_directory: Path) -> None:
+    """Reformats the old Tyche data to use the modern Sun lab layout and metadata files.
+
+    This function is used to convert old Tyche data to the modern data management standard. This is used to make the
+    data compatible with the modern Sun lab data workflows.
+
+    Notes:
+        This function is statically written to work with the raw Tyche dataset featured in the OSM manuscript:
+        https://www.nature.com/articles/s41586-024-08548-w. Additionally, it assumes that the dataset has been
+        preprocessed with the early Sun lab mesoscope compression pipeline. The function will not work for any other
+        project or data hierarchy.
+
+        As part of its runtime, the function automatically transfers the ascended session data to the BioHPC server.
+        Since transferring the data over the network is the bottleneck of this pipeline, it runs in a single-threaded
+        mode and is constrained by the communication channel between the local machine and the BioHPC server. Calling
+        this function for a large number of sessions will result in a long processing time due to the network data
+        transfer.
+
+        Since SessionData can only be created on a PC that has a valid acquisition system config, this function will
+        only work on a machine that is part of an active Sun lab acquisition system.
+
+    Args:
+        root_directory: The directory that stores one or more Tyche animal folders. This can be conceptualized as the
+            root directory for the Tyche project.
+    """
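A minimal usage sketch for the new single-argument signature. The input path is hypothetical; the output and server roots are now resolved from the local acquisition system configuration rather than passed in:

from pathlib import Path

from sl_shared_assets.tools.ascension_tools import ascend_tyche_data

# Converts every Tyche-* animal folder under the (hypothetical) old project root
# to the modern Sun lab layout and uploads each ascended session to the BioHPC
# server. Must run on a machine with a valid acquisition system configuration.
ascend_tyche_data(root_directory=Path("/data/tyche_old"))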
sl_shared_assets/tools/packaging_tools.py

@@ -10,6 +10,19 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
 from tqdm import tqdm
 import xxhash
 
+# Defines a 'blacklist' set of files. Primarily, this lit contains the service files that may change after the session
+# data has been acquired. Therefore, it does not make sense to include them in the checksum, as they do not reflect the
+# data that should remain permanently unchanged. Note, make sure all service files are added to this set!
+_excluded_files = {
+    "ax_checksum.txt",
+    "ubiquitin.bin",
+    "telomere.bin",
+    "single_day_suite2p.bin",
+    "multi_day_suite2p.bin",
+    "behavior.bin",
+    "dlc.bin",
+}
+
 
 def _calculate_file_checksum(base_directory: Path, file_path: Path) -> tuple[str, bytes]:
     """Calculates xxHash3-128 checksum for a single file and its path relative to the base directory.

@@ -89,7 +102,7 @@ def calculate_directory_checksum(
     files = sorted(
         path
         for path in directory.rglob("*")
-        if path.is_file() and path.stem
+        if path.is_file() and f"{path.stem}{path.suffix}" not in _excluded_files  # Excludes service files
     )
 
     # Precreates the directory checksum
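The filter change is easier to read knowing that f"{path.stem}{path.suffix}" reconstructs the full file name, i.e. it is equivalent to path.name. A standalone sketch of the same exclusion logic, with an abridged copy of the blacklist and a hypothetical directory:

from pathlib import Path

# Abridged copy of the _excluded_files set shown in the hunk above.
_excluded_files = {"ax_checksum.txt", "telomere.bin"}

# Skips any file whose full name is in the blacklist; everything else is
# collected in sorted order, exactly as the checksum routine does.
root = Path("session/raw_data")  # hypothetical directory
files = sorted(p for p in root.rglob("*") if p.is_file() and p.name not in _excluded_files)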
sl_shared_assets/tools/packaging_tools.pyi

@@ -0,0 +1,56 @@
+from pathlib import Path
+
+from _typeshed import Incomplete
+
+_excluded_files: Incomplete
+
+def _calculate_file_checksum(base_directory: Path, file_path: Path) -> tuple[str, bytes]:
+    """Calculates xxHash3-128 checksum for a single file and its path relative to the base directory.
+
+    This function is passed to parallel workers used by the calculate_directory_hash() method that iteratively
+    calculates the checksum for all files inside a directory. Each call to this function returns the checksum for the
+    target file, which includes both the contents of the file and its path relative to the base directory.
+
+    Args:
+        base_directory: The path to the base (root) directory which is being checksummed by the main
+            'calculate_directory_checksum' function.
+        file_path: The absolute path to the target file.
+
+    Returns:
+        A tuple with two elements. The first element is the path to the file relative to the base directory. The second
+        element is the xxHash3-128 checksum that covers the relative path and the contents of the file.
+    """
+
+def calculate_directory_checksum(
+    directory: Path, num_processes: int | None = None, batch: bool = False, save_checksum: bool = True
+) -> str:
+    """Calculates xxHash3-128 checksum for the input directory, which includes the data of all contained files and
+    the directory structure information.
+
+    This function is used to generate a checksum for the raw_data directory of each experiment or training session.
+    Checksums are used to verify the session data integrity during transmission between the PC that acquired the data
+    and long-term storage locations, such as the Synology NAS or the BioHPC server. The function can be configured to
+    write the generated checksum as a hexadecimal string to the ax_checksum.txt file stored at the highest level of the
+    input directory.
+
+    Note:
+        This method uses multiprocessing to efficiently parallelize checksum calculation for multiple files. In
+        combination with xxHash3, this achieves a significant speedup over more common checksums, such as MD5 and
+        SHA256. Note that xxHash3 is not suitable for security purposes and is only used to ensure data integrity.
+
+        The method notifies the user about the checksum calculation process via the terminal.
+
+        The returned checksum accounts for both the contents of each file and the layout of the input directory
+        structure.
+
+    Args:
+        directory: The Path to the directory to be checksummed.
+        num_processes: The number of CPU processes to use for parallelizing checksum calculation. If set to None, the
+            function defaults to using (logical CPU count - 4).
+        batch: Determines whether the function is called as part of batch-processing multiple directories. This is used
+            to optimize progress reporting to avoid cluttering the terminal.
+        save_checksum: Determines whether the checksum should be saved (written to) a .txt file.
+
+    Returns:
+        The xxHash3-128 checksum for the input directory as a hexadecimal string.
+    """
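A usage sketch for the documented signature; the directory path is hypothetical:

from pathlib import Path

from sl_shared_assets.tools.packaging_tools import calculate_directory_checksum

# Checksums a session's raw_data directory. With save_checksum=True the hex
# digest is also written to ax_checksum.txt at the top of the directory;
# num_processes=None falls back to (logical CPU count - 4) per the docstring.
checksum = calculate_directory_checksum(
    directory=Path("session/raw_data"), num_processes=None, batch=False, save_checksum=True
)
print(checksum)  # xxHash3-128 digest as a hexadecimal string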
sl_shared_assets/tools/project_management_tools.py

@@ -0,0 +1,164 @@
+"""This module provides tools for managing the data of any Sun lab project. Tools from this module extend the
+functionality of SessionData class via a convenient API that allows working with the data of multiple sessions making
+up a given project."""
+
+from pathlib import Path
+
+import polars as pl
+
+from ..data_classes import SessionData
+from .packaging_tools import calculate_directory_checksum
+
+
+def generate_project_manifest(
+    raw_project_directory: Path, output_directory: Path, processed_project_directory: Path | None = None
+) -> None:
+    """Builds and saves the project manifest .feather file under the specified output directory.
+
+    This function evaluates the input project directory and builds the 'manifest' file for the project. The file
+    includes the descriptive information about every session stored inside the input project folder and the state of
+    session's data processing (which processing pipelines have been applied to each session). The file will be created
+    under the 'output_path' directory and use the following name pattern: {ProjectName}}_manifest.feather.
+
+    Notes:
+        The manifest file is primarily used to capture and move project state information between machines, typically
+        in the context of working with data stored on a remote compute server or cluster. However, it can also be used
+        on a local machine, since an up-to-date manifest file is required to run most data processing pipelines in the
+        lab regardless of the runtime context.
+
+    Args:
+        raw_project_directory: The path to the root project directory used to store raw session data.
+        output_directory: The path to the directory where to save the generated manifest file.
+        processed_project_directory: The path to the root project directory used to store processed session data if it
+            is different from the 'raw_project_directory'. Typically, this would be the case on remote compute server(s)
+            and not on local machines.
+    """
+    # Finds all raw data directories
+    session_directories = [directory.parent for directory in raw_project_directory.rglob("raw_data")]
+
+    # Precreates the 'manifest' dictionary structure
+    manifest: dict[str, list[str | bool]] = {
+        "animal": [],  # Animal IDs.
+        "session": [],  # Session names.
+        "type": [],  # Type of the session (e.g., Experiment, Training, etc.).
+        "raw_data": [],  # Server-side raw_data folder path.
+        "processed_data": [],  # Server-side processed_data folder path.
+        "complete": [],  # Determines if the session data is complete. Incomplete sessions are excluded from processing.
+        "single_day_suite2p": [],  # Determines whether the session has been processed with the single-day s2p pipeline.
+        "multi_day_suite2p": [],  # Determines whether the session has been processed with the multi-day s2p pipeline.
+        "behavior": [],  # Determines whether the session has been processed with the behavior extraction pipeline.
+        "dlc": [],  # Determines whether the session has been processed with the DeepLabCut pipeline.
+    }
+
+    # Loops over each session of every animal in the project and extracts session ID information and information
+    # about which processing steps have been successfully applied to the session.
+    for directory in session_directories:
+        # Instantiates the SessionData instance to resolve the paths to all session's data files and locations.
+        session_data = SessionData.load(
+            session_path=directory, processed_data_root=processed_project_directory, make_processed_data_directory=False
+        )
+
+        # Fills the manifest dictionary with data for the processed session:
+
+        # Extracts ID and data path information from the SessionData instance
+        manifest["animal"].append(session_data.animal_id)
+        manifest["session"].append(session_data.session_name)
+        manifest["type"].append(session_data.session_type)
+        manifest["raw_data"].append(str(session_data.raw_data.raw_data_path))
+        manifest["processed_data"].append(str(session_data.processed_data.processed_data_path))
+
+        # If the session raw_data folder contains the telomere.bin file, marks the session as complete.
+        manifest["complete"].append(session_data.raw_data.telomere_path.exists())
+
+        # If the session is incomplete, marks all processing steps as FALSE, as automatic processing is disabled for
+        # incomplete sessions.
+        if not manifest["complete"][-1]:
+            manifest["single_day_suite2p"].append(False)
+            manifest["multi_day_suite2p"].append(False)
+            manifest["behavior"].append(False)
+            manifest["dlc"].append(False)
+            continue  # Cycles to the next session
+
+        # If the session processed_data folder contains the single-day suite2p.bin file, marks the single-day suite2p
+        # processing step as complete.
+        manifest["single_day_suite2p"].append(session_data.processed_data.single_day_suite2p_bin_path.exists())
+
+        # If the session processed_data folder contains the multi-day suite2p.bin file, marks the multi-day suite2p
+        # processing step as complete.
+        manifest["multi_day_suite2p"].append(session_data.processed_data.multi_day_suite2p_bin_path.exists())
+
+        # If the session processed_data folder contains the behavior.bin file, marks the behavior processing step as
+        # complete.
+        manifest["behavior"].append(session_data.processed_data.behavior_data_path.exists())
+
+        # If the session processed_data folder contains the dlc.bin file, marks the dlc processing step as
+        # complete.
+        manifest["dlc"].append(session_data.processed_data.dlc_bin_path.exists())
+
+    # Converts the manifest dictionary to a Polars Dataframe
+    schema = {
+        "animal": pl.String,
+        "session": pl.String,
+        "raw_data": pl.String,
+        "processed_data": pl.String,
+        "type": pl.String,
+        "complete": pl.Boolean,
+        "single_day_suite2p": pl.Boolean,
+        "multi_day_suite2p": pl.Boolean,
+        "behavior": pl.Boolean,
+        "dlc": pl.Boolean,
+    }
+    df = pl.DataFrame(manifest, schema=schema)
+
+    # Sorts the DataFrame by animal and then session. Since we assign animal IDs sequentially and 'name' sessions based
+    # on acquisition timestamps, the sort order is chronological.
+    sorted_df = df.sort(["animal", "session"])
+
+    # Saves the generated manifest to the project-specific manifest .feather file for further processing.
+    sorted_df.write_ipc(
+        file=output_directory.joinpath(f"{raw_project_directory.stem}_manifest.feather"), compression="lz4"
+    )
+
+
+def verify_session_checksum(session_path: Path) -> bool:
+    """Verifies the integrity of the session's raw data by generating the checksum of the raw_data directory and
+    comparing it against the checksum stored in the ax_checksum.txt file.
+
+    Primarily, this function is used to verify data integrity after transferring it from a local PC to the remote
+    server for long-term storage. This function is designed to do nothing if the checksum matches and to remove the
+    'telomere.bin' marker file if it does not.
+
+    Notes:
+        Removing the telomere.bin marker file from session's raw_data folder marks the session as incomplete, excluding
+        it from all further automatic processing.
+
+    Args:
+        session_path: The path to the session directory to be verified. Note, the input session directory must contain
+            the 'raw_data' subdirectory.
+
+    Returns:
+        True if the checksum matches, False otherwise.
+    """
+
+    # Loads session data layout
+    session_data = SessionData.load(session_path=session_path)
+
+    # Re-calculates the checksum for the raw_data directory
+    calculated_checksum = calculate_directory_checksum(
+        directory=session_data.raw_data.raw_data_path, batch=False, save_checksum=False
+    )
+
+    # Loads the checksum stored inside the ax_checksum.txt file
+    with open(session_data.raw_data.checksum_path, "r") as f:
+        stored_checksum = f.read().strip()
+
+    # If the two checksums do not match, this likely indicates data corruption.
+    if stored_checksum != calculated_checksum:
+        # If the telomere.bin file exists, removes this file. This automatically marks the session as incomplete for
+        # all other Sun lab runtimes. The presence of the telomere.bin file after integrity verification is used as a
+        # heuristic for determining whether the session has passed the verification process.
+        if session_data.raw_data.telomere_path.exists():
+            session_data.raw_data.telomere_path.unlink()
+        return False
+
+    return True
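A sketch of the intended round trip, with hypothetical paths: build the manifest, then load the .feather file with polars and select the complete sessions that still need single-day suite2p processing:

from pathlib import Path

import polars as pl

from sl_shared_assets.tools.project_management_tools import generate_project_manifest

# Builds the manifest for a (hypothetical) local project directory.
generate_project_manifest(raw_project_directory=Path("root/Tyche"), output_directory=Path("manifests"))

# The output file follows the {ProjectName}_manifest.feather naming pattern.
manifest = pl.read_ipc(Path("manifests/Tyche_manifest.feather"))

# Complete sessions not yet processed with the single-day suite2p pipeline.
pending = manifest.filter(pl.col("complete") & ~pl.col("single_day_suite2p"))
print(pending.select(["animal", "session", "raw_data"]))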
sl_shared_assets/tools/project_management_tools.pyi

@@ -0,0 +1,48 @@
+from pathlib import Path
+
+from ..data_classes import SessionData as SessionData
+from .packaging_tools import calculate_directory_checksum as calculate_directory_checksum
+
+def generate_project_manifest(
+    raw_project_directory: Path, output_directory: Path, processed_project_directory: Path | None = None
+) -> None:
+    """Builds and saves the project manifest .feather file under the specified output directory.
+
+    This function evaluates the input project directory and builds the 'manifest' file for the project. The file
+    includes the descriptive information about every session stored inside the input project folder and the state of
+    session's data processing (which processing pipelines have been applied to each session). The file will be created
+    under the 'output_path' directory and use the following name pattern: {ProjectName}}_manifest.feather.
+
+    Notes:
+        The manifest file is primarily used to capture and move project state information between machines, typically
+        in the context of working with data stored on a remote compute server or cluster. However, it can also be used
+        on a local machine, since an up-to-date manifest file is required to run most data processing pipelines in the
+        lab regardless of the runtime context.
+
+    Args:
+        raw_project_directory: The path to the root project directory used to store raw session data.
+        output_directory: The path to the directory where to save the generated manifest file.
+        processed_project_directory: The path to the root project directory used to store processed session data if it
+            is different from the 'raw_project_directory'. Typically, this would be the case on remote compute server(s)
+            and not on local machines.
+    """
+
+def verify_session_checksum(session_path: Path) -> bool:
+    """Verifies the integrity of the session's raw data by generating the checksum of the raw_data directory and
+    comparing it against the checksum stored in the ax_checksum.txt file.
+
+    Primarily, this function is used to verify data integrity after transferring it from a local PC to the remote
+    server for long-term storage. This function is designed to do nothing if the checksum matches and to remove the
+    'telomere.bin' marker file if it does not.
+
+    Notes:
+        Removing the telomere.bin marker file from session's raw_data folder marks the session as incomplete, excluding
+        it from all further automatic processing.
+
+    Args:
+        session_path: The path to the session directory to be verified. Note, the input session directory must contain
+            the 'raw_data' subdirectory.
+
+    Returns:
+        True if the checksum matches, False otherwise.
+    """
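A usage sketch with a hypothetical session path; a False return value, not an exception, signals a checksum mismatch:

from pathlib import Path

from sl_shared_assets.tools.project_management_tools import verify_session_checksum

# Verifies a (hypothetical) transferred session directory that contains raw_data.
# On mismatch the function strips the telomere.bin marker, so downstream
# pipelines treat the session as incomplete.
if not verify_session_checksum(session_path=Path("server/Tyche/A7/some_session")):
    print("Integrity check failed: session marked incomplete")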
sl_shared_assets/tools/transfer_tools.pyi

@@ -0,0 +1,53 @@
+from pathlib import Path
+
+from .packaging_tools import calculate_directory_checksum as calculate_directory_checksum
+
+def _transfer_file(source_file: Path, source_directory: Path, destination_directory: Path) -> None:
+    """Copies the input file from the source directory to the destination directory while preserving the file metadata.
+
+    This is a worker method used by the transfer_directory() method to move multiple files in parallel.
+
+    Notes:
+        If the file is found under a hierarchy of subdirectories inside the input source_directory, that hierarchy will
+        be preserved in the destination directory.
+
+    Args:
+        source_file: The file to be copied.
+        source_directory: The root directory where the file is located.
+        destination_directory: The destination directory where to move the file.
+    """
+
+def transfer_directory(source: Path, destination: Path, num_threads: int = 1, verify_integrity: bool = True) -> None:
+    """Copies the contents of the input directory tree from source to destination while preserving the folder
+    structure.
+
+    This function is used to assemble the experimental data from all remote machines used in the acquisition process on
+    the VRPC before the data is preprocessed. It is also used to transfer the preprocessed data from the VRPC to the
+    SynologyNAS and the Sun lab BioHPC server.
+
+    Notes:
+        This method recreates the moved directory hierarchy on the destination if the hierarchy does not exist. This is
+        done before copying the files.
+
+        The method executes a multithreading copy operation. It does not clean up the source files. That job is handed
+        to the specific preprocessing function from the sl_experiment or sl-forgery libraries that calls this function.
+
+        If the method is configured to verify transferred file integrity, it reruns the xxHash3-128 checksum calculation
+        and compares the returned checksum to the one stored in the source directory. The method assumes that all input
+        directories contain the 'ax_checksum.txt' file that stores the 'source' directory checksum at the highest level
+        of the input directory tree.
+
+    Args:
+        source: The path to the directory that needs to be moved.
+        destination: The path to the destination directory where to move the contents of the source directory.
+        num_threads: The number of threads to use for parallel file transfer. This number should be set depending on the
+            type of transfer (local or remote) and is not guaranteed to provide improved transfer performance. For local
+            transfers, setting this number above 1 will likely provide a performance boost. For remote transfers using
+            a single TCP / IP socket (such as non-multichannel SMB protocol), the number should be set to 1.
+        verify_integrity: Determines whether to perform integrity verification for the transferred files. Note,
+            integrity verification is a time-consuming process and generally would not be a concern for most runtimes.
+            Therefore, it is often fine to disable this option to optimize method runtime speed.
+
+    Raises:
+        RuntimeError: If the transferred files do not pass the xxHas3-128 checksum integrity verification.
+    """
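A usage sketch with hypothetical paths, following the docstring's guidance for a single-socket SMB transfer:

from pathlib import Path

from sl_shared_assets.tools.transfer_tools import transfer_directory

# Copies a session's raw_data tree to an SMB-mounted server share.
# num_threads=1 suits a single-socket SMB link; verify_integrity=True re-runs the
# xxHash3-128 check against the ax_checksum.txt stored in the source directory.
transfer_directory(
    source=Path("local/Tyche/A7/some_session/raw_data"),
    destination=Path("server/Tyche/A7/some_session/raw_data"),
    num_threads=1,
    verify_integrity=True,
)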
{sl_shared_assets-1.0.0rc19.dist-info → sl_shared_assets-1.0.0rc21.dist-info}/METADATA

@@ -1,10 +1,10 @@
 Metadata-Version: 2.4
 Name: sl-shared-assets
-Version: 1.0.0rc19
+Version: 1.0.0rc21
 Summary: Stores assets shared between multiple Sun (NeuroAI) lab data pipelines.
 Project-URL: Homepage, https://github.com/Sun-Lab-NBB/sl-shared-assets
 Project-URL: Documentation, https://sl-shared-assets-api-docs.netlify.app/
-Author: Ivan Kondratyev, Kushaan Gupta, Yuantao Deng
+Author: Ivan Kondratyev, Kushaan Gupta, Yuantao Deng, Natalie Yeung
 Maintainer-email: Ivan Kondratyev <ik278@cornell.edu>
 License: GNU GENERAL PUBLIC LICENSE
         Version 3, 29 June 2007

@@ -695,8 +695,10 @@ Requires-Dist: ataraxis-base-utilities<4,>=3
 Requires-Dist: ataraxis-data-structures<4,>=3.1.1
 Requires-Dist: ataraxis-time<4,>=3
 Requires-Dist: click<9,>=8
-Requires-Dist:
+Requires-Dist: natsort<9,>=8
 Requires-Dist: paramiko<4,>=3.5.1
+Requires-Dist: polars<2,>=1
+Requires-Dist: pyarrow<21,>=20
 Requires-Dist: simple-slurm<1,>=0
 Requires-Dist: tqdm<5,>=4
 Requires-Dist: xxhash<4,>=3

@@ -717,8 +719,10 @@ Requires-Dist: types-tqdm<5,>=4; extra == 'conda'
 Provides-Extra: condarun
 Requires-Dist: appdirs<2,>=1; extra == 'condarun'
 Requires-Dist: click<9,>=8; extra == 'condarun'
-Requires-Dist:
+Requires-Dist: natsort<9,>=8; extra == 'condarun'
 Requires-Dist: paramiko<4,>=3.5.1; extra == 'condarun'
+Requires-Dist: polars<2,>=1; extra == 'condarun'
+Requires-Dist: pyarrow<21,>=20; extra == 'condarun'
 Requires-Dist: tqdm<5,>=4; extra == 'condarun'
 Provides-Extra: dev
 Requires-Dist: ataraxis-automation<5,>=4; extra == 'dev'

@@ -781,6 +785,7 @@ acquisition and processing and provides the API for accessing the lab’s main c
 
 - [Dependencies](#dependencies)
 - [Installation](#installation)
+- [Usage](#usage)
 - [API Documentation](#api-documentation)
 - [Versioning](#versioning)
 - [Authors](#authors)

@@ -811,11 +816,22 @@ Use the following command to install the library using pip: ```pip install sl-sh
 
 ---
 
+## Usage
+
+All library components are intended to be used via other Sun lab libraries. Developers should study the API and CLI
+documentation below to learn how to use library components in other Sun lab libraries.
+
+---
+
 ## API Documentation
 
 See the [API documentation](https://sl-shared-assets-api-docs.netlify.app/) for the
 detailed description of the methods and classes exposed by components of this library.
 
+**Note!** The API documentation includes important information about Command-Line-Interfaces (CLIs) exposed by this
+library as part of installation into a Python environment. All users are highly encouraged to study the CLI
+documentation to learn how to use library components via the terminal.
+
 ___
 
 ## Versioning

@@ -830,6 +846,7 @@ We use [semantic versioning](https://semver.org/) for this project. For the vers
 - Ivan Kondratyev ([Inkaros](https://github.com/Inkaros))
 - Kushaan Gupta ([kushaangupta](https://github.com/kushaangupta))
 - Yuantao Deng ([YuantaoDeng](https://github.com/YuantaoDeng))
+- Natalie Yeung
 
 ___
 