sl-shared-assets 4.0.0-py3-none-any.whl → 5.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sl-shared-assets might be problematic.
- sl_shared_assets/__init__.py +45 -42
- sl_shared_assets/command_line_interfaces/__init__.py +3 -0
- sl_shared_assets/command_line_interfaces/configure.py +173 -0
- sl_shared_assets/command_line_interfaces/manage.py +226 -0
- sl_shared_assets/data_classes/__init__.py +33 -32
- sl_shared_assets/data_classes/configuration_data.py +267 -79
- sl_shared_assets/data_classes/runtime_data.py +11 -11
- sl_shared_assets/data_classes/session_data.py +226 -289
- sl_shared_assets/data_classes/surgery_data.py +6 -6
- sl_shared_assets/server/__init__.py +24 -4
- sl_shared_assets/server/job.py +6 -7
- sl_shared_assets/server/pipeline.py +570 -0
- sl_shared_assets/server/server.py +57 -25
- sl_shared_assets/tools/__init__.py +9 -8
- sl_shared_assets/tools/packaging_tools.py +14 -25
- sl_shared_assets/tools/project_management_tools.py +602 -523
- sl_shared_assets/tools/transfer_tools.py +88 -23
- {sl_shared_assets-4.0.0.dist-info → sl_shared_assets-5.0.0.dist-info}/METADATA +46 -202
- sl_shared_assets-5.0.0.dist-info/RECORD +23 -0
- sl_shared_assets-5.0.0.dist-info/entry_points.txt +3 -0
- sl_shared_assets/__init__.pyi +0 -91
- sl_shared_assets/cli.py +0 -500
- sl_shared_assets/cli.pyi +0 -106
- sl_shared_assets/data_classes/__init__.pyi +0 -75
- sl_shared_assets/data_classes/configuration_data.pyi +0 -235
- sl_shared_assets/data_classes/runtime_data.pyi +0 -157
- sl_shared_assets/data_classes/session_data.pyi +0 -379
- sl_shared_assets/data_classes/surgery_data.pyi +0 -89
- sl_shared_assets/server/__init__.pyi +0 -11
- sl_shared_assets/server/job.pyi +0 -205
- sl_shared_assets/server/server.pyi +0 -298
- sl_shared_assets/tools/__init__.pyi +0 -19
- sl_shared_assets/tools/ascension_tools.py +0 -265
- sl_shared_assets/tools/ascension_tools.pyi +0 -68
- sl_shared_assets/tools/packaging_tools.pyi +0 -58
- sl_shared_assets/tools/project_management_tools.pyi +0 -239
- sl_shared_assets/tools/transfer_tools.pyi +0 -53
- sl_shared_assets-4.0.0.dist-info/RECORD +0 -36
- sl_shared_assets-4.0.0.dist-info/entry_points.txt +0 -7
- {sl_shared_assets-4.0.0.dist-info → sl_shared_assets-5.0.0.dist-info}/WHEEL +0 -0
- {sl_shared_assets-4.0.0.dist-info → sl_shared_assets-5.0.0.dist-info}/licenses/LICENSE +0 -0
sl_shared_assets/tools/ascension_tools.py
@@ -1,265 +0,0 @@
-"""This module provides tools for translating ('ascending') old Tyche data to use the modern data structure used in the
-Sun lab. The tools from this module will not work for any other data and also assume that the Tyche data has been
-preprocessed with an early version of the Sun lab mesoscope processing pipeline. However, this module can be used as
-an example for how to convert other data formats to match the Sun lab data structure."""
-
-from pathlib import Path
-import datetime
-
-import numpy as np
-from ataraxis_base_utilities import LogLevel, console
-from ataraxis_time.time_helpers import extract_timestamp_from_bytes
-
-from ..data_classes import SessionData, SessionTypes, get_system_configuration_data
-from .transfer_tools import transfer_directory
-from .packaging_tools import calculate_directory_checksum
-
-
-def _generate_session_name(acquisition_path: Path) -> str:
-    """Generates a session name using the last modification time of a zstack.mat or MotionEstimator.me file.
-
-    This worker function uses one of the motion estimation files stored in each Tyche 'acquisition' subfolder to
-    generate a modern Sun lab timestamp-based session name. This is used to translate the original Tyche session naming
-    pattern into the pattern used by all modern Sun lab projects and pipelines.
-
-    Args:
-        acquisition_path: The absolute path to the target acquisition folder. These folders are found under the 'day'
-            folders for each animal, e.g.: Tyche-A7/2022_01_03/1.
-
-    Returns:
-        The modernized session name.
-    """
-
-    # All well-formed sessions are expected to contain both the zstack.mat and the MotionEstimator.me file.
-    # We use the last modification time from one of these files to infer when the session was carried out. This allows
-    # us to gather the time information, which is missing from the original session naming pattern.
-    source: Path
-    if acquisition_path.joinpath("zstack.mat").exists():
-        source = acquisition_path.joinpath("zstack.mat")
-    elif acquisition_path.joinpath("MotionEstimator.me").exists():
-        source = acquisition_path.joinpath("MotionEstimator.me")
-    else:
-        message = (
-            f"Unable to find zstack.mat or MotionEstimator.me file in the target acquisition subfolder "
-            f"{acquisition_path} of the session {acquisition_path.parent}. Manual intervention is required to ascend "
-            f"the target session folder to the latest Sun lab data format."
-        )
-        console.error(message=message, error=FileNotFoundError)
-        raise FileNotFoundError(message)  # Fallback to appease mypy
-
-    # Gets the last modified time (available on all platforms) and converts it to a UTC timestamp object.
-    mod_time = source.stat().st_mtime
-    mod_datetime = datetime.datetime.fromtimestamp(mod_time)
-
-    # Converts the timestamp to microseconds as uint64, then to an array of 8 uint8 bytes. The array is then
-    # reformatted to match the session name pattern used in the modern Sun lab data pipelines.
-    timestamp_microseconds = np.uint64(int(mod_datetime.timestamp() * 1_000_000))
-    timestamp_bytes = np.array([(timestamp_microseconds >> (8 * i)) & 0xFF for i in range(8)], dtype=np.uint8)
-    stamp = extract_timestamp_from_bytes(timestamp_bytes=timestamp_bytes)
-
-    # Returns the generated session name to the caller.
-    return stamp
-
-
-def _reorganize_data(session_data: SessionData, source_root: Path) -> bool:
-    """Reorganizes and moves the session's data from the source folder in the old Tyche data hierarchy to the raw_data
-    folder in the newly created modern hierarchy.
-
-    This worker function is used to physically rearrange the data from the original Tyche data structure to the
-    new data structure. It both moves the existing files to their new destinations and renames certain files to match
-    the modern naming convention used in the Sun lab.
-
-    Args:
-        session_data: The initialized SessionData instance managing the 'ascended' (modernized) session data hierarchy.
-        source_root: The absolute path to the old Tyche data hierarchy folder that stores the session's data.
-
-    Returns:
-        True if the ascension process was successfully completed. False if the process encountered missing data or
-        otherwise did not go as expected. When the method returns False, the runtime function requests user
-        intervention to finalize the process manually.
-    """
-
-    # Resolves expected data targets:
-
-    # These files should be present in all well-formed session data folders. While not all session folders are
-    # well-formed, we will likely exclude any non-well-formed folders from processing.
-    zstack_path = source_root.joinpath("zstack.mat")
-    motion_estimator_path = source_root.joinpath("MotionEstimator.me")
-    ops_path = source_root.joinpath("ops.json")
-    mesoscope_frames_path = source_root.joinpath("mesoscope_frames")
-    ax_checksum_path = source_root.joinpath("ax_checksum.txt")
-
-    # These two file types are present for some, but not all, folders. They are not as important as the files
-    # mentioned above, though, as the data stored in these files is currently not used during processing.
-    frame_metadata_path = source_root.joinpath("frame_metadata.npz")
-    metadata_path = source_root.joinpath("metadata.json")
-
-    # This tracker is used to mark the session for manual intervention if any expected data is missing from the source
-    # session folder. At the end of this function's runtime, it determines whether the function returns True or False.
-    data_missing = False
-
-    # First, moves the mesoscope TIFF stacks to the newly created session data hierarchy as the mesoscope_data
-    # subfolder.
-    if mesoscope_frames_path.exists():
-        mesoscope_frames_path.rename(session_data.raw_data.mesoscope_data_path)
-    else:
-        data_missing = True
-
-    # Then, moves 'loose' mesoscope-related data files to the mesoscope_data folder.
-    if zstack_path.exists():
-        zstack_path.rename(Path(session_data.raw_data.mesoscope_data_path).joinpath("zstack.mat"))
-    else:
-        data_missing = True
-
-    if motion_estimator_path.exists():
-        motion_estimator_path.rename(Path(session_data.raw_data.mesoscope_data_path).joinpath("MotionEstimator.me"))
-    else:
-        data_missing = True
-
-    if ops_path.exists():
-        ops_path.rename(Path(session_data.raw_data.mesoscope_data_path).joinpath("ops.json"))
-    else:
-        data_missing = True
-
-    # If the variant and invariant metadata files exist, also moves them to the mesoscope data folder and renames the
-    # files to use the latest naming convention. Missing any of these files is not considered a
-    # user-intervention-worthy situation.
-    if frame_metadata_path.exists():
-        frame_metadata_path.rename(
-            Path(session_data.raw_data.mesoscope_data_path).joinpath("frame_variant_metadata.npz")
-        )
-    if metadata_path.exists():
-        metadata_path.rename(Path(session_data.raw_data.mesoscope_data_path).joinpath("frame_invariant_metadata.json"))
-
-    # Loops over all camera video files (using the .avi extension) and moves them to the camera_data folder.
-    videos_found = 0
-    for video in source_root.glob("*.avi"):
-        videos_found += 1
-        video.rename(Path(session_data.raw_data.camera_data_path).joinpath(video.name))
-    if videos_found == 0:
-        data_missing = True
-
-    # Loops over all behavior log files (old GIMBL format) and moves them to the behavior_data folder.
-    logs_found = 0
-    for log in source_root.glob("Log Tyche-* ????-??-?? session *.json"):
-        logs_found += 1
-        log.rename(Path(session_data.raw_data.behavior_data_path).joinpath(log.name))
-    if logs_found == 0:
-        data_missing = True
-
-    # Removes the checksum file if it exists. Due to file name and location changes, the session data folder has to
-    # be re-checksummed after the reorganization anyway, so there is no need to keep the original file.
-    ax_checksum_path.unlink(missing_ok=True)
-
-    # Loops over all remaining contents of the directory.
-    for path in source_root.glob("*"):
-        # At this point, there should be no more subfolders left inside the root directory. If there are more
-        # subfolders, this case requires user intervention.
-        if path.is_dir():
-            data_missing = True
-
-        # All non-subfolder files are moved to the root raw_data directory of the newly created session.
-        else:
-            path.rename(Path(session_data.raw_data.raw_data_path).joinpath(path.name))
-
-    # Session data has been fully reorganized. Depending on whether there was any missing data during processing,
-    # returns the boolean flag for whether user intervention is required.
-    if data_missing:
-        return False
-    else:
-        return True
-
-
-def ascend_tyche_data(root_directory: Path) -> None:
-    """Reformats the old Tyche data to use the modern Sun lab layout and metadata files.
-
-    This function is used to convert old Tyche data to the modern data management standard. This is used to make the
-    data compatible with the modern Sun lab data workflows.
-
-    Notes:
-        This function is statically written to work with the raw Tyche dataset featured in the OSM manuscript:
-        https://www.nature.com/articles/s41586-024-08548-w. Additionally, it assumes that the dataset has been
-        preprocessed with the early Sun lab mesoscope compression pipeline. The function will not work for any other
-        project or data hierarchy.
-
-        As part of its runtime, the function automatically transfers the ascended session data to the BioHPC server.
-        Since transferring the data over the network is the bottleneck of this pipeline, it runs in a single-threaded
-        mode and is constrained by the communication channel between the local machine and the BioHPC server. Calling
-        this function for a large number of sessions will result in a long processing time due to the network data
-        transfer.
-
-        Since SessionData can only be created on a PC that has a valid acquisition system config, this function will
-        only work on a machine that is part of an active Sun lab acquisition system.
-
-    Args:
-        root_directory: The directory that stores one or more Tyche animal folders. This can be conceptualized as the
-            root directory for the Tyche project.
-    """
-    # The acquisition system config resolves most paths and filesystem configuration arguments.
-    acquisition_system = get_system_configuration_data()
-    server_root_directory = acquisition_system.paths.server_storage_directory
-
-    # Statically defines the project name and local root paths.
-    project_name = "Tyche"
-
-    # Assumes that the root directory stores all animal folders to be processed.
-    for animal_folder in root_directory.iterdir():
-        # Each animal folder is named to include a project name and a static animal ID, e.g.: Tyche-A7. This extracts
-        # each animal ID.
-        animal_name = animal_folder.stem.split(sep="-")[1]
-
-        # Under each animal root folder, there are day folders that use YYYY-MM-DD timestamps.
-        for session_folder in animal_folder.iterdir():
-            # Inside each day folder, there are one or more acquisitions (sessions).
-            for acquisition_folder in session_folder.iterdir():
-                # For each session, we extract the modification time from either the zstack.mat (preferred) or the
-                # MotionEstimator.me file. Any session without these files is flagged for additional user intervention.
-                # This procedure generates timestamp-based session names, analogous to how our modern pipeline does it.
-                session_name = _generate_session_name(acquisition_path=acquisition_folder)
-
-                # Uses the derived session name and the project name to create the session data hierarchy under the
-                # output root. This generates a 'standard' Sun lab directory structure for the Tyche data.
-                session_data = SessionData.create(
-                    project_name=project_name,
-                    session_name=session_name,
-                    animal_id=animal_name,
-                    session_type=SessionTypes.MESOSCOPE_EXPERIMENT,
-                    experiment_name=None,
-                )
-
-                # Since this runtime reprocesses already acquired data, marks the session as fully initialized.
-                session_data.runtime_initialized()
-
-                # Moves the data from the old hierarchy to the new hierarchy. If the process runs as expected and
-                # fully empties the source acquisition folder, it destroys the folder. Otherwise, notifies the user
-                # that the runtime did not fully process the session data and requests intervention.
-                success = _reorganize_data(session_data, acquisition_folder)
-                if not success:
-                    message = (
-                        f"Encountered issues when reorganizing {animal_name} session {session_name}. "
-                        f"User intervention is required to finish the data reorganization process for this session."
-                    )
-                    # noinspection PyTypeChecker
-                    console.echo(message=message, level=LogLevel.WARNING)
-                else:
-                    # Generates the telomere.bin file to mark the session as 'complete'.
-                    session_data.raw_data.telomere_path.touch()
-
-                    # If the local transfer process was successful, generates a new checksum for the moved data.
-                    calculate_directory_checksum(directory=Path(session_data.raw_data.raw_data_path))
-
-                    # Next, copies the data to the BioHPC server for further processing.
-                    transfer_directory(
-                        source=Path(session_data.raw_data.raw_data_path),
-                        destination=Path(
-                            server_root_directory.joinpath(project_name, animal_name, session_name, "raw_data")
-                        ),
-                        verify_integrity=False,
-                    )
-
-                    # Removes the now-empty old session data directory.
-                    acquisition_folder.rmdir()
-
-        # If the loop above removed all acquisition folders, all data for that day has been successfully converted
-        # to use the new session format. Removes the now-empty 'day' folder from the target animal.
-        if len([folder for folder in session_folder.iterdir()]) == 0:
-            session_folder.rmdir()
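The densest step in the removed `_generate_session_name` function is the timestamp byte-packing. Below is a minimal, self-contained sketch of that step: the example datetime is a hypothetical stand-in for the file modification time, and the decode half is done in plain Python here, whereas the package delegates decoding to `ataraxis_time.time_helpers.extract_timestamp_from_bytes`.

```python
# Standalone sketch of the little-endian byte-packing used above. The datetime
# value is hypothetical (the real code reads st_mtime from zstack.mat or
# MotionEstimator.me), and decoding is inlined instead of using ataraxis_time.
import datetime

import numpy as np

mod_datetime = datetime.datetime(2022, 1, 3, 14, 30, 15, 123456)

# Pack the microsecond-precision UNIX timestamp into 8 little-endian uint8 bytes.
timestamp_microseconds = int(mod_datetime.timestamp() * 1_000_000)
timestamp_bytes = np.array(
    [(timestamp_microseconds >> (8 * i)) & 0xFF for i in range(8)], dtype=np.uint8
)

# Reverse the packing to confirm the representation is lossless.
recovered = sum(int(byte) << (8 * i) for i, byte in enumerate(timestamp_bytes))
assert recovered == timestamp_microseconds
print(datetime.datetime.fromtimestamp(recovered / 1_000_000))
```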
sl_shared_assets/tools/ascension_tools.pyi
@@ -1,68 +0,0 @@
-from pathlib import Path
-
-from ..data_classes import (
-    SessionData as SessionData,
-    SessionTypes as SessionTypes,
-    get_system_configuration_data as get_system_configuration_data,
-)
-from .transfer_tools import transfer_directory as transfer_directory
-from .packaging_tools import calculate_directory_checksum as calculate_directory_checksum
-
-def _generate_session_name(acquisition_path: Path) -> str:
-    """Generates a session name using the last modification time of a zstack.mat or MotionEstimator.me file.
-
-    This worker function uses one of the motion estimation files stored in each Tyche 'acquisition' subfolder to
-    generate a modern Sun lab timestamp-based session name. This is used to translate the original Tyche session naming
-    pattern into the pattern used by all modern Sun lab projects and pipelines.
-
-    Args:
-        acquisition_path: The absolute path to the target acquisition folder. These folders are found under the 'day'
-            folders for each animal, e.g.: Tyche-A7/2022_01_03/1.
-
-    Returns:
-        The modernized session name.
-    """
-
-def _reorganize_data(session_data: SessionData, source_root: Path) -> bool:
-    """Reorganizes and moves the session's data from the source folder in the old Tyche data hierarchy to the raw_data
-    folder in the newly created modern hierarchy.
-
-    This worker function is used to physically rearrange the data from the original Tyche data structure to the
-    new data structure. It both moves the existing files to their new destinations and renames certain files to match
-    the modern naming convention used in the Sun lab.
-
-    Args:
-        session_data: The initialized SessionData instance managing the 'ascended' (modernized) session data hierarchy.
-        source_root: The absolute path to the old Tyche data hierarchy folder that stores the session's data.
-
-    Returns:
-        True if the ascension process was successfully completed. False if the process encountered missing data or
-        otherwise did not go as expected. When the method returns False, the runtime function requests user
-        intervention to finalize the process manually.
-    """
-
-def ascend_tyche_data(root_directory: Path) -> None:
-    """Reformats the old Tyche data to use the modern Sun lab layout and metadata files.
-
-    This function is used to convert old Tyche data to the modern data management standard. This is used to make the
-    data compatible with the modern Sun lab data workflows.
-
-    Notes:
-        This function is statically written to work with the raw Tyche dataset featured in the OSM manuscript:
-        https://www.nature.com/articles/s41586-024-08548-w. Additionally, it assumes that the dataset has been
-        preprocessed with the early Sun lab mesoscope compression pipeline. The function will not work for any other
-        project or data hierarchy.
-
-        As part of its runtime, the function automatically transfers the ascended session data to the BioHPC server.
-        Since transferring the data over the network is the bottleneck of this pipeline, it runs in a single-threaded
-        mode and is constrained by the communication channel between the local machine and the BioHPC server. Calling
-        this function for a large number of sessions will result in a long processing time due to the network data
-        transfer.
-
-        Since SessionData can only be created on a PC that has a valid acquisition system config, this function will
-        only work on a machine that is part of an active Sun lab acquisition system.
-
-    Args:
-        root_directory: The directory that stores one or more Tyche animal folders. This can be conceptualized as the
-            root directory for the Tyche project.
-    """
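For orientation, driving this removed module amounted to one call to its public entry point. A minimal invocation sketch, assuming a 4.0.0 install and a hypothetical root path that holds the Tyche-* animal folders; per the docstring's Notes, it only runs on a machine with a valid Sun lab acquisition system configuration:

```python
# Hypothetical invocation of the 4.0.0 entry point (removed in 5.0.0).
from pathlib import Path

from sl_shared_assets.tools.ascension_tools import ascend_tyche_data

# Placeholder path: a directory that stores one or more Tyche-* animal folders.
ascend_tyche_data(root_directory=Path("/data/tyche_raw"))
```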
sl_shared_assets/tools/packaging_tools.pyi
@@ -1,58 +0,0 @@
-from pathlib import Path
-
-from _typeshed import Incomplete
-
-from ..data_classes import TrackerFileNames as TrackerFileNames
-
-_excluded_files: Incomplete
-
-def _calculate_file_checksum(base_directory: Path, file_path: Path) -> tuple[str, bytes]:
-    """Calculates the xxHash3-128 checksum for a single file and its path relative to the base directory.
-
-    This function is passed to the parallel workers used by the calculate_directory_checksum() function that
-    iteratively calculates the checksum for all files inside a directory. Each call to this function returns the
-    checksum for the target file, which includes both the contents of the file and its path relative to the base
-    directory.
-
-    Args:
-        base_directory: The path to the base (root) directory which is being checksummed by the main
-            'calculate_directory_checksum' function.
-        file_path: The absolute path to the target file.
-
-    Returns:
-        A tuple with two elements. The first element is the path to the file relative to the base directory. The second
-        element is the xxHash3-128 checksum that covers the relative path and the contents of the file.
-    """
-
-def calculate_directory_checksum(
-    directory: Path, num_processes: int | None = None, batch: bool = False, save_checksum: bool = True
-) -> str:
-    """Calculates the xxHash3-128 checksum for the input directory, which includes the data of all contained files and
-    the directory structure information.
-
-    This function is used to generate a checksum for the raw_data directory of each experiment or training session.
-    Checksums are used to verify the session data integrity during transmission between the PC that acquired the data
-    and long-term storage locations, such as the Synology NAS or the BioHPC server. The function can be configured to
-    write the generated checksum as a hexadecimal string to the ax_checksum.txt file stored at the highest level of the
-    input directory.
-
-    Note:
-        This method uses multiprocessing to efficiently parallelize checksum calculation for multiple files. In
-        combination with xxHash3, this achieves a significant speedup over more common checksums, such as MD5 and
-        SHA256. Note that xxHash3 is not suitable for security purposes and is only used to ensure data integrity.
-
-        The method notifies the user about the checksum calculation process via the terminal.
-
-        The returned checksum accounts for both the contents of each file and the layout of the input directory
-        structure.
-
-    Args:
-        directory: The path to the directory to be checksummed.
-        num_processes: The number of CPU processes to use for parallelizing checksum calculation. If set to None, the
-            function defaults to using (logical CPU count - 4).
-        batch: Determines whether the function is called as part of batch-processing multiple directories. This is used
-            to optimize progress reporting to avoid cluttering the terminal.
-        save_checksum: Determines whether the checksum should be saved (written) to a .txt file.
-
-    Returns:
-        The xxHash3-128 checksum for the input directory as a hexadecimal string.
-    """
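The stub above fully specifies the checksum function's signature, so a usage sketch is straightforward; the session path below is a placeholder:

```python
# Usage sketch based on the stub signature above; the directory is hypothetical.
from pathlib import Path

from sl_shared_assets.tools.packaging_tools import calculate_directory_checksum

checksum = calculate_directory_checksum(
    directory=Path("/data/Tyche/A7/session_01/raw_data"),  # hypothetical raw_data folder
    num_processes=None,   # defaults to (logical CPU count - 4) per the docstring
    batch=False,          # standalone call, so full progress reporting
    save_checksum=True,   # also writes ax_checksum.txt at the directory root
)
print(checksum)  # the xxHash3-128 digest as a hexadecimal string
```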
sl_shared_assets/tools/project_management_tools.pyi
@@ -1,239 +0,0 @@
-from pathlib import Path
-
-import polars as pl
-
-from ..data_classes import (
-    SessionData as SessionData,
-    SessionTypes as SessionTypes,
-    TrackerFileNames as TrackerFileNames,
-    RunTrainingDescriptor as RunTrainingDescriptor,
-    LickTrainingDescriptor as LickTrainingDescriptor,
-    WindowCheckingDescriptor as WindowCheckingDescriptor,
-    MesoscopeExperimentDescriptor as MesoscopeExperimentDescriptor,
-    get_processing_tracker as get_processing_tracker,
-)
-from .packaging_tools import calculate_directory_checksum as calculate_directory_checksum
-
-class ProjectManifest:
-    """Wraps the contents of a Sun lab project manifest .feather file and exposes methods for visualizing and
-    working with the data stored inside the file.
-
-    This class functions as a high-level API for working with Sun lab projects. It is used both to visualize the
-    current state of various projects and during automated data processing to determine which processing steps to
-    apply to different sessions.
-
-    Args:
-        manifest_file: The path to the .feather manifest file that stores the target project's state data.
-
-    Attributes:
-        _data: Stores the manifest data as a Polars DataFrame.
-        _animal_string: Determines whether animal IDs are stored as strings or unsigned integers.
-    """
-
-    _data: pl.DataFrame
-    _animal_string: bool
-    def __init__(self, manifest_file: Path) -> None: ...
-    def print_data(self) -> None:
-        """Prints the entire contents of the manifest file to the terminal."""
-    def print_summary(self, animal: str | int | None = None) -> None:
-        """Prints a summary view of the manifest file to the terminal, excluding the 'experimenter notes' data for
-        each session.
-
-        This data view is optimized for tracking which processing steps have been applied to each session inside the
-        project.
-
-        Args:
-            animal: The ID of the animal for which to display the data. If an ID is provided, this method will only
-                display the data for that animal. Otherwise, it will display the data for all animals.
-        """
-    def print_notes(self, animal: str | int | None = None) -> None:
-        """Prints only the animal, session, and notes data from the manifest file.
-
-        This data view is optimized for experimenters to check what sessions have been recorded for each animal in the
-        project and refresh their memory on the outcomes of each session using experimenter notes.
-
-        Args:
-            animal: The ID of the animal for which to display the data. If an ID is provided, this method will only
-                display the data for that animal. Otherwise, it will display the data for all animals.
-        """
-    @property
-    def animals(self) -> tuple[str, ...]:
-        """Returns all unique animal IDs stored inside the manifest file.
-
-        This provides a tuple of all animal IDs participating in the target project.
-        """
-    def _get_filtered_sessions(
-        self,
-        animal: str | int | None = None,
-        exclude_incomplete: bool = True,
-        dataset_ready_only: bool = False,
-        not_dataset_ready_only: bool = False,
-    ) -> tuple[str, ...]:
-        """This worker method is used to get a list of sessions with optional filtering.
-
-        User-facing methods call this worker under the hood to fetch the filtered tuple of sessions.
-
-        Args:
-            animal: An optional animal ID to filter the sessions. If set to None, the method returns sessions for all
-                animals.
-            exclude_incomplete: Determines whether to exclude sessions not marked as 'complete' from the output
-                list.
-            dataset_ready_only: Determines whether to exclude sessions not marked as 'dataset' integration ready from
-                the output list. Enabling this option only shows sessions that can be integrated into a dataset.
-            not_dataset_ready_only: The opposite of 'dataset_ready_only'. Determines whether to exclude sessions marked
-                as 'dataset' integration ready from the output list. Note, when both this and 'dataset_ready_only' are
-                enabled, the 'dataset_ready_only' option takes precedence.
-
-        Returns:
-            The tuple of session IDs matching the filter criteria.
-
-        Raises:
-            ValueError: If the specified animal is not found in the manifest file.
-        """
-    @property
-    def sessions(self) -> tuple[str, ...]:
-        """Returns all session IDs stored inside the manifest file.
-
-        This property provides a tuple of all sessions, independent of the participating animal, that were recorded as
-        part of the target project. Use the get_sessions() method to get the list of session tuples with filtering.
-        """
-    def get_sessions(
-        self,
-        animal: str | int | None = None,
-        exclude_incomplete: bool = True,
-        dataset_ready_only: bool = False,
-        not_dataset_ready_only: bool = False,
-    ) -> tuple[str, ...]:
-        """Returns the requested session IDs based on the selected filtering criteria.
-
-        This method provides a tuple of sessions based on the specified filters. If no animal is specified, returns
-        sessions for all animals in the project.
-
-        Args:
-            animal: An optional animal ID to filter the sessions. If set to None, the method returns sessions for all
-                animals.
-            exclude_incomplete: Determines whether to exclude sessions not marked as 'complete' from the output
-                list.
-            dataset_ready_only: Determines whether to exclude sessions not marked as 'dataset' integration ready from
-                the output list. Enabling this option only shows sessions that can be integrated into a dataset.
-            not_dataset_ready_only: The opposite of 'dataset_ready_only'. Determines whether to exclude sessions marked
-                as 'dataset' integration ready from the output list. Note, when both this and 'dataset_ready_only' are
-                enabled, the 'dataset_ready_only' option takes precedence.
-
-        Returns:
-            The tuple of session IDs matching the filter criteria.
-
-        Raises:
-            ValueError: If the specified animal is not found in the manifest file.
-        """
-    def get_session_info(self, session: str) -> pl.DataFrame:
-        """Returns a Polars DataFrame that stores detailed information for the specified session.
-
-        Since session IDs are unique, it is expected that filtering by session ID is enough to get the requested
-        information.
-
-        Args:
-            session: The ID of the session for which to retrieve the data.
-
-        Returns:
-            A Polars DataFrame with the following columns: 'animal', 'date', 'notes', 'session', 'type', 'complete',
-            'intensity_verification', 'suite2p', 'behavior', 'video', 'dataset'.
-        """
-
-def generate_project_manifest(
-    raw_project_directory: Path, output_directory: Path, processed_data_root: Path | None = None
-) -> None:
-    """Builds and saves the project manifest .feather file under the specified output directory.
-
-    This function evaluates the input project directory and builds the 'manifest' file for the project. The file
-    includes the descriptive information about every session stored inside the input project folder and the state of
-    the session's data processing (which processing pipelines have been applied to each session). The file will be
-    created under the 'output_path' directory and use the following name pattern: ProjectName_manifest.feather.
-
-    Notes:
-        The manifest file is primarily used to capture and move project state information between machines, typically
-        in the context of working with data stored on a remote compute server or cluster. However, it can also be used
-        on a local machine, since an up-to-date manifest file is required to run most data processing pipelines in the
-        lab regardless of the runtime context.
-
-    Args:
-        raw_project_directory: The path to the root project directory used to store raw session data.
-        output_directory: The path to the directory where to save the generated manifest file.
-        processed_data_root: The path to the root directory (volume) used to store processed data for all Sun lab
-            projects if it is different from the parent of the 'raw_project_directory'. Typically, this would be the
-            case on remote compute server(s) and not on local machines.
-    """
-
-def verify_session_checksum(
-    session_path: Path,
-    manager_id: int,
-    create_processed_data_directory: bool = True,
-    processed_data_root: None | Path = None,
-    update_manifest: bool = False,
-) -> None:
-    """Verifies the integrity of the session's raw data by generating the checksum of the raw_data directory and
-    comparing it against the checksum stored in the ax_checksum.txt file.
-
-    Primarily, this function is used to verify data integrity after transferring it from a local PC to the remote
-    server for long-term storage. This function is designed to create the 'verified.bin' marker file if the checksum
-    matches and to remove the 'telomere.bin' and 'verified.bin' marker files if it does not.
-
-    Notes:
-        Removing the telomere.bin marker file from the session's raw_data folder marks the session as incomplete,
-        excluding it from all further automatic processing.
-
-        This function is also used to create the processed data hierarchy on the BioHPC server, when it is called as
-        part of the data preprocessing runtime performed by a data acquisition system.
-
-        Since version 3.1.0, this function also supports (re)generating the processed session's project manifest file,
-        which is used to support further Sun lab data processing pipelines.
-
-    Args:
-        session_path: The path to the session directory to be verified. Note, the input session directory must contain
-            the 'raw_data' subdirectory.
-        manager_id: The xxHash-64 hash-value that specifies the unique identifier of the manager process that
-            manages the integrity verification runtime.
-        create_processed_data_directory: Determines whether to create the processed data hierarchy during runtime.
-        processed_data_root: The root directory where to store the processed data hierarchy. This path has to point to
-            the root directory where to store the processed data from all projects, and it will be automatically
-            modified to include the project name, the animal name, and the session ID.
-        update_manifest: Determines whether to update (regenerate) the project manifest file for the processed
-            session's project. This should always be enabled when working with remote compute server(s) to ensure that
-            the project manifest file contains the most current snapshot of the project's state.
-    """
-
-def resolve_p53_marker(
-    session_path: Path,
-    create_processed_data_directory: bool = True,
-    processed_data_root: None | Path = None,
-    remove: bool = False,
-    update_manifest: bool = False,
-) -> None:
-    """Depending on configuration, either creates or removes the p53.bin marker file for the target session.
-
-    The marker file statically determines whether the session can be targeted by data processing or dataset formation
-    pipelines.
-
-    Notes:
-        Since dataset integration relies on data processing outputs, it is essential to prevent processing pipelines
-        from altering the data while it is integrated into a dataset. The p53.bin marker solves this issue by ensuring
-        that only one type of runtime (processing or dataset integration) is allowed to work with the session.
-
-        For the p53.bin marker to be created, the session must not be undergoing processing. For the p53 marker
-        to be removed, the session must not be undergoing dataset integration.
-
-        Since version 3.1.0, this function also supports (re)generating the processed session's project manifest file,
-        which is used to support further Sun lab data processing pipelines.
-
-    Args:
-        session_path: The path to the session directory for which the p53.bin marker needs to be resolved. Note, the
-            input session directory must contain the 'raw_data' subdirectory.
-        create_processed_data_directory: Determines whether to create the processed data hierarchy during runtime.
-        processed_data_root: The root directory where to store the processed data hierarchy. This path has to point to
-            the root directory where to store the processed data from all projects, and it will be automatically
-            modified to include the project name, the animal name, and the session ID.
-        remove: Determines whether this function is called to create or remove the p53.bin marker.
-        update_manifest: Determines whether to update (regenerate) the project manifest file for the processed
-            session's project. This should always be enabled when working with remote compute server(s) to ensure that
-            the project manifest file contains the most current snapshot of the project's state.
-    """
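A minimal usage sketch of the ProjectManifest API exactly as this 4.0.0 stub declares it; the manifest path and animal ID are placeholders:

```python
# Usage sketch for the ProjectManifest API declared above; paths and IDs are hypothetical.
from pathlib import Path

from sl_shared_assets.tools.project_management_tools import ProjectManifest

manifest = ProjectManifest(manifest_file=Path("/server/manifests/Tyche_manifest.feather"))
print(manifest.animals)  # all unique animal IDs in the project

# Complete sessions for one animal that are ready for dataset integration.
for session in manifest.get_sessions(animal="A7", exclude_incomplete=True, dataset_ready_only=True):
    info = manifest.get_session_info(session)  # detailed per-session data (Polars DataFrame)
    print(info.select(["animal", "session", "type", "complete", "dataset"]))
```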