sl-shared-assets 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sl-shared-assets might be problematic. Click here for more details.
- sl_shared_assets/__init__.py +80 -0
- sl_shared_assets/__init__.pyi +73 -0
- sl_shared_assets/cli.py +384 -0
- sl_shared_assets/cli.pyi +94 -0
- sl_shared_assets/data_classes/__init__.py +66 -0
- sl_shared_assets/data_classes/__init__.pyi +61 -0
- sl_shared_assets/data_classes/configuration_data.py +479 -0
- sl_shared_assets/data_classes/configuration_data.pyi +199 -0
- sl_shared_assets/data_classes/runtime_data.py +251 -0
- sl_shared_assets/data_classes/runtime_data.pyi +145 -0
- sl_shared_assets/data_classes/session_data.py +625 -0
- sl_shared_assets/data_classes/session_data.pyi +252 -0
- sl_shared_assets/data_classes/surgery_data.py +152 -0
- sl_shared_assets/data_classes/surgery_data.pyi +89 -0
- sl_shared_assets/py.typed +0 -0
- sl_shared_assets/server/__init__.py +8 -0
- sl_shared_assets/server/__init__.pyi +8 -0
- sl_shared_assets/server/job.py +140 -0
- sl_shared_assets/server/job.pyi +94 -0
- sl_shared_assets/server/server.py +214 -0
- sl_shared_assets/server/server.pyi +95 -0
- sl_shared_assets/tools/__init__.py +15 -0
- sl_shared_assets/tools/__init__.pyi +15 -0
- sl_shared_assets/tools/ascension_tools.py +277 -0
- sl_shared_assets/tools/ascension_tools.pyi +68 -0
- sl_shared_assets/tools/packaging_tools.py +148 -0
- sl_shared_assets/tools/packaging_tools.pyi +56 -0
- sl_shared_assets/tools/project_management_tools.py +201 -0
- sl_shared_assets/tools/project_management_tools.pyi +54 -0
- sl_shared_assets/tools/transfer_tools.py +119 -0
- sl_shared_assets/tools/transfer_tools.pyi +53 -0
- sl_shared_assets-1.0.0.dist-info/METADATA +869 -0
- sl_shared_assets-1.0.0.dist-info/RECORD +36 -0
- sl_shared_assets-1.0.0.dist-info/WHEEL +4 -0
- sl_shared_assets-1.0.0.dist-info/entry_points.txt +8 -0
- sl_shared_assets-1.0.0.dist-info/licenses/LICENSE +674 -0
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""This module provides tools for translating ('ascending') old Tyche data to use the modern data structure used in the
|
|
2
|
+
Sun lab. The tools from this module will not work for any other data and also assume that the Tyche data has been
|
|
3
|
+
preprocessed with an early version of the Sun lab mesoscope processing pipeline. However, this module can be used as
|
|
4
|
+
an example of how to convert other data formats to match the Sun lab data structure."""
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
import datetime
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
from ataraxis_base_utilities import LogLevel, console, ensure_directory_exists
|
|
11
|
+
from ataraxis_time.time_helpers import extract_timestamp_from_bytes
|
|
12
|
+
|
|
13
|
+
from ..data_classes import SessionData, ProjectConfiguration, get_system_configuration_data
|
|
14
|
+
from .transfer_tools import transfer_directory
|
|
15
|
+
from .packaging_tools import calculate_directory_checksum
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _generate_session_name(acquisition_path: Path) -> str:
    """Generates a session name using the last modification time of a zstack.mat or MotionEstimator.me file.

    This worker function uses one of the motion estimation files stored in each Tyche 'acquisition' subfolder to
    generate a modern Sun lab timestamp-based session name. This is used to translate the original Tyche session naming
    pattern into the pattern used by all modern Sun lab projects and pipelines.

    Args:
        acquisition_path: The absolute path to the target acquisition folder. These folders are found under the 'day'
            folders for each animal, e.g.: Tyche-A7/2022_01_03/1.

    Returns:
        The modernized session name.

    Raises:
        FileNotFoundError: If the acquisition folder contains neither the zstack.mat nor the MotionEstimator.me file.
    """

    # All well-formed sessions are expected to contain both the zstack.mat and the MotionEstimator.me file.
    # We use the last modification time from one of these files to infer when the session was carried out. This allows
    # us to gather the time information, which is missing from the original session naming pattern. The candidates
    # are listed in the order of preference: zstack.mat first, MotionEstimator.me as the fallback.
    candidates = (acquisition_path.joinpath("zstack.mat"), acquisition_path.joinpath("MotionEstimator.me"))
    source = next((candidate for candidate in candidates if candidate.exists()), None)
    if source is None:
        message = (
            f"Unable to find zstack.mat or MotionEstimator.me file in the target acquisition subfolder "
            f"{acquisition_path} of the session {acquisition_path.parent}. Manual intervention is required to ascend "
            f"the target session folder to the latest Sun lab data format."
        )
        console.error(message=message, error=FileNotFoundError)
        raise FileNotFoundError(message)  # Fall-back to appease mypy

    # Reads the last modification time as an integer number of nanoseconds since the epoch and floors it to
    # microseconds. Working with integers avoids the precision loss of converting the float seconds value to a
    # datetime object and back.
    timestamp_microseconds = source.stat().st_mtime_ns // 1_000

    # Serializes the microsecond timestamp as 8 little-endian bytes (least significant byte first) stored in an uint8
    # array. The array is then converted to the session name pattern used in the modern Sun lab data pipelines.
    timestamp_bytes = np.array(list(timestamp_microseconds.to_bytes(8, byteorder="little")), dtype=np.uint8)

    # Returns the generated session name to caller.
    return extract_timestamp_from_bytes(timestamp_bytes=timestamp_bytes)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _relocate(source: Path, destination: Path) -> None:
    """Moves the source file or directory to the destination path.

    Unlike Path.rename(), this helper also works when the source and the destination are on different filesystems.
    If the source is a directory and the destination directory already exists, the contents of the source directory
    are moved into the destination directory and the emptied source directory is removed.

    Args:
        source: The path to the file or directory to move.
        destination: The full path the source should have after the move.
    """
    # Imported locally so that the helper does not depend on the module-level import section.
    import shutil

    if source.is_dir() and destination.is_dir():
        # Merges the directories instead of nesting the source inside the destination.
        for child in sorted(source.iterdir()):
            _relocate(child, destination.joinpath(child.name))
        source.rmdir()
    else:
        shutil.move(str(source), str(destination))


def _reorganize_data(session_data: SessionData, source_root: Path) -> bool:
    """Reorganizes and moves the session's data from the source folder in the old Tyche data hierarchy to the raw_data
    folder in the newly created modern hierarchy.

    This worker function is used to physically rearrange the data from the original Tyche data structure to the
    new data structure. It both moves the existing files to their new destinations and renames certain files to match
    the modern naming convention used in the Sun lab.

    Args:
        session_data: The initialized SessionData instance managing the 'ascended' (modernized) session data hierarchy.
        source_root: The absolute path to the old Tyche data hierarchy folder that stores session's data.

    Returns:
        True if the ascension process was successfully completed. False if the process encountered missing data or
        otherwise did not go as expected. When the method returns False, the runtime function requests user intervention
        to finalize the process manually.
    """

    # Resolves the destination folders once, instead of re-wrapping the paths for every moved file.
    raw_data_root = Path(session_data.raw_data.raw_data_path)
    mesoscope_root = Path(session_data.raw_data.mesoscope_data_path)
    camera_root = Path(session_data.raw_data.camera_data_path)
    behavior_root = Path(session_data.raw_data.behavior_data_path)

    # This tracker is used to mark the session for manual intervention if any expected data is missing from the source
    # session folder. At the end of this function's runtime, it determines whether the function returns True or False
    data_missing = False

    # First, moves the mesoscope TIFF stacks to the newly created session data hierarchy as mesoscope_data subfolder
    mesoscope_frames_path = source_root.joinpath("mesoscope_frames")
    if mesoscope_frames_path.exists():
        _relocate(mesoscope_frames_path, mesoscope_root)
    else:
        data_missing = True

    # Then, moves 'loose' mesoscope-related data files to the mesoscope_data folder. These files should be present in
    # all well-formed session data folders, so missing any of them marks the session for user intervention.
    for file_name in ("zstack.mat", "MotionEstimator.me", "ops.json"):
        file_path = source_root.joinpath(file_name)
        if file_path.exists():
            _relocate(file_path, mesoscope_root.joinpath(file_name))
        else:
            data_missing = True

    # If variant and invariant metadata files exist, also moves them to the mesoscope data folder and renames the
    # files to use the latest naming convention. Missing any of these files is not considered a user-intervention-worthy
    # situation.
    optional_files = (
        ("frame_metadata.npz", "frame_variant_metadata.npz"),
        ("metadata.json", "frame_invariant_metadata.json"),
    )
    for old_name, new_name in optional_files:
        old_path = source_root.joinpath(old_name)
        if old_path.exists():
            _relocate(old_path, mesoscope_root.joinpath(new_name))

    # Moves all camera video files (using the .avi extension) to the camera_data folder. The search results are
    # materialized before any file is moved, so the directory is never modified while it is being scanned.
    videos = list(source_root.glob("*.avi"))
    for video in videos:
        _relocate(video, camera_root.joinpath(video.name))
    if not videos:
        data_missing = True

    # Moves all behavior log files (old GIMBL format) to the behavior_data folder.
    logs = list(source_root.glob("Log Tyche-* ????-??-?? session *.json"))
    for log in logs:
        _relocate(log, behavior_root.joinpath(log.name))
    if not logs:
        data_missing = True

    # Removes the checksum file if it exists. Due to file name and location changes, the session data folder has to
    # be re-checksummed after the reorganization anyway, so there is no need to keep the original file.
    source_root.joinpath("ax_checksum.txt").unlink(missing_ok=True)

    # Loops over all remaining contents of the directory.
    for path in list(source_root.glob("*")):
        # At this point, there should be no more subfolders left inside the root directory. If there are more
        # subfolders, this case requires user intervention
        if path.is_dir():
            data_missing = True

        # All non-subfolder files are moved to the root raw_data directory of the newly created session.
        else:
            _relocate(path, raw_data_root.joinpath(path.name))

    # Session data has been fully reorganized. Depending on whether there was any missing data during processing,
    # returns the boolean flag for whether user intervention is required
    return not data_missing
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def ascend_tyche_data(root_directory: Path) -> None:
    """Reformats the old Tyche data to use the modern Sun lab layout and metadata files.

    This function is used to convert old Tyche data to the modern data management standard. This is used to make the
    data compatible with the modern Sun lab data workflows.

    Notes:
        This function is statically written to work with the raw Tyche dataset featured in the OSM manuscript:
        https://www.nature.com/articles/s41586-024-08548-w. Additionally, it assumes that the dataset has been
        preprocessed with the early Sun lab mesoscope compression pipeline. The function will not work for any other
        project or data hierarchy.

        As part of its runtime, the function automatically transfers the ascended session data to the BioHPC server.
        Since transferring the data over the network is the bottleneck of this pipeline, it runs in a single-threaded
        mode and is constrained by the communication channel between the local machine and the BioHPC server. Calling
        this function for a large number of sessions will result in a long processing time due to the network data
        transfer.

        Since SessionData can only be created on a PC that has a valid acquisition system config, this function will
        only work on a machine that is part of an active Sun lab acquisition system.

        Entries that are not directories (for example, stray operating system metadata files) are skipped at every
        level of the animal / day / acquisition hierarchy.

    Args:
        root_directory: The directory that stores one or more Tyche animal folders. This can be conceptualized as the
            root directory for the Tyche project.
    """
    # Generates a (shared) project configuration file.
    project_configuration = ProjectConfiguration()

    # The acquisition system config resolves most paths and filesystem configuration arguments
    acquisition_system = get_system_configuration_data()
    output_root_directory = acquisition_system.paths.root_directory
    server_root_directory = acquisition_system.paths.server_storage_directory

    # Statically defines project name and local root paths
    project_name = "Tyche"
    project_configuration.project_name = project_name

    # Uses nonsensical google sheet IDs. Tyche project did not use Google Sheet processing like our modern projects do.
    project_configuration.water_log_sheet_id = "1xFh9Q2zT7pL3mVkJdR8bN6yXoE4wS5aG0cHu2Kf7D3v"
    project_configuration.surgery_sheet_id = "1xFh9Q2zT7pL3mVkJdR8bN6yXoE4wS5aG0cHu2Kf7D3v"

    # Dumps project configuration into the 'configuration' subfolder of the Tyche project.
    configuration_path = output_root_directory.joinpath(project_name, "configuration", "project_configuration.yaml")
    ensure_directory_exists(configuration_path)
    project_configuration.save(path=configuration_path)

    # Assumes that root directory stores all animal folders to be processed. The directory listings are materialized
    # and sorted before processing, as the loops below remove the processed folders.
    for animal_folder in sorted(root_directory.iterdir()):
        # Stray files stored next to the animal folders cannot be processed as animals.
        if not animal_folder.is_dir():
            continue

        # Each animal folder is named to include project name and a static animal ID, e.g.: Tyche-A7. This extracts each
        # animal ID.
        animal_name = animal_folder.stem.split(sep="-")[1]

        # Under each animal root folder, there are day folders that use YYYY-MM-DD timestamps
        for session_folder in sorted(animal_folder.iterdir()):
            if not session_folder.is_dir():
                continue

            # Inside each day folder, there are one or more acquisitions (sessions)
            for acquisition_folder in sorted(session_folder.iterdir()):
                if not acquisition_folder.is_dir():
                    continue

                # For each session, we extract the modification time from either (preferentially) zstack.mat or
                # MotionEstimator.me file. If neither file exists, the call below raises a FileNotFoundError that
                # requests manual intervention. This procedure generates timestamp-based session names, analogous to
                # how our modern pipeline does it.
                session_name = _generate_session_name(acquisition_path=acquisition_folder)

                # Uses derived session name and the statically created project configuration file to create the
                # session data hierarchy using the output root. This generates a 'standard' Sun lab directory structure
                # for the Tyche data.
                session_data = SessionData.create(
                    project_name=project_configuration.project_name,
                    session_name=session_name,
                    animal_id=animal_name,
                    session_type="mesoscope experiment",
                    experiment_name=None,
                )

                # Moves the data from the old hierarchy to the new hierarchy. If the process runs as expected, and
                # fully empties the source acquisition folder, destroys the folder. Otherwise, notifies the user that
                # the runtime did not fully process the session data and requests intervention.
                success = _reorganize_data(session_data, acquisition_folder)
                if not success:
                    message = (
                        f"Encountered issues when reorganizing {animal_name} session {session_name}. "
                        f"User intervention is required to finish data reorganization process for this session."
                    )
                    # noinspection PyTypeChecker
                    console.echo(message=message, level=LogLevel.WARNING)
                    continue

                # Generates the telomere.bin file to mark the session as 'complete'
                session_data.raw_data.telomere_path.touch()

                # If the local transfer process was successful, generates a new checksum for the moved data
                raw_data_path = Path(session_data.raw_data.raw_data_path)
                calculate_directory_checksum(directory=raw_data_path)

                # Next, copies the data to the BioHPC server for further processing
                transfer_directory(
                    source=raw_data_path,
                    destination=server_root_directory.joinpath(project_name, animal_name, session_name, "raw_data"),
                    verify_integrity=False,
                )

                # Removes the now-empty old session data directory.
                acquisition_folder.rmdir()

            # If the loop above removed all acquisition folders, all data for that day has been successfully converted
            # to use the new session format. Removes the now-empty 'day' folder from the target animal
            if not any(session_folder.iterdir()):
                session_folder.rmdir()
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from ..data_classes import (
|
|
4
|
+
SessionData as SessionData,
|
|
5
|
+
ProjectConfiguration as ProjectConfiguration,
|
|
6
|
+
get_system_configuration_data as get_system_configuration_data,
|
|
7
|
+
)
|
|
8
|
+
from .transfer_tools import transfer_directory as transfer_directory
|
|
9
|
+
from .packaging_tools import calculate_directory_checksum as calculate_directory_checksum
|
|
10
|
+
|
|
11
|
+
def _generate_session_name(acquisition_path: Path) -> str:
    """Builds a timestamp-based session name from the last modification time of a session file.

    The modification time is read from the zstack.mat or MotionEstimator.me file stored inside the Tyche
    'acquisition' subfolder. The resulting name follows the timestamp-based session naming pattern used by all modern
    Sun lab projects and pipelines, replacing the original Tyche session naming pattern.

    Args:
        acquisition_path: The absolute path to the target acquisition folder. Acquisition folders are stored under
            the 'day' folders of each animal, e.g.: Tyche-A7/2022_01_03/1.

    Returns:
        The modernized session name.
    """
|
|
25
|
+
|
|
26
|
+
def _reorganize_data(session_data: SessionData, source_root: Path) -> bool:
    """Moves the session's data from the old Tyche source folder into the raw_data folder of the modern hierarchy.

    This worker function physically rearranges the data from the original Tyche data structure to the new data
    structure. In addition to moving the files to their new destinations, it renames certain files to match the
    modern naming convention used in the Sun lab.

    Args:
        session_data: The initialized SessionData instance that manages the 'ascended' (modernized) session data
            hierarchy.
        source_root: The absolute path to the folder of the old Tyche data hierarchy that stores the session's data.

    Returns:
        True if the data was reorganized successfully. False if some of the expected data was missing or the process
        otherwise did not go as expected. When False is returned, the runtime function requests user intervention to
        finalize the process manually.
    """
|
|
43
|
+
|
|
44
|
+
def ascend_tyche_data(root_directory: Path) -> None:
    """Converts the old Tyche data to the modern Sun lab layout and metadata files.

    The conversion brings the old Tyche data up to the modern data management standard, which makes it compatible
    with the modern Sun lab data workflows.

    Notes:
        This function is statically written for the raw Tyche dataset featured in the OSM manuscript:
        https://www.nature.com/articles/s41586-024-08548-w. It also assumes that the dataset has been preprocessed
        with the early Sun lab mesoscope compression pipeline. It will not work for any other project or data
        hierarchy.

        The function automatically transfers the ascended session data to the BioHPC server as part of its runtime.
        The network transfer is the bottleneck of the pipeline, so the function runs in a single-threaded mode and is
        limited by the communication channel between the local machine and the BioHPC server. Expect long processing
        times when the function is called for a large number of sessions.

        SessionData can only be created on a PC that has a valid acquisition system config. Therefore, this function
        only works on a machine that is part of an active Sun lab acquisition system.

    Args:
        root_directory: The directory that stores one or more Tyche animal folders. Conceptually, this is the root
            directory of the Tyche project.
    """
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""This module provides methods for packaging session runtime data for transmission over the network. The methods from
|
|
2
|
+
this module work in tandem with methods offered by transfer_tools.py to ensure the integrity of the transferred data.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from functools import partial
|
|
8
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
9
|
+
|
|
10
|
+
from tqdm import tqdm
|
|
11
|
+
import xxhash
|
|
12
|
+
|
|
13
|
+
# Defines a 'blacklist' set of files. Primarily, this set contains the service files that may change after the session
|
|
14
|
+
# data has been acquired. Therefore, it does not make sense to include them in the checksum, as they do not reflect the
|
|
15
|
+
# data that should remain permanently unchanged. Note, make sure all service files are added to this set!
|
|
16
|
+
# File names that are skipped when the directory checksum is calculated. Entries are kept in alphabetical order.
_excluded_files = {
    "ax_checksum.txt",
    "behavior.bin",
    "dlc.bin",
    "multi_day_suite2p.bin",
    "single_day_suite2p.bin",
    "telomere.bin",
    "ubiquitin.bin",
    "verified.bin",
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _calculate_file_checksum(base_directory: Path, file_path: Path) -> tuple[str, bytes]:
    """Computes the xxHash3-128 checksum of a single file together with its path relative to the base directory.

    This function is executed by the parallel workers of the calculate_directory_checksum() function, which
    checksums every file inside a directory. The checksum produced by each call covers both the contents of the
    target file and its path relative to the base directory.

    Args:
        base_directory: The path to the base (root) directory that is being checksummed by the main
            'calculate_directory_checksum' function.
        file_path: The absolute path to the target file.

    Returns:
        A tuple with two elements. The first element is the path to the file relative to the base directory. The second
        element is the xxHash3-128 checksum that covers the relative path and the contents of the file.
    """
    # Files are read in 8 MB chunks to avoid excessive RAM use at the cost of slightly reduced throughput.
    chunk_size = 1024 * 1024 * 8

    hasher = xxhash.xxh3_128()

    # Feeds the encoded relative path to the hasher first, so that the digest reflects both the state of the file
    # and its location inside the checksummed directory structure.
    # NOTE(review): str() renders the path with the platform's separator, so the digest of the same directory may
    # differ between Windows and POSIX systems - confirm this is acceptable.
    relative_path = str(file_path.relative_to(base_directory))
    hasher.update(relative_path.encode())

    # Extends the digest with the contents of the file. The loop ends when read() returns an empty bytes object.
    with open(file_path, "rb") as stream:
        while chunk := stream.read(chunk_size):
            hasher.update(chunk)

    # The relative path is returned alongside the digest, although it is already encoded in it. The caller re-encodes
    # the path at the directory level to protect against future changes to the per-file checksum calculation logic.
    return relative_path, hasher.digest()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def calculate_directory_checksum(
    directory: Path, num_processes: int | None = None, batch: bool = False, save_checksum: bool = True
) -> str:
    """Calculates xxHash3-128 checksum for the input directory, which includes the data of all contained files and
    the directory structure information.

    This function is used to generate a checksum for the raw_data directory of each experiment or training session.
    Checksums are used to verify the session data integrity during transmission between the PC that acquired the data
    and long-term storage locations, such as the Synology NAS or the BioHPC server. The function can be configured to
    write the generated checksum as a hexadecimal string to the ax_checksum.txt file stored at the highest level of the
    input directory.

    Note:
        This method uses multiprocessing to efficiently parallelize checksum calculation for multiple files. In
        combination with xxHash3, this achieves a significant speedup over more common checksums, such as MD5 and
        SHA256. Note that xxHash3 is not suitable for security purposes and is only used to ensure data integrity.

        The method notifies the user about the checksum calculation process via the terminal.

        The returned checksum accounts for both the contents of each file and the layout of the input directory
        structure.

    Args:
        directory: The Path to the directory to be checksummed.
        num_processes: The number of CPU processes to use for parallelizing checksum calculation. If set to None, the
            function defaults to using (logical CPU count - 4), but never fewer than one process.
        batch: Determines whether the function is called as part of batch-processing multiple directories. This is used
            to optimize progress reporting to avoid cluttering the terminal.
        save_checksum: Determines whether the checksum should be saved (written to) a .txt file.

    Returns:
        The xxHash3-128 checksum for the input directory as a hexadecimal string.
    """
    # Determines the number of parallel processes to use. os.cpu_count() returns None when the number of CPUs cannot
    # be determined, so this falls back to a single process in that case.
    if num_processes is None:
        num_processes = max(1, (os.cpu_count() or 1) - 4)

    # Determines the path to each file inside the input directory structure and sorts them for consistency. Service
    # files listed in the _excluded_files set are not included in the checksum.
    files = sorted(path for path in directory.rglob("*") if path.is_file() and path.name not in _excluded_files)

    # Precreates the directory checksum
    checksum = xxhash.xxh3_128()

    # Process files in parallel
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        # Creates the partial function with fixed base_directory (the first argument of _calculate_file_checksum())
        process_file = partial(_calculate_file_checksum, directory)

        # Submits all tasks to be executed in parallel
        # noinspection PyTypeChecker
        futures = [executor.submit(process_file, file) for file in files]

        # Collects results as they complete. In batch mode, the progress bar is disabled to avoid cluttering the
        # terminal, while the files are still processed in parallel.
        results = []
        with tqdm(
            total=len(files),
            desc=f"Calculating checksum for {Path(*directory.parts[-6:])}",
            unit="files",
            disable=batch,
        ) as pbar:
            for future in as_completed(futures):
                results.append(future.result())
                pbar.update(1)

    # Sorts results for consistency and combines them into the final checksum. The order in which the workers finish
    # is not deterministic, so sorting is required for the checksum to be reproducible.
    for file_path, file_checksum in sorted(results):
        checksum.update(file_path.encode())
        checksum.update(file_checksum)

    checksum_hexstr = checksum.hexdigest()

    # Writes the hash to ax_checksum.txt in the root directory
    if save_checksum:
        directory.joinpath("ax_checksum.txt").write_text(checksum_hexstr)

    return checksum_hexstr
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from _typeshed import Incomplete
|
|
4
|
+
|
|
5
|
+
# The runtime module defines this name as a set of file name strings.
_excluded_files: set[str]
|
|
6
|
+
|
|
7
|
+
def _calculate_file_checksum(base_directory: Path, file_path: Path) -> tuple[str, bytes]:
    """Computes the xxHash3-128 checksum of a single file together with its path relative to the base directory.

    This function is executed by the parallel workers of the calculate_directory_checksum() function, which
    checksums every file inside a directory. The checksum produced by each call covers both the contents of the
    target file and its path relative to the base directory.

    Args:
        base_directory: The path to the base (root) directory that is being checksummed by the main
            'calculate_directory_checksum' function.
        file_path: The absolute path to the target file.

    Returns:
        A tuple with two elements. The first element is the path to the file relative to the base directory. The second
        element is the xxHash3-128 checksum that covers the relative path and the contents of the file.
    """
|
|
23
|
+
|
|
24
|
+
def calculate_directory_checksum(
    directory: Path, num_processes: int | None = None, batch: bool = False, save_checksum: bool = True
) -> str:
    """Computes the xxHash3-128 checksum of the input directory, covering the data of all contained files and the
    directory structure information.

    The checksum is generated for the raw_data directory of each experiment or training session. It is used to verify
    the integrity of the session data when it is transmitted between the PC that acquired the data and long-term
    storage locations, such as the Synology NAS or the BioHPC server. Optionally, the function writes the checksum as
    a hexadecimal string to the ax_checksum.txt file stored at the highest level of the input directory.

    Note:
        Checksum calculation is parallelized across multiple files using multiprocessing. Together with xxHash3, this
        is significantly faster than more common checksums, such as MD5 and SHA256. xxHash3 is not suitable for
        security purposes and is only used to ensure data integrity.

        The method notifies the user about the checksum calculation process via the terminal.

        The returned checksum reflects both the contents of each file and the layout of the input directory
        structure.

    Args:
        directory: The Path to the directory to be checksummed.
        num_processes: The number of CPU processes to use for parallelizing checksum calculation. If set to None, the
            function defaults to using (logical CPU count - 4).
        batch: Determines whether the function is called as part of batch-processing multiple directories. This is used
            to optimize progress reporting to avoid cluttering the terminal.
        save_checksum: Determines whether the checksum should be saved (written to) a .txt file.

    Returns:
        The xxHash3-128 checksum for the input directory as a hexadecimal string.
    """
|