sl-shared-assets 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sl-shared-assets might be problematic.

Files changed (36)
  1. sl_shared_assets/__init__.py +80 -0
  2. sl_shared_assets/__init__.pyi +73 -0
  3. sl_shared_assets/cli.py +384 -0
  4. sl_shared_assets/cli.pyi +94 -0
  5. sl_shared_assets/data_classes/__init__.py +66 -0
  6. sl_shared_assets/data_classes/__init__.pyi +61 -0
  7. sl_shared_assets/data_classes/configuration_data.py +479 -0
  8. sl_shared_assets/data_classes/configuration_data.pyi +199 -0
  9. sl_shared_assets/data_classes/runtime_data.py +251 -0
  10. sl_shared_assets/data_classes/runtime_data.pyi +145 -0
  11. sl_shared_assets/data_classes/session_data.py +625 -0
  12. sl_shared_assets/data_classes/session_data.pyi +252 -0
  13. sl_shared_assets/data_classes/surgery_data.py +152 -0
  14. sl_shared_assets/data_classes/surgery_data.pyi +89 -0
  15. sl_shared_assets/py.typed +0 -0
  16. sl_shared_assets/server/__init__.py +8 -0
  17. sl_shared_assets/server/__init__.pyi +8 -0
  18. sl_shared_assets/server/job.py +140 -0
  19. sl_shared_assets/server/job.pyi +94 -0
  20. sl_shared_assets/server/server.py +214 -0
  21. sl_shared_assets/server/server.pyi +95 -0
  22. sl_shared_assets/tools/__init__.py +15 -0
  23. sl_shared_assets/tools/__init__.pyi +15 -0
  24. sl_shared_assets/tools/ascension_tools.py +277 -0
  25. sl_shared_assets/tools/ascension_tools.pyi +68 -0
  26. sl_shared_assets/tools/packaging_tools.py +148 -0
  27. sl_shared_assets/tools/packaging_tools.pyi +56 -0
  28. sl_shared_assets/tools/project_management_tools.py +201 -0
  29. sl_shared_assets/tools/project_management_tools.pyi +54 -0
  30. sl_shared_assets/tools/transfer_tools.py +119 -0
  31. sl_shared_assets/tools/transfer_tools.pyi +53 -0
  32. sl_shared_assets-1.0.0.dist-info/METADATA +869 -0
  33. sl_shared_assets-1.0.0.dist-info/RECORD +36 -0
  34. sl_shared_assets-1.0.0.dist-info/WHEEL +4 -0
  35. sl_shared_assets-1.0.0.dist-info/entry_points.txt +8 -0
  36. sl_shared_assets-1.0.0.dist-info/licenses/LICENSE +674 -0
sl_shared_assets/tools/ascension_tools.py
@@ -0,0 +1,277 @@
+ """This module provides tools for translating ('ascending') old Tyche data to use the modern data structure used in the
+ Sun lab. The tools from this module will not work for any other data and also assume that the Tyche data has been
+ preprocessed with an early version of the Sun lab mesoscope processing pipeline. However, this module can be used as
+ an example of how to convert other data formats to match the Sun lab data structure."""
+
+ from pathlib import Path
+ import datetime
+
+ import numpy as np
+ from ataraxis_base_utilities import LogLevel, console, ensure_directory_exists
+ from ataraxis_time.time_helpers import extract_timestamp_from_bytes
+
+ from ..data_classes import SessionData, ProjectConfiguration, get_system_configuration_data
+ from .transfer_tools import transfer_directory
+ from .packaging_tools import calculate_directory_checksum
+
+
+ def _generate_session_name(acquisition_path: Path) -> str:
+     """Generates a session name using the last modification time of a zstack.mat or MotionEstimator.me file.
+
+     This worker function uses one of the motion estimation files stored in each Tyche 'acquisition' subfolder to
+     generate a modern Sun lab timestamp-based session name. This is used to translate the original Tyche session naming
+     pattern into the pattern used by all modern Sun lab projects and pipelines.
+
+     Args:
+         acquisition_path: The absolute path to the target acquisition folder. These folders are found under the 'day'
+             folders for each animal, e.g.: Tyche-A7/2022_01_03/1.
+
+     Returns:
+         The modernized session name.
+     """
+
+     # All well-formed sessions are expected to contain both the zstack.mat and the MotionEstimator.me file.
+     # We use the last modification time from one of these files to infer when the session was carried out. This allows
+     # us to gather the time information, which is missing from the original session naming pattern.
+     source: Path
+     if acquisition_path.joinpath("zstack.mat").exists():
+         source = acquisition_path.joinpath("zstack.mat")
+     elif acquisition_path.joinpath("MotionEstimator.me").exists():
+         source = acquisition_path.joinpath("MotionEstimator.me")
+     else:
+         message = (
+             f"Unable to find zstack.mat or MotionEstimator.me file in the target acquisition subfolder "
+             f"{acquisition_path} of the session {acquisition_path.parent}. Manual intervention is required to ascend "
+             f"the target session folder to the latest Sun lab data format."
+         )
+         console.error(message=message, error=FileNotFoundError)
+         raise FileNotFoundError(message)  # Fall-back to appease mypy
+
+     # Gets the last modified time (available on all platforms) and converts it to a datetime object.
+     mod_time = source.stat().st_mtime
+     mod_datetime = datetime.datetime.fromtimestamp(mod_time)
+
+     # Converts the timestamp to microseconds as uint64, then to an array of 8 uint8 bytes. The array is then
+     # reformatted to match the session name pattern used in the modern Sun lab data pipelines.
+     timestamp_microseconds = np.uint64(int(mod_datetime.timestamp() * 1_000_000))
+     timestamp_bytes = np.array([(timestamp_microseconds >> (8 * i)) & 0xFF for i in range(8)], dtype=np.uint8)
+     stamp = extract_timestamp_from_bytes(timestamp_bytes=timestamp_bytes)
+
+     # Returns the generated session name to caller.
+     return stamp
+
+
+ def _reorganize_data(session_data: SessionData, source_root: Path) -> bool:
+     """Reorganizes and moves the session's data from the source folder in the old Tyche data hierarchy to the raw_data
+     folder in the newly created modern hierarchy.
+
+     This worker function is used to physically rearrange the data from the original Tyche data structure to the
+     new data structure. It both moves the existing files to their new destinations and renames certain files to match
+     the modern naming convention used in the Sun lab.
+
+     Args:
+         session_data: The initialized SessionData instance managing the 'ascended' (modernized) session data hierarchy.
+         source_root: The absolute path to the old Tyche data hierarchy folder that stores the session's data.
+
+     Returns:
+         True if the ascension process was successfully completed. False if the process encountered missing data or
+         otherwise did not go as expected. When this function returns False, the caller requests user intervention
+         to finalize the process manually.
+     """
+
+     # Resolves expected data targets:
+
+     # These files should be present in all well-formed session data folders. Sessions missing any of these files are
+     # flagged for manual user intervention below.
+     zstack_path = source_root.joinpath("zstack.mat")
+     motion_estimator_path = source_root.joinpath("MotionEstimator.me")
+     ops_path = source_root.joinpath("ops.json")
+     mesoscope_frames_path = source_root.joinpath("mesoscope_frames")
+     ax_checksum_path = source_root.joinpath("ax_checksum.txt")
+
+     # These two file types are present for some, but not all, folders. They are less critical than the group of files
+     # above, as the data they store is not currently used during processing.
+     frame_metadata_path = source_root.joinpath("frame_metadata.npz")
+     metadata_path = source_root.joinpath("metadata.json")
+
+     # This tracker is used to mark the session for manual intervention if any expected data is missing from the source
+     # session folder. At the end of this function's runtime, it determines whether the function returns True or False.
+     data_missing = False
+
+     # First, moves the mesoscope TIFF stacks to the newly created session data hierarchy as the mesoscope_data
+     # subfolder.
+     if mesoscope_frames_path.exists():
+         mesoscope_frames_path.rename(session_data.raw_data.mesoscope_data_path)
+     else:
+         data_missing = True
+
+     # Then, moves 'loose' mesoscope-related data files to the mesoscope_data folder.
+     if zstack_path.exists():
+         zstack_path.rename(Path(session_data.raw_data.mesoscope_data_path).joinpath("zstack.mat"))
+     else:
+         data_missing = True
+
+     if motion_estimator_path.exists():
+         motion_estimator_path.rename(Path(session_data.raw_data.mesoscope_data_path).joinpath("MotionEstimator.me"))
+     else:
+         data_missing = True
+
+     if ops_path.exists():
+         ops_path.rename(Path(session_data.raw_data.mesoscope_data_path).joinpath("ops.json"))
+     else:
+         data_missing = True
+
+     # If the variant and invariant metadata files exist, also moves them to the mesoscope data folder and renames the
+     # files to use the latest naming convention. Missing either of these files is not considered a
+     # user-intervention-worthy situation.
+     if frame_metadata_path.exists():
+         frame_metadata_path.rename(
+             Path(session_data.raw_data.mesoscope_data_path).joinpath("frame_variant_metadata.npz")
+         )
+     if metadata_path.exists():
+         metadata_path.rename(Path(session_data.raw_data.mesoscope_data_path).joinpath("frame_invariant_metadata.json"))
+
+     # Loops over all camera video files (using the .avi extension) and moves them to the camera_data folder.
+     videos_found = 0
+     for video in source_root.glob("*.avi"):
+         videos_found += 1
+         video.rename(Path(session_data.raw_data.camera_data_path).joinpath(video.name))
+     if videos_found == 0:
+         data_missing = True
+
+     # Loops over all behavior log files (old GIMBL format) and moves them to the behavior_data folder.
+     logs_found = 0
+     for log in source_root.glob("Log Tyche-* ????-??-?? session *.json"):
+         logs_found += 1
+         log.rename(Path(session_data.raw_data.behavior_data_path).joinpath(log.name))
+     if logs_found == 0:
+         data_missing = True
+
+     # Removes the checksum file if it exists. Due to file name and location changes, the session data folder has to
+     # be re-checksummed after the reorganization anyway, so there is no need to keep the original file.
+     ax_checksum_path.unlink(missing_ok=True)
+
+     # Loops over all remaining contents of the directory.
+     for path in source_root.glob("*"):
+         # At this point, there should be no more subfolders left inside the root directory. If there are more
+         # subfolders, this case requires user intervention.
+         if path.is_dir():
+             data_missing = True
+
+         # All non-subfolder files are moved to the root raw_data directory of the newly created session.
+         else:
+             path.rename(Path(session_data.raw_data.raw_data_path).joinpath(path.name))
+
+     # Session data has been fully reorganized. Depending on whether any data was missing during processing, returns
+     # the boolean flag that tells the caller whether user intervention is required.
+     return not data_missing
+
+
+ def ascend_tyche_data(root_directory: Path) -> None:
+     """Reformats the old Tyche data to use the modern Sun lab layout and metadata files.
+
+     This function converts old Tyche data to the modern data management standard, making the data compatible with the
+     modern Sun lab data workflows.
+
+     Notes:
+         This function is statically written to work with the raw Tyche dataset featured in the OSM manuscript:
+         https://www.nature.com/articles/s41586-024-08548-w. Additionally, it assumes that the dataset has been
+         preprocessed with the early Sun lab mesoscope compression pipeline. The function will not work for any other
+         project or data hierarchy.
+
+         As part of its runtime, the function automatically transfers the ascended session data to the BioHPC server.
+         Since transferring the data over the network is the bottleneck of this pipeline, it runs in a single-threaded
+         mode and is constrained by the communication channel between the local machine and the BioHPC server. Calling
+         this function for a large number of sessions will result in a long processing time due to the network data
+         transfer.
+
+         Since SessionData can only be created on a PC that has a valid acquisition system configuration, this function
+         will only work on a machine that is part of an active Sun lab acquisition system.
+
+     Args:
+         root_directory: The directory that stores one or more Tyche animal folders. This can be conceptualized as the
+             root directory for the Tyche project.
+     """
+     # Generates a (shared) project configuration file.
+     project_configuration = ProjectConfiguration()
+
+     # The acquisition system config resolves most paths and filesystem configuration arguments
+     acquisition_system = get_system_configuration_data()
+     output_root_directory = acquisition_system.paths.root_directory
+     server_root_directory = acquisition_system.paths.server_storage_directory
+
+     # Statically defines project name and local root paths
+     project_name = "Tyche"
+     project_configuration.project_name = project_name
+
+     # Uses placeholder Google Sheet IDs, as the Tyche project did not use Google Sheet processing like our modern
+     # projects do.
+     project_configuration.water_log_sheet_id = "1xFh9Q2zT7pL3mVkJdR8bN6yXoE4wS5aG0cHu2Kf7D3v"
+     project_configuration.surgery_sheet_id = "1xFh9Q2zT7pL3mVkJdR8bN6yXoE4wS5aG0cHu2Kf7D3v"
+
+     # Dumps the project configuration into the 'configuration' subfolder of the Tyche project.
+     configuration_path = output_root_directory.joinpath("Tyche", "configuration", "project_configuration.yaml")
+     ensure_directory_exists(configuration_path)
+     project_configuration.save(path=configuration_path)
+
+     # Assumes that the root directory stores all animal folders to be processed.
+     for animal_folder in root_directory.iterdir():
+         # Each animal folder is named to include the project name and a static animal ID, e.g.: Tyche-A7. This
+         # extracts each animal ID.
+         animal_name = animal_folder.stem.split(sep="-")[1]
+
+         # Under each animal root folder, there are day folders that use YYYY_MM_DD timestamps.
+         for session_folder in animal_folder.iterdir():
+             # Inside each day folder, there are one or more acquisitions (sessions).
+             for acquisition_folder in session_folder.iterdir():
+                 # For each session, we extract the modification time from either (preferentially) the zstack.mat or
+                 # the MotionEstimator.me file. Any session without these files is flagged for additional user
+                 # intervention. This procedure generates timestamp-based session names, analogous to how our modern
+                 # pipeline does it.
+                 session_name = _generate_session_name(acquisition_path=acquisition_folder)
+
+                 # Uses the derived session name and the statically created project configuration file to create the
+                 # session data hierarchy using the output root. This generates a 'standard' Sun lab directory
+                 # structure for the Tyche data.
+                 session_data = SessionData.create(
+                     project_name=project_configuration.project_name,
+                     session_name=session_name,
+                     animal_id=animal_name,
+                     session_type="mesoscope experiment",
+                     experiment_name=None,
+                 )
+
+                 # Moves the data from the old hierarchy to the new hierarchy. If the process runs as expected and
+                 # fully empties the source acquisition folder, destroys the folder. Otherwise, notifies the user that
+                 # the runtime did not fully process the session data and requests intervention.
+                 success = _reorganize_data(session_data, acquisition_folder)
+                 if not success:
+                     message = (
+                         f"Encountered issues when reorganizing {animal_name} session {session_name}. "
+                         f"User intervention is required to finish the data reorganization process for this session."
+                     )
+                     # noinspection PyTypeChecker
+                     console.echo(message=message, level=LogLevel.WARNING)
+                 else:
+                     # Generates the telomere.bin file to mark the session as 'complete'.
+                     session_data.raw_data.telomere_path.touch()
+
+                     # If the local transfer process was successful, generates a new checksum for the moved data.
+                     calculate_directory_checksum(directory=Path(session_data.raw_data.raw_data_path))
+
+                     # Next, copies the data to the BioHPC server for further processing.
+                     transfer_directory(
+                         source=Path(session_data.raw_data.raw_data_path),
+                         destination=Path(
+                             server_root_directory.joinpath(project_name, animal_name, session_name, "raw_data")
+                         ),
+                         verify_integrity=False,
+                     )
+
+                     # Removes the now-empty old session data directory.
+                     acquisition_folder.rmdir()
+
+         # If the loop above removed all acquisition folders, all data for that day has been successfully converted
+         # to use the new session format. Removes the now-empty 'day' folder from the target animal.
+         if not any(session_folder.iterdir()):
+             session_folder.rmdir()
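
The session names above come from the byte-level timestamp encoding in _generate_session_name: the source file's POSIX timestamp is converted to microseconds and packed into eight little-endian uint8 bytes before ataraxis_time's extract_timestamp_from_bytes renders them into a name string. Below is a minimal, self-contained sketch of that encoding and its inverse; decode_timestamp is a hypothetical stand-in for the library decoder, written only to show that the byte packing round-trips.

import datetime

import numpy as np


def encode_timestamp(moment: datetime.datetime) -> np.ndarray:
    # Packs the POSIX timestamp, in microseconds, into 8 little-endian uint8 bytes,
    # mirroring the encoding step in _generate_session_name.
    microseconds = int(moment.timestamp() * 1_000_000)
    return np.array([(microseconds >> (8 * i)) & 0xFF for i in range(8)], dtype=np.uint8)


def decode_timestamp(timestamp_bytes: np.ndarray) -> datetime.datetime:
    # Hypothetical stand-in for extract_timestamp_from_bytes: reassembles the
    # little-endian bytes into a microsecond count and then into a datetime.
    microseconds = sum(int(byte) << (8 * i) for i, byte in enumerate(timestamp_bytes))
    return datetime.datetime.fromtimestamp(microseconds / 1_000_000)


moment = datetime.datetime(2022, 1, 3, 14, 30, 5)
assert decode_timestamp(encode_timestamp(moment)) == moment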
sl_shared_assets/tools/ascension_tools.pyi
@@ -0,0 +1,68 @@
+ from pathlib import Path
+
+ from ..data_classes import (
+     SessionData as SessionData,
+     ProjectConfiguration as ProjectConfiguration,
+     get_system_configuration_data as get_system_configuration_data,
+ )
+ from .transfer_tools import transfer_directory as transfer_directory
+ from .packaging_tools import calculate_directory_checksum as calculate_directory_checksum
+
+ def _generate_session_name(acquisition_path: Path) -> str:
+     """Generates a session name using the last modification time of a zstack.mat or MotionEstimator.me file.
+
+     This worker function uses one of the motion estimation files stored in each Tyche 'acquisition' subfolder to
+     generate a modern Sun lab timestamp-based session name. This is used to translate the original Tyche session naming
+     pattern into the pattern used by all modern Sun lab projects and pipelines.
+
+     Args:
+         acquisition_path: The absolute path to the target acquisition folder. These folders are found under the 'day'
+             folders for each animal, e.g.: Tyche-A7/2022_01_03/1.
+
+     Returns:
+         The modernized session name.
+     """
+
+ def _reorganize_data(session_data: SessionData, source_root: Path) -> bool:
+     """Reorganizes and moves the session's data from the source folder in the old Tyche data hierarchy to the raw_data
+     folder in the newly created modern hierarchy.
+
+     This worker function is used to physically rearrange the data from the original Tyche data structure to the
+     new data structure. It both moves the existing files to their new destinations and renames certain files to match
+     the modern naming convention used in the Sun lab.
+
+     Args:
+         session_data: The initialized SessionData instance managing the 'ascended' (modernized) session data hierarchy.
+         source_root: The absolute path to the old Tyche data hierarchy folder that stores the session's data.
+
+     Returns:
+         True if the ascension process was successfully completed. False if the process encountered missing data or
+         otherwise did not go as expected. When this function returns False, the caller requests user intervention
+         to finalize the process manually.
+     """
+
+ def ascend_tyche_data(root_directory: Path) -> None:
+     """Reformats the old Tyche data to use the modern Sun lab layout and metadata files.
+
+     This function converts old Tyche data to the modern data management standard, making the data compatible with the
+     modern Sun lab data workflows.
+
+     Notes:
+         This function is statically written to work with the raw Tyche dataset featured in the OSM manuscript:
+         https://www.nature.com/articles/s41586-024-08548-w. Additionally, it assumes that the dataset has been
+         preprocessed with the early Sun lab mesoscope compression pipeline. The function will not work for any other
+         project or data hierarchy.
+
+         As part of its runtime, the function automatically transfers the ascended session data to the BioHPC server.
+         Since transferring the data over the network is the bottleneck of this pipeline, it runs in a single-threaded
+         mode and is constrained by the communication channel between the local machine and the BioHPC server. Calling
+         this function for a large number of sessions will result in a long processing time due to the network data
+         transfer.
+
+         Since SessionData can only be created on a PC that has a valid acquisition system configuration, this function
+         will only work on a machine that is part of an active Sun lab acquisition system.
+
+     Args:
+         root_directory: The directory that stores one or more Tyche animal folders. This can be conceptualized as the
+             root directory for the Tyche project.
+     """
sl_shared_assets/tools/packaging_tools.py
@@ -0,0 +1,148 @@
+ """This module provides methods for packaging session runtime data for transmission over the network. The methods from
+ this module work in tandem with methods offered by transfer_tools.py to ensure the integrity of the transferred data.
+ """
+
+ import os
+ from pathlib import Path
+ from functools import partial
+ from concurrent.futures import ProcessPoolExecutor, as_completed
+
+ from tqdm import tqdm
+ import xxhash
+
+ # Defines a 'blacklist' set of files. Primarily, this list contains the service files that may change after the session
+ # data has been acquired. Therefore, it does not make sense to include them in the checksum, as they do not reflect the
+ # data that should remain permanently unchanged. Note: make sure all service files are added to this set!
+ _excluded_files = {
+     "ax_checksum.txt",
+     "ubiquitin.bin",
+     "telomere.bin",
+     "single_day_suite2p.bin",
+     "multi_day_suite2p.bin",
+     "behavior.bin",
+     "dlc.bin",
+     "verified.bin",
+ }
+
+
+ def _calculate_file_checksum(base_directory: Path, file_path: Path) -> tuple[str, bytes]:
+     """Calculates the xxHash3-128 checksum for a single file and its path relative to the base directory.
+
+     This function is passed to the parallel workers used by the calculate_directory_checksum() function that
+     iteratively calculates the checksum for all files inside a directory. Each call to this function returns the
+     checksum for the target file, which includes both the contents of the file and its path relative to the base
+     directory.
+
+     Args:
+         base_directory: The path to the base (root) directory which is being checksummed by the main
+             'calculate_directory_checksum' function.
+         file_path: The absolute path to the target file.
+
+     Returns:
+         A tuple with two elements. The first element is the path to the file relative to the base directory. The second
+         element is the xxHash3-128 checksum that covers the relative path and the contents of the file.
+     """
+     # Initializes the hashsum object.
+     checksum = xxhash.xxh3_128()
+
+     # Encodes the relative path and appends it to the checksum. This ensures that the hashsum reflects both the state
+     # of individual files and the layout of the overall encoded directory structure.
+     relative_path = str(file_path.relative_to(base_directory))
+     checksum.update(relative_path.encode())
+
+     # Extends the checksum to reflect the file data state. Uses 8 MB chunks to avoid excessive RAM hogging at the cost
+     # of slightly reduced throughput.
+     with open(file_path, "rb") as f:
+         for chunk in iter(lambda: f.read(1024 * 1024 * 8), b""):
+             checksum.update(chunk)
+
+     # Returns both the path and the file checksum. Although the relative path information is already encoded in the
+     # hashsum, it is re-encoded at the directory level to protect against future changes to the per-file hashsum
+     # calculation logic. It is extra work, but it improves the overall checksum security.
+     return relative_path, checksum.digest()
+
+
+ def calculate_directory_checksum(
+     directory: Path, num_processes: int | None = None, batch: bool = False, save_checksum: bool = True
+ ) -> str:
+     """Calculates the xxHash3-128 checksum for the input directory, which includes the data of all contained files and
+     the directory structure information.
+
+     This function is used to generate a checksum for the raw_data directory of each experiment or training session.
+     Checksums are used to verify the session data integrity during transmission between the PC that acquired the data
+     and long-term storage locations, such as the Synology NAS or the BioHPC server. The function can be configured to
+     write the generated checksum as a hexadecimal string to the ax_checksum.txt file stored at the highest level of the
+     input directory.
+
+     Note:
+         This function uses multiprocessing to efficiently parallelize checksum calculation for multiple files. In
+         combination with xxHash3, this achieves a significant speedup over more common checksums, such as MD5 and
+         SHA256. Note that xxHash3 is not suitable for security purposes and is only used to ensure data integrity.
+
+         The function notifies the user about the checksum calculation process via the terminal.
+
+         The returned checksum accounts for both the contents of each file and the layout of the input directory
+         structure.
+
+     Args:
+         directory: The Path to the directory to be checksummed.
+         num_processes: The number of CPU processes to use for parallelizing checksum calculation. If set to None, the
+             function defaults to using (logical CPU count - 4).
+         batch: Determines whether the function is called as part of batch-processing multiple directories. This is used
+             to optimize progress reporting to avoid cluttering the terminal.
+         save_checksum: Determines whether the checksum should be saved (written) to a .txt file.
+
+     Returns:
+         The xxHash3-128 checksum for the input directory as a hexadecimal string.
+     """
+     # Determines the number of parallel processes to use.
+     if num_processes is None:
+         num_processes = max(1, os.cpu_count() - 4)  # type: ignore
+
+     # Determines the path to each file inside the input directory structure and sorts them for consistency
+     path: Path
+     files = sorted(
+         path
+         for path in directory.rglob("*")
+         if path.is_file() and path.name not in _excluded_files  # Excludes service files
+     )
+
+     # Precreates the directory checksum
+     checksum = xxhash.xxh3_128()
+
+     # Processes files in parallel
+     with ProcessPoolExecutor(max_workers=num_processes) as executor:
+         # Creates the partial function with fixed base_directory (the first argument of _calculate_file_checksum())
+         process_file = partial(_calculate_file_checksum, directory)
+
+         # Submits all tasks to be executed in parallel
+         # noinspection PyTypeChecker
+         future_to_path = {executor.submit(process_file, file): file for file in files}
+
+         # Collects results as they complete
+         results = []
+         if not batch:
+             with tqdm(
+                 total=len(files), desc=f"Calculating checksum for {Path(*directory.parts[-6:])}", unit="files"
+             ) as pbar:
+                 for future in as_completed(future_to_path):
+                     results.append(future.result())
+                     pbar.update(1)
+         else:
+             # For batch mode, uses a direct list comprehension with as_completed. This avoids the overhead of progress
+             # tracking while maintaining parallel processing, avoiding terminal clutter in batched contexts.
+             results = [future.result() for future in as_completed(future_to_path)]
+
+     # Sorts results for consistency and combines them into the final checksum
+     for file_path, file_checksum in sorted(results):
+         checksum.update(file_path.encode())
+         checksum.update(file_checksum)
+
+     checksum_hexstr = checksum.hexdigest()
+
+     # Writes the hash to ax_checksum.txt in the root directory
+     if save_checksum:
+         checksum_path = directory / "ax_checksum.txt"
+         with open(checksum_path, "w") as f:
+             f.write(checksum_hexstr)
+
+     return checksum_hexstr
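
Because ax_checksum.txt is itself listed in _excluded_files, the stored reference value never feeds back into a recomputed checksum, so post-transfer verification is a simple recompute-and-compare. A minimal verification sketch, with a hypothetical session_root path:

from pathlib import Path

from sl_shared_assets.tools.packaging_tools import calculate_directory_checksum

session_root = Path("/server/Tyche/A7/some_session/raw_data")  # hypothetical location

# Recomputes the checksum without overwriting the stored reference value.
recomputed = calculate_directory_checksum(directory=session_root, save_checksum=False)
stored = (session_root / "ax_checksum.txt").read_text().strip()
if recomputed != stored:
    raise RuntimeError("Checksum mismatch: transferred session data may be corrupted.")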
sl_shared_assets/tools/packaging_tools.pyi
@@ -0,0 +1,56 @@
+ from pathlib import Path
+
+ from _typeshed import Incomplete
+
+ _excluded_files: Incomplete
+
+ def _calculate_file_checksum(base_directory: Path, file_path: Path) -> tuple[str, bytes]:
+     """Calculates the xxHash3-128 checksum for a single file and its path relative to the base directory.
+
+     This function is passed to the parallel workers used by the calculate_directory_checksum() function that
+     iteratively calculates the checksum for all files inside a directory. Each call to this function returns the
+     checksum for the target file, which includes both the contents of the file and its path relative to the base
+     directory.
+
+     Args:
+         base_directory: The path to the base (root) directory which is being checksummed by the main
+             'calculate_directory_checksum' function.
+         file_path: The absolute path to the target file.
+
+     Returns:
+         A tuple with two elements. The first element is the path to the file relative to the base directory. The second
+         element is the xxHash3-128 checksum that covers the relative path and the contents of the file.
+     """
+
+ def calculate_directory_checksum(
+     directory: Path, num_processes: int | None = None, batch: bool = False, save_checksum: bool = True
+ ) -> str:
+     """Calculates the xxHash3-128 checksum for the input directory, which includes the data of all contained files and
+     the directory structure information.
+
+     This function is used to generate a checksum for the raw_data directory of each experiment or training session.
+     Checksums are used to verify the session data integrity during transmission between the PC that acquired the data
+     and long-term storage locations, such as the Synology NAS or the BioHPC server. The function can be configured to
+     write the generated checksum as a hexadecimal string to the ax_checksum.txt file stored at the highest level of the
+     input directory.
+
+     Note:
+         This function uses multiprocessing to efficiently parallelize checksum calculation for multiple files. In
+         combination with xxHash3, this achieves a significant speedup over more common checksums, such as MD5 and
+         SHA256. Note that xxHash3 is not suitable for security purposes and is only used to ensure data integrity.
+
+         The function notifies the user about the checksum calculation process via the terminal.
+
+         The returned checksum accounts for both the contents of each file and the layout of the input directory
+         structure.
+
+     Args:
+         directory: The Path to the directory to be checksummed.
+         num_processes: The number of CPU processes to use for parallelizing checksum calculation. If set to None, the
+             function defaults to using (logical CPU count - 4).
+         batch: Determines whether the function is called as part of batch-processing multiple directories. This is used
+             to optimize progress reporting to avoid cluttering the terminal.
+         save_checksum: Determines whether the checksum should be saved (written) to a .txt file.
+
+     Returns:
+         The xxHash3-128 checksum for the input directory as a hexadecimal string.
+     """