sl-shared-assets 4.0.1__py3-none-any.whl → 5.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sl-shared-assets might be problematic. Click here for more details.

Files changed (39) hide show
  1. sl_shared_assets/__init__.py +48 -41
  2. sl_shared_assets/command_line_interfaces/__init__.py +3 -0
  3. sl_shared_assets/command_line_interfaces/configure.py +173 -0
  4. sl_shared_assets/command_line_interfaces/manage.py +226 -0
  5. sl_shared_assets/data_classes/__init__.py +33 -32
  6. sl_shared_assets/data_classes/configuration_data.py +267 -79
  7. sl_shared_assets/data_classes/session_data.py +226 -289
  8. sl_shared_assets/server/__init__.py +24 -4
  9. sl_shared_assets/server/job.py +6 -7
  10. sl_shared_assets/server/pipeline.py +585 -0
  11. sl_shared_assets/server/server.py +57 -25
  12. sl_shared_assets/tools/__init__.py +9 -8
  13. sl_shared_assets/tools/packaging_tools.py +14 -25
  14. sl_shared_assets/tools/project_management_tools.py +602 -523
  15. sl_shared_assets/tools/transfer_tools.py +88 -23
  16. {sl_shared_assets-4.0.1.dist-info → sl_shared_assets-5.0.1.dist-info}/METADATA +46 -203
  17. sl_shared_assets-5.0.1.dist-info/RECORD +23 -0
  18. sl_shared_assets-5.0.1.dist-info/entry_points.txt +3 -0
  19. sl_shared_assets/__init__.pyi +0 -91
  20. sl_shared_assets/cli.py +0 -501
  21. sl_shared_assets/cli.pyi +0 -106
  22. sl_shared_assets/data_classes/__init__.pyi +0 -75
  23. sl_shared_assets/data_classes/configuration_data.pyi +0 -235
  24. sl_shared_assets/data_classes/runtime_data.pyi +0 -157
  25. sl_shared_assets/data_classes/session_data.pyi +0 -379
  26. sl_shared_assets/data_classes/surgery_data.pyi +0 -89
  27. sl_shared_assets/server/__init__.pyi +0 -11
  28. sl_shared_assets/server/job.pyi +0 -205
  29. sl_shared_assets/server/server.pyi +0 -298
  30. sl_shared_assets/tools/__init__.pyi +0 -19
  31. sl_shared_assets/tools/ascension_tools.py +0 -265
  32. sl_shared_assets/tools/ascension_tools.pyi +0 -68
  33. sl_shared_assets/tools/packaging_tools.pyi +0 -58
  34. sl_shared_assets/tools/project_management_tools.pyi +0 -239
  35. sl_shared_assets/tools/transfer_tools.pyi +0 -53
  36. sl_shared_assets-4.0.1.dist-info/RECORD +0 -36
  37. sl_shared_assets-4.0.1.dist-info/entry_points.txt +0 -7
  38. {sl_shared_assets-4.0.1.dist-info → sl_shared_assets-5.0.1.dist-info}/WHEEL +0 -0
  39. {sl_shared_assets-4.0.1.dist-info → sl_shared_assets-5.0.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,265 +0,0 @@
1
- """This module provides tools for translating ('ascending') old Tyche data to use the modern data structure used in the
2
- Sun lab. The tools from this module will not work for any other data and also assume that the Tyche data has been
3
- preprocessed with an early version of the Sun lab mesoscope processing pipeline. However, this module can be used as
4
- an example for how to convert other data formats to match the Sun lab data structure."""
5
-
6
- from pathlib import Path
7
- import datetime
8
-
9
- import numpy as np
10
- from ataraxis_base_utilities import LogLevel, console
11
- from ataraxis_time.time_helpers import extract_timestamp_from_bytes
12
-
13
- from ..data_classes import SessionData, SessionTypes, get_system_configuration_data
14
- from .transfer_tools import transfer_directory
15
- from .packaging_tools import calculate_directory_checksum
16
-
17
-
18
- def _generate_session_name(acquisition_path: Path) -> str:
19
- """Generates a session name using the last modification time of a zstack.mat or MotionEstimator.me file.
20
-
21
- This worker function uses one of the motion estimation files stored in each Tyche 'acquisition' subfolder to
22
- generate a modern Sun lab timestamp-based session name. This is used to translate the original Tyche session naming
23
- pattern into the pattern used by all modern Sun lab projects and pipelines.
24
-
25
- Args:
26
- acquisition_path: The absolute path to the target acquisition folder. These folders are found under the 'day'
27
- folders for each animal, e.g.: Tyche-A7/2022_01_03/1.
28
-
29
- Returns:
30
- The modernized session name.
31
- """
32
-
33
- # All well-formed sessions are expected to contain both the zstack.mat and the MotionEstimator.me file.
34
- # We use the last modification time from one of these files to infer when the session was carried out. This allows
35
- # us to gather the time information, which is missing from the original session naming pattern.
36
- source: Path
37
- if acquisition_path.joinpath("zstack.mat").exists():
38
- source = acquisition_path.joinpath("zstack.mat")
39
- elif acquisition_path.joinpath("MotionEstimator.me").exists():
40
- source = acquisition_path.joinpath("MotionEstimator.me")
41
- else:
42
- message = (
43
- f"Unable to find zstack.mat or MotionEstimator.me file in the target acquisition subfolder "
44
- f"{acquisition_path} of the session {acquisition_path.parent}. Manual intervention is required to ascend "
45
- f"the target session folder to the latest Sun lab data format."
46
- )
47
- console.error(message=message, error=FileNotFoundError)
48
- raise FileNotFoundError(message) # Fall-back to appease mypy
49
-
50
- # Gets the last modified time (available on all platforms) and converts it to a naive, local-time datetime object.
51
- mod_time = source.stat().st_mtime
52
- mod_datetime = datetime.datetime.fromtimestamp(mod_time)
53
-
54
- # Converts the timestamp to microseconds as uint64, then to an array of 8 uint8 bytes. The array is then reformatted
55
- # to match the session name pattern used in the modern Sun lab data pipelines.
56
- timestamp_microseconds = np.uint64(int(mod_datetime.timestamp() * 1_000_000))
57
- timestamp_bytes = np.array([(timestamp_microseconds >> (8 * i)) & 0xFF for i in range(8)], dtype=np.uint8)
58
- stamp = extract_timestamp_from_bytes(timestamp_bytes=timestamp_bytes)
59
-
60
- # Returns the generated session name to the caller.
61
- return stamp
62
-
63
-
64
- def _reorganize_data(session_data: SessionData, source_root: Path) -> bool:
65
- """Reorganizes and moves the session's data from the source folder in the old Tyche data hierarchy to the raw_data
66
- folder in the newly created modern hierarchy.
67
-
68
- This worker function is used to physically rearrange the data from the original Tyche data structure to the
69
- new data structure. It both moves the existing files to their new destinations and renames certain files to match
70
- the modern naming convention used in the Sun lab.
71
-
72
- Args:
73
- session_data: The initialized SessionData instance managing the 'ascended' (modernized) session data hierarchy.
74
- source_root: The absolute path to the old Tyche data hierarchy folder that stores session's data.
75
-
76
- Returns:
77
- True if the ascension process was successfully completed. False if the process encountered missing data or
78
- otherwise did not go as expected. When the method returns False, the runtime function requests user intervention
79
- to finalize the process manually.
80
- """
81
-
82
- # Resolves expected data targets:
83
-
84
- # These files should be present in all well-formed session data folders. While not all session folders are
85
- # well-formed, we will likely exclude any non-well-formed folders from processing.
86
- zstack_path = source_root.joinpath("zstack.mat")
87
- motion_estimator_path = source_root.joinpath("MotionEstimator.me")
88
- ops_path = source_root.joinpath("ops.json")
89
- mesoscope_frames_path = source_root.joinpath("mesoscope_frames")
90
- ax_checksum_path = source_root.joinpath("ax_checksum.txt")
91
-
92
- # These two file types are present for some, but not all folders. They are not as important as the files mentioned
93
- # above, though, as, currently, the data stored in these files is not used during processing.
94
- frame_metadata_path = source_root.joinpath("frame_metadata.npz")
95
- metadata_path = source_root.joinpath("metadata.json")
96
-
97
- # This tracker is used to mark the session for manual intervention if any expected data is missing from the source
98
- # session folder. At the end of this function's runtime, it determines whether the function returns True or False
99
- data_missing = False
100
-
101
- # First, moves the mesoscope TIFF stacks to the newly created session data hierarchy as mesoscope_data subfolder
102
- if mesoscope_frames_path.exists():
103
- mesoscope_frames_path.rename(session_data.raw_data.mesoscope_data_path)
104
- else:
105
- data_missing = True
106
-
107
- # Then, moves 'loose' mesoscope-related data files to the mesoscope_data folder.
108
- if zstack_path.exists():
109
- zstack_path.rename(Path(session_data.raw_data.mesoscope_data_path).joinpath("zstack.mat"))
110
- else:
111
- data_missing = True
112
-
113
- if motion_estimator_path.exists():
114
- motion_estimator_path.rename(Path(session_data.raw_data.mesoscope_data_path).joinpath("MotionEstimator.me"))
115
- else:
116
- data_missing = True
117
-
118
- if ops_path.exists():
119
- ops_path.rename(Path(session_data.raw_data.mesoscope_data_path).joinpath("ops.json"))
120
- else:
121
- data_missing = True
122
-
123
- # If variant and invariant metadata files exist, also moves them to the mesoscope data folder and renames the
124
- # files to use the latest naming convention. Missing any of these files is not considered a user-intervention-worthy
125
- # situation.
126
- if frame_metadata_path.exists():
127
- frame_metadata_path.rename(
128
- Path(session_data.raw_data.mesoscope_data_path).joinpath("frame_variant_metadata.npz")
129
- )
130
- if metadata_path.exists():
131
- metadata_path.rename(Path(session_data.raw_data.mesoscope_data_path).joinpath("frame_invariant_metadata.json"))
132
-
133
- # Loops over all camera video files (using the .avi extension) and moves them to the camera_data folder.
134
- videos_found = 0
135
- for video in source_root.glob("*.avi"):
136
- videos_found += 1
137
- video.rename(Path(session_data.raw_data.camera_data_path).joinpath(video.name))
138
- if videos_found == 0:
139
- data_missing = True
140
-
141
- # Loops over all behavior log files (old GIMBL format) and moves them to the behavior_data folder.
142
- logs_found = 0
143
- for log in source_root.glob("Log Tyche-* ????-??-?? session *.json"):
144
- logs_found += 1
145
- log.rename(Path(session_data.raw_data.behavior_data_path).joinpath(log.name))
146
- if logs_found == 0:
147
- data_missing = True
148
-
149
- # Removes the checksum file if it exists. Due to file name and location changes, the session data folder has to
150
- # be re-checksummed after the reorganization anyway, so there is no need to keep the original file.
151
- ax_checksum_path.unlink(missing_ok=True)
152
-
153
- # Loops over all remaining contents of the directory.
154
- for path in source_root.glob("*"):
155
- # At this point, there should be no more subfolders left inside the root directory. If there are more
156
- # subfolders, this case requires user intervention
157
- if path.is_dir():
158
- data_missing = True
159
-
160
- # All non-subfolder files are moved to the root raw_data directory of the newly created session.
161
- else:
162
- path.rename(Path(session_data.raw_data.raw_data_path).joinpath(path.name))
163
-
164
- # Session data has been fully reorganized. Depending on whether there was any missing data during processing,
165
- # returns the boolean flag for whether user intervention is required
166
- if data_missing:
167
- return False
168
- else:
169
- return True
170
-
171
-
172
- def ascend_tyche_data(root_directory: Path) -> None:
173
- """Reformats the old Tyche data to use the modern Sun lab layout and metadata files.
174
-
175
- This function is used to convert old Tyche data to the modern data management standard. This is used to make the
176
- data compatible with the modern Sun lab data workflows.
177
-
178
- Notes:
179
- This function is statically written to work with the raw Tyche dataset featured in the OSM manuscript:
180
- https://www.nature.com/articles/s41586-024-08548-w. Additionally, it assumes that the dataset has been
181
- preprocessed with the early Sun lab mesoscope compression pipeline. The function will not work for any other
182
- project or data hierarchy.
183
-
184
- As part of its runtime, the function automatically transfers the ascended session data to the BioHPC server.
185
- Since transferring the data over the network is the bottleneck of this pipeline, it runs in a single-threaded
186
- mode and is constrained by the communication channel between the local machine and the BioHPC server. Calling
187
- this function for a large number of sessions will result in a long processing time due to the network data
188
- transfer.
189
-
190
- Since SessionData can only be created on a PC that has a valid acquisition system config, this function will
191
- only work on a machine that is part of an active Sun lab acquisition system.
192
-
193
- Args:
194
- root_directory: The directory that stores one or more Tyche animal folders. This can be conceptualized as the
195
- root directory for the Tyche project.
196
- """
197
- # The acquisition system config resolves most paths and filesystem configuration arguments
198
- acquisition_system = get_system_configuration_data()
199
- server_root_directory = acquisition_system.paths.server_storage_directory
200
-
201
- # Statically defines the project name
202
- project_name = "Tyche"
203
-
204
- # Assumes that the root directory stores all animal folders to be processed
205
- for animal_folder in root_directory.iterdir():
206
- # Each animal folder is named to include a project name and a static animal ID, e.g.: Tyche-A7. This extracts
207
- # each animal ID.
208
- animal_name = animal_folder.stem.split(sep="-")[1]
209
-
210
- # Under each animal root folder, there are day folders that use YYYY-MM-DD timestamps
211
- for session_folder in animal_folder.iterdir():
212
- # Inside each day folder, there are one or more acquisitions (sessions)
213
- for acquisition_folder in session_folder.iterdir():
214
- # For each session, we extract the modification time from either (preferentially) zstack.mat or
215
- # MotionEstimator.me file. Any session without these files is flagged for additional user intervention.
216
- # This procedure generates timestamp-based session names, analogous to how our modern pipeline does it.
217
- session_name = _generate_session_name(acquisition_path=acquisition_folder)
218
-
219
- # Uses derived session name and the derived project name to create the session data hierarchy using the
220
- # output root. This generates a 'standard' Sun lab directory structure for the Tyche data.
221
- session_data = SessionData.create(
222
- project_name=project_name,
223
- session_name=session_name,
224
- animal_id=animal_name,
225
- session_type=SessionTypes.MESOSCOPE_EXPERIMENT,
226
- experiment_name=None,
227
- )
228
-
229
- # Since this runtime reprocesses already acquired data, marks the session as fully initialized.
230
- session_data.runtime_initialized()
231
-
232
- # Moves the data from the old hierarchy to the new hierarchy. If the process runs as expected, and
233
- # fully empties the source acquisition folder, it destroys the folder. Otherwise, notifies the user that
234
- # the runtime did not fully process the session data and requests intervention.
235
- success = _reorganize_data(session_data, acquisition_folder)
236
- if not success:
237
- message = (
238
- f"Encountered issues when reorganizing {animal_name} session {session_name}. "
239
- f"User intervention is required to finish data reorganization process for this session."
240
- )
241
- # noinspection PyTypeChecker
242
- console.echo(message=message, level=LogLevel.WARNING)
243
- else:
244
- # Generates the telomere.bin file to mark the session as 'complete'
245
- session_data.raw_data.telomere_path.touch()
246
-
247
- # If the local transfer process was successful, generates a new checksum for the moved data
248
- calculate_directory_checksum(directory=Path(session_data.raw_data.raw_data_path))
249
-
250
- # Next, copies the data to the BioHPC server for further processing
251
- transfer_directory(
252
- source=Path(session_data.raw_data.raw_data_path),
253
- destination=Path(
254
- server_root_directory.joinpath(project_name, animal_name, session_name, "raw_data")
255
- ),
256
- verify_integrity=False,
257
- )
258
-
259
- # Removes the now-empty old session data directory.
260
- acquisition_folder.rmdir()
261
-
262
- # If the loop above removed all acquisition folders, all data for that day has been successfully converted
263
- # to use the new session format. Removes the now-empty 'day' folder from the target animal
264
- if len([folder for folder in session_folder.iterdir()]) == 0:
265
- session_folder.rmdir()
@@ -1,68 +0,0 @@
1
- from pathlib import Path
2
-
3
- from ..data_classes import (
4
- SessionData as SessionData,
5
- SessionTypes as SessionTypes,
6
- get_system_configuration_data as get_system_configuration_data,
7
- )
8
- from .transfer_tools import transfer_directory as transfer_directory
9
- from .packaging_tools import calculate_directory_checksum as calculate_directory_checksum
10
-
11
- def _generate_session_name(acquisition_path: Path) -> str:
12
- """Generates a session name using the last modification time of a zstack.mat or MotionEstimator.me file.
13
-
14
- This worker function uses one of the motion estimation files stored in each Tyche 'acquisition' subfolder to
15
- generate a modern Sun lab timestamp-based session name. This is used to translate the original Tyche session naming
16
- pattern into the pattern used by all modern Sun lab projects and pipelines.
17
-
18
- Args:
19
- acquisition_path: The absolute path to the target acquisition folder. These folders are found under the 'day'
20
- folders for each animal, e.g.: Tyche-A7/2022_01_03/1.
21
-
22
- Returns:
23
- The modernized session name.
24
- """
25
-
26
- def _reorganize_data(session_data: SessionData, source_root: Path) -> bool:
27
- """Reorganizes and moves the session's data from the source folder in the old Tyche data hierarchy to the raw_data
28
- folder in the newly created modern hierarchy.
29
-
30
- This worker function is used to physically rearrange the data from the original Tyche data structure to the
31
- new data structure. It both moves the existing files to their new destinations and renames certain files to match
32
- the modern naming convention used in the Sun lab.
33
-
34
- Args:
35
- session_data: The initialized SessionData instance managing the 'ascended' (modernized) session data hierarchy.
36
- source_root: The absolute path to the old Tyche data hierarchy folder that stores session's data.
37
-
38
- Returns:
39
- True if the ascension process was successfully completed. False if the process encountered missing data or
40
- otherwise did not go as expected. When the method returns False, the runtime function requests user intervention
41
- to finalize the process manually.
42
- """
43
-
44
- def ascend_tyche_data(root_directory: Path) -> None:
45
- """Reformats the old Tyche data to use the modern Sun lab layout and metadata files.
46
-
47
- This function is used to convert old Tyche data to the modern data management standard. This is used to make the
48
- data compatible with the modern Sun lab data workflows.
49
-
50
- Notes:
51
- This function is statically written to work with the raw Tyche dataset featured in the OSM manuscript:
52
- https://www.nature.com/articles/s41586-024-08548-w. Additionally, it assumes that the dataset has been
53
- preprocessed with the early Sun lab mesoscope compression pipeline. The function will not work for any other
54
- project or data hierarchy.
55
-
56
- As part of its runtime, the function automatically transfers the ascended session data to the BioHPC server.
57
- Since transferring the data over the network is the bottleneck of this pipeline, it runs in a single-threaded
58
- mode and is constrained by the communication channel between the local machine and the BioHPC server. Calling
59
- this function for a large number of sessions will result in a long processing time due to the network data
60
- transfer.
61
-
62
- Since SessionData can only be created on a PC that has a valid acquisition system config, this function will
63
- only work on a machine that is part of an active Sun lab acquisition system.
64
-
65
- Args:
66
- root_directory: The directory that stores one or more Tyche animal folders. This can be conceptualized as the
67
- root directory for the Tyche project.
68
- """
@@ -1,58 +0,0 @@
1
- from pathlib import Path
2
-
3
- from _typeshed import Incomplete
4
-
5
- from ..data_classes import TrackerFileNames as TrackerFileNames
6
-
7
- _excluded_files: Incomplete
8
-
9
- def _calculate_file_checksum(base_directory: Path, file_path: Path) -> tuple[str, bytes]:
10
- """Calculates xxHash3-128 checksum for a single file and its path relative to the base directory.
11
-
12
- This function is passed to parallel workers used by the calculate_directory_checksum() function that iteratively
13
- calculates the checksum for all files inside a directory. Each call to this function returns the checksum for the
14
- target file, which includes both the contents of the file and its path relative to the base directory.
15
-
16
- Args:
17
- base_directory: The path to the base (root) directory which is being checksummed by the main
18
- 'calculate_directory_checksum' function.
19
- file_path: The absolute path to the target file.
20
-
21
- Returns:
22
- A tuple with two elements. The first element is the path to the file relative to the base directory. The second
23
- element is the xxHash3-128 checksum that covers the relative path and the contents of the file.
24
- """
25
-
26
- def calculate_directory_checksum(
27
- directory: Path, num_processes: int | None = None, batch: bool = False, save_checksum: bool = True
28
- ) -> str:
29
- """Calculates xxHash3-128 checksum for the input directory, which includes the data of all contained files and
30
- the directory structure information.
31
-
32
- This function is used to generate a checksum for the raw_data directory of each experiment or training session.
33
- Checksums are used to verify the session data integrity during transmission between the PC that acquired the data
34
- and long-term storage locations, such as the Synology NAS or the BioHPC server. The function can be configured to
35
- write the generated checksum as a hexadecimal string to the ax_checksum.txt file stored at the highest level of the
36
- input directory.
37
-
38
- Note:
39
- This method uses multiprocessing to efficiently parallelize checksum calculation for multiple files. In
40
- combination with xxHash3, this achieves a significant speedup over more common checksums, such as MD5 and
41
- SHA256. Note that xxHash3 is not suitable for security purposes and is only used to ensure data integrity.
42
-
43
- The method notifies the user about the checksum calculation process via the terminal.
44
-
45
- The returned checksum accounts for both the contents of each file and the layout of the input directory
46
- structure.
47
-
48
- Args:
49
- directory: The Path to the directory to be checksummed.
50
- num_processes: The number of CPU processes to use for parallelizing checksum calculation. If set to None, the
51
- function defaults to using (logical CPU count - 4).
52
- batch: Determines whether the function is called as part of batch-processing multiple directories. This is used
53
- to optimize progress reporting to avoid cluttering the terminal.
54
- save_checksum: Determines whether the checksum should be saved (written to) a .txt file.
55
-
56
- Returns:
57
- The xxHash3-128 checksum for the input directory as a hexadecimal string.
58
- """
@@ -1,239 +0,0 @@
1
- from pathlib import Path
2
-
3
- import polars as pl
4
-
5
- from ..data_classes import (
6
- SessionData as SessionData,
7
- SessionTypes as SessionTypes,
8
- TrackerFileNames as TrackerFileNames,
9
- RunTrainingDescriptor as RunTrainingDescriptor,
10
- LickTrainingDescriptor as LickTrainingDescriptor,
11
- WindowCheckingDescriptor as WindowCheckingDescriptor,
12
- MesoscopeExperimentDescriptor as MesoscopeExperimentDescriptor,
13
- get_processing_tracker as get_processing_tracker,
14
- )
15
- from .packaging_tools import calculate_directory_checksum as calculate_directory_checksum
16
-
17
- class ProjectManifest:
18
- """Wraps the contents of a Sun lab project manifest .feather file and exposes methods for visualizing and
19
- working with the data stored inside the file.
20
-
21
- This class functions as a high-level API for working with Sun lab projects. It is used both to visualize the
22
- current state of various projects and during automated data processing to determine which processing steps to
23
- apply to different sessions.
24
-
25
- Args:
26
- manifest_file: The path to the .feather manifest file that stores the target project's state data.
27
-
28
- Attributes:
29
- _data: Stores the manifest data as a Polars DataFrame.
30
- _animal_string: Determines whether animal IDs are stored as strings or unsigned integers.
31
- """
32
-
33
- _data: pl.DataFrame
34
- _animal_string: bool
35
- def __init__(self, manifest_file: Path) -> None: ...
36
- def print_data(self) -> None:
37
- """Prints the entire contents of the manifest file to the terminal."""
38
- def print_summary(self, animal: str | int | None = None) -> None:
39
- """Prints a summary view of the manifest file to the terminal, excluding the 'experimenter notes' data for
40
- each session.
41
-
42
- This data view is optimized for tracking which processing steps have been applied to each session inside the
43
- project.
44
-
45
- Args:
46
- animal: The ID of the animal for which to display the data. If an ID is provided, this method will only
47
- display the data for that animal. Otherwise, it will display the data for all animals.
48
- """
49
- def print_notes(self, animal: str | int | None = None) -> None:
50
- """Prints only animal, session, and notes data from the manifest file.
51
-
52
- This data view is optimized for experimenters to check what sessions have been recorded for each animal in the
53
- project and refresh their memory on the outcomes of each session using experimenter notes.
54
-
55
- Args:
56
- animal: The ID of the animal for which to display the data. If an ID is provided, this method will only
57
- display the data for that animal. Otherwise, it will display the data for all animals.
58
- """
59
- @property
60
- def animals(self) -> tuple[str, ...]:
61
- """Returns all unique animal IDs stored inside the manifest file.
62
-
63
- This provides a tuple of all animal IDs participating in the target project.
64
- """
65
- def _get_filtered_sessions(
66
- self,
67
- animal: str | int | None = None,
68
- exclude_incomplete: bool = True,
69
- dataset_ready_only: bool = False,
70
- not_dataset_ready_only: bool = False,
71
- ) -> tuple[str, ...]:
72
- """This worker method is used to get a list of sessions with optional filtering.
73
-
74
- User-facing methods call this worker under-the-hood to fetch the filtered tuple of sessions.
75
-
76
- Args:
77
- animal: An optional animal ID to filter the sessions. If set to None, the method returns sessions for all
78
- animals.
79
- exclude_incomplete: Determines whether to exclude sessions not marked as 'complete' from the output
80
- list.
81
- dataset_ready_only: Determines whether to exclude sessions not marked as 'dataset' integration ready from
82
- the output list. Enabling this option only shows sessions that can be integrated into a dataset.
83
- not_dataset_ready_only: The opposite of 'dataset_ready_only'. Determines whether to exclude sessions marked
84
- as 'dataset' integration ready from the output list. Note, when both this and 'dataset_ready_only' are
85
- enabled, the 'dataset_ready_only' option takes precedence.
86
-
87
- Returns:
88
- The tuple of session IDs matching the filter criteria.
89
-
90
- Raises:
91
- ValueError: If the specified animal is not found in the manifest file.
92
- """
93
- @property
94
- def sessions(self) -> tuple[str, ...]:
95
- """Returns all session IDs stored inside the manifest file.
96
-
97
- This property provides a tuple of all sessions, independent of the participating animal, that were recorded as
98
- part of the target project. Use the get_sessions() method to get a filtered tuple of session IDs.
99
- """
100
- def get_sessions(
101
- self,
102
- animal: str | int | None = None,
103
- exclude_incomplete: bool = True,
104
- dataset_ready_only: bool = False,
105
- not_dataset_ready_only: bool = False,
106
- ) -> tuple[str, ...]:
107
- """Returns requested session IDs based on selected filtering criteria.
108
-
109
- This method provides a tuple of sessions based on the specified filters. If no animal is specified, returns
110
- sessions for all animals in the project.
111
-
112
- Args:
113
- animal: An optional animal ID to filter the sessions. If set to None, the method returns sessions for all
114
- animals.
115
- exclude_incomplete: Determines whether to exclude sessions not marked as 'complete' from the output
116
- list.
117
- dataset_ready_only: Determines whether to exclude sessions not marked as 'dataset' integration ready from
118
- the output list. Enabling this option only shows sessions that can be integrated into a dataset.
119
- not_dataset_ready_only: The opposite of 'dataset_ready_only'. Determines whether to exclude sessions marked
120
- as 'dataset' integration ready from the output list. Note, when both this and 'dataset_ready_only' are
121
- enabled, the 'dataset_ready_only' option takes precedence.
122
-
123
- Returns:
124
- The tuple of session IDs matching the filter criteria.
125
-
126
- Raises:
127
- ValueError: If the specified animal is not found in the manifest file.
128
- """
129
- def get_session_info(self, session: str) -> pl.DataFrame:
130
- """Returns a Polars DataFrame that stores detailed information for the specified session.
131
-
132
- Since session IDs are unique, it is expected that filtering by session ID is enough to get the requested
133
- information.
134
-
135
- Args:
136
- session: The ID of the session for which to retrieve the data.
137
-
138
- Returns:
139
- A Polars DataFrame with the following columns: 'animal', 'date', 'notes', 'session', 'type', 'complete',
140
- 'intensity_verification', 'suite2p', 'behavior', 'video', 'dataset'.
141
- """
142
-
143
- def generate_project_manifest(
144
- raw_project_directory: Path, output_directory: Path, processed_data_root: Path | None = None
145
- ) -> None:
146
- """Builds and saves the project manifest .feather file under the specified output directory.
147
-
148
- This function evaluates the input project directory and builds the 'manifest' file for the project. The file
149
- includes the descriptive information about every session stored inside the input project folder and the state of
150
- the session's data processing (which processing pipelines have been applied to each session). The file will be
151
- created under the 'output_directory' directory and use the following name pattern: ProjectName_manifest.feather.
152
-
153
- Notes:
154
- The manifest file is primarily used to capture and move project state information between machines, typically
155
- in the context of working with data stored on a remote compute server or cluster. However, it can also be used
156
- on a local machine, since an up-to-date manifest file is required to run most data processing pipelines in the
157
- lab regardless of the runtime context.
158
-
159
- Args:
160
- raw_project_directory: The path to the root project directory used to store raw session data.
161
- output_directory: The path to the directory where to save the generated manifest file.
162
- processed_data_root: The path to the root directory (volume) used to store processed data for all Sun lab
163
- projects if it is different from the parent of the 'raw_project_directory'. Typically, this would be the
164
- case on remote compute server(s) and not on local machines.
165
- """
166
-
167
- def verify_session_checksum(
168
- session_path: Path,
169
- manager_id: int,
170
- create_processed_data_directory: bool = True,
171
- processed_data_root: None | Path = None,
172
- update_manifest: bool = False,
173
- ) -> None:
174
- """Verifies the integrity of the session's raw data by generating the checksum of the raw_data directory and
175
- comparing it against the checksum stored in the ax_checksum.txt file.
176
-
177
- Primarily, this function is used to verify data integrity after transferring it from a local PC to the remote
178
- server for long-term storage. This function is designed to create the 'verified.bin' marker file if the checksum
179
- matches and to remove the 'telomere.bin' and 'verified.bin' marker files if it does not.
180
-
181
- Notes:
182
- Removing the telomere.bin marker file from the session's raw_data folder marks the session as incomplete,
183
- excluding it from all further automatic processing.
184
-
185
- This function is also used to create the processed data hierarchy on the BioHPC server, when it is called as
186
- part of the data preprocessing runtime performed by a data acquisition system.
187
-
188
- Since version 3.1.0, this function also supports (re)generating the processed session's project manifest file,
189
- which is used to support further Sun lab data processing pipelines.
190
-
191
- Args:
192
- session_path: The path to the session directory to be verified. Note, the input session directory must contain
193
- the 'raw_data' subdirectory.
194
- manager_id: The xxHash-64 hash-value that specifies the unique identifier of the manager process that
195
- manages the integrity verification runtime.
196
- create_processed_data_directory: Determines whether to create the processed data hierarchy during runtime.
197
- processed_data_root: The root directory where to store the processed data hierarchy. This path has to point to
198
- the root directory where to store the processed data from all projects, and it will be automatically
199
- modified to include the project name, the animal name, and the session ID.
200
- update_manifest: Determines whether to update (regenerate) the project manifest file for the processed session's
201
- project. This should always be enabled when working with remote compute server(s) to ensure that the
202
- project manifest file contains the most up-to-date snapshot of the project's state.
203
- """
204
-
205
- def resolve_p53_marker(
206
- session_path: Path,
207
- create_processed_data_directory: bool = True,
208
- processed_data_root: None | Path = None,
209
- remove: bool = False,
210
- update_manifest: bool = False,
211
- ) -> None:
212
- """Depending on configuration, either creates or removes the p53.bin marker file for the target session.
213
-
214
- The marker file statically determines whether the session can be targeted by data processing or dataset formation
215
- pipelines.
216
-
217
- Notes:
218
- Since dataset integration relies on data processing outputs, it is essential to prevent processing pipelines
219
- from altering the data while it is integrated into a dataset. The p53.bin marker solves this issue by ensuring
220
- that only one type of runtime (processing or dataset integration) is allowed to work with the session.
221
-
222
- For the p53.bin marker to be created, the session must not be undergoing processing. For the p53 marker
223
- to be removed, the session must not be undergoing dataset integration.
224
-
225
- Since version 3.1.0, this function also supports (re)generating the processed session's project manifest file,
226
- which is used to support further Sun lab data processing pipelines.
227
-
228
- Args:
229
- session_path: The path to the session directory for which the p53.bin marker needs to be resolved. Note, the
230
- input session directory must contain the 'raw_data' subdirectory.
231
- create_processed_data_directory: Determines whether to create the processed data hierarchy during runtime.
232
- processed_data_root: The root directory where to store the processed data hierarchy. This path has to point to
233
- the root directory where to store the processed data from all projects, and it will be automatically
234
- modified to include the project name, the animal name, and the session ID.
235
- remove: Determines whether this function is called to create or remove the p53.bin marker.
236
- update_manifest: Determines whether to update (regenerate) the project manifest file for the processed session's
237
- project. This should always be enabled when working with remote compute server(s) to ensure that the
238
- project manifest file contains the most up-to-date snapshot of the project's state.
239
- """