sl-shared-assets 1.0.0rc19__py3-none-any.whl → 1.0.0rc21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sl-shared-assets has been flagged as possibly problematic.
Files changed (36)
  1. sl_shared_assets/__init__.py +27 -27
  2. sl_shared_assets/__init__.pyi +73 -0
  3. sl_shared_assets/cli.py +266 -40
  4. sl_shared_assets/cli.pyi +87 -0
  5. sl_shared_assets/data_classes/__init__.py +23 -20
  6. sl_shared_assets/data_classes/__init__.pyi +61 -0
  7. sl_shared_assets/data_classes/configuration_data.py +407 -26
  8. sl_shared_assets/data_classes/configuration_data.pyi +194 -0
  9. sl_shared_assets/data_classes/runtime_data.py +59 -41
  10. sl_shared_assets/data_classes/runtime_data.pyi +145 -0
  11. sl_shared_assets/data_classes/session_data.py +168 -914
  12. sl_shared_assets/data_classes/session_data.pyi +249 -0
  13. sl_shared_assets/data_classes/surgery_data.py +3 -3
  14. sl_shared_assets/data_classes/surgery_data.pyi +89 -0
  15. sl_shared_assets/server/__init__.pyi +8 -0
  16. sl_shared_assets/server/job.pyi +94 -0
  17. sl_shared_assets/server/server.pyi +95 -0
  18. sl_shared_assets/tools/__init__.py +8 -1
  19. sl_shared_assets/tools/__init__.pyi +15 -0
  20. sl_shared_assets/tools/ascension_tools.py +27 -26
  21. sl_shared_assets/tools/ascension_tools.pyi +68 -0
  22. sl_shared_assets/tools/packaging_tools.py +14 -1
  23. sl_shared_assets/tools/packaging_tools.pyi +56 -0
  24. sl_shared_assets/tools/project_management_tools.py +164 -0
  25. sl_shared_assets/tools/project_management_tools.pyi +48 -0
  26. sl_shared_assets/tools/transfer_tools.pyi +53 -0
  27. {sl_shared_assets-1.0.0rc19.dist-info → sl_shared_assets-1.0.0rc21.dist-info}/METADATA +21 -4
  28. sl_shared_assets-1.0.0rc21.dist-info/RECORD +36 -0
  29. sl_shared_assets-1.0.0rc21.dist-info/entry_points.txt +8 -0
  30. sl_shared_assets/suite2p/__init__.py +0 -8
  31. sl_shared_assets/suite2p/multi_day.py +0 -225
  32. sl_shared_assets/suite2p/single_day.py +0 -563
  33. sl_shared_assets-1.0.0rc19.dist-info/RECORD +0 -23
  34. sl_shared_assets-1.0.0rc19.dist-info/entry_points.txt +0 -4
  35. {sl_shared_assets-1.0.0rc19.dist-info → sl_shared_assets-1.0.0rc21.dist-info}/WHEEL +0 -0
  36. {sl_shared_assets-1.0.0rc19.dist-info → sl_shared_assets-1.0.0rc21.dist-info}/licenses/LICENSE +0 -0
@@ -5,13 +5,12 @@ an example for how to convert other data formats to match use the Sun lab data s
 
 from pathlib import Path
 import datetime
-import tempfile
 
 import numpy as np
-from ataraxis_base_utilities import LogLevel, console
+from ataraxis_base_utilities import LogLevel, console, ensure_directory_exists
 from ataraxis_time.time_helpers import extract_timestamp_from_bytes
 
-from ..data_classes import SessionData, ProjectConfiguration
+from ..data_classes import SessionData, ProjectConfiguration, get_system_configuration_data
 from .transfer_tools import transfer_directory
 from .packaging_tools import calculate_directory_checksum
 
@@ -170,7 +169,7 @@ def _reorganize_data(session_data: SessionData, source_root: Path) -> bool:
     return True
 
 
-def ascend_tyche_data(root_directory: Path, output_root_directory: Path, server_root_directory: Path) -> None:
+def ascend_tyche_data(root_directory: Path) -> None:
     """Reformats the old Tyche data to use the modern Sun lab layout and metadata files.
 
     This function is used to convert old Tyche data to the modern data management standard. This is used to make the
@@ -188,30 +187,24 @@ def ascend_tyche_data(root_directory: Path, output_root_directory: Path, server_
         this function for a large number of sessions will result in a long processing time due to the network data
         transfer.
 
+        Since SessionData can only be created on a PC that has a valid acquisition system config, this function will
+        only work on a machine that is part of an active Sun lab acquisition system.
+
     Args:
         root_directory: The directory that stores one or more Tyche animal folders. This can be conceptualized as the
             root directory for the Tyche project.
-        output_root_directory: The path to the local directory where to generate the converted Tyche project hierarchy.
-            Typically, this is the 'root' directory where all other Sun lab projects are stored.
-        server_root_directory: The path to the local filesystem-mounted BioHPC server storage directory. Note, this
-            directory hs to be mapped to the local filesystem via the SMB or equivalent protocol.
     """
     # Generates a (shared) project configuration file.
     project_configuration = ProjectConfiguration()
 
-    # Generates a temporary directory for NAS and Mesoscope paths. Since Tyche data is already backed up on the NAS and
-    # we are not generating new data, these root paths are not needed, but have to be created as part of the pipeline.
-    # Redirecting them to local temporary directories allows avoiding extra steps to manually remove these redundant
-    # directories after runtime.
-    temp_nas_dir = Path(tempfile.mkdtemp(prefix="nas_temp_"))
-    temp_mesoscope_dir = Path(tempfile.mkdtemp(prefix="mesoscope_temp_"))
+    # The acquisition system config resolves most paths and filesystem configuration arguments
+    acquisition_system = get_system_configuration_data()
+    output_root_directory = acquisition_system.paths.root_directory
+    server_root_directory = acquisition_system.paths.server_storage_directory
 
     # Statically defines project name and local root paths
-    project_configuration.project_name = "Tyche"
-    project_configuration.local_root_directory = output_root_directory
-    project_configuration.local_server_directory = server_root_directory
-    project_configuration.local_nas_directory = temp_nas_dir
-    project_configuration.local_mesoscope_directory = temp_mesoscope_dir
+    project_name = "Tyche"
+    project_configuration.project_name = project_name
 
     # Uses nonsensical google sheet IDs. Tyche project did not use Google Sheet processing like our modern projects do.
     project_configuration.water_log_sheet_id = "1xFh9Q2zT7pL3mVkJdR8bN6yXoE4wS5aG0cHu2Kf7D3v"
@@ -219,13 +212,14 @@ def ascend_tyche_data(root_directory: Path, output_root_directory: Path, server_
 
     # Dumps project configuration into the 'configuration' subfolder of the Tyche project.
     configuration_path = output_root_directory.joinpath("Tyche", "configuration", "project_configuration.yaml")
+    ensure_directory_exists(configuration_path)
     project_configuration.save(path=configuration_path)
 
     # Assumes that root directory stores all animal folders to be processed
     for animal_folder in root_directory.iterdir():
         # Each animal folder is named to include project name and a static animal ID, e.g.: Tyche-A7. This extracts each
         # animal ID.
-        animal_name = animal_folder.name.split(sep="-")[1]
+        animal_name = animal_folder.stem.split(sep="-")[1]
 
         # Under each animal root folder, there are day folders that use YYYY-MM-DD timestamps
         for session_folder in animal_folder.iterdir():
@@ -240,11 +234,11 @@ def ascend_tyche_data(root_directory: Path, output_root_directory: Path, server_
             # session data hierarchy using the output root. This generates a 'standard' Sun lab directory structure
             # for the Tyche data.
             session_data = SessionData.create(
+                project_name=project_configuration.project_name,
                 session_name=session_name,
                 animal_id=animal_name,
-                project_configuration=project_configuration,
-                session_type="Experiment",
-                experiment_name=None,  # Has to be none, otherwise the system tries to copy a configuration file.
+                session_type="mesoscope experiment",
+                experiment_name=None,
             )
 
             # Moves the data from the old hierarchy to the new hierarchy. If the process runs as expected, and
@@ -259,15 +253,22 @@ def ascend_tyche_data(root_directory: Path, output_root_directory: Path, server_
                 # noinspection PyTypeChecker
                 console.echo(message=message, level=LogLevel.WARNING)
             else:
-                # If the transfer process was successful, generates a new checksum for the moved data
+                # Generates the telomere.bin file to mark the session as 'complete'
+                session_data.raw_data.telomere_path.touch()
+
+                # If the local transfer process was successful, generates a new checksum for the moved data
                 calculate_directory_checksum(directory=Path(session_data.raw_data.raw_data_path))
+
                 # Next, copies the data to the BioHPC server for further processing
                 transfer_directory(
                     source=Path(session_data.raw_data.raw_data_path),
-                    destination=Path(session_data.destinations.server_raw_data_path),
+                    destination=Path(
+                        server_root_directory.joinpath(project_name, animal_name, session_name, "raw_data")
+                    ),
                     verify_integrity=False,
                 )
-                # Finally, removes the now-empty old session data directory.
+
+                # Removes the now-empty old session data directory.
                 acquisition_folder.rmdir()
 
         # If the loop above removed all acquisition folders, all data for that day has been successfully converted
@@ -0,0 +1,68 @@
+from pathlib import Path
+
+from ..data_classes import (
+    SessionData as SessionData,
+    ProjectConfiguration as ProjectConfiguration,
+    get_system_configuration_data as get_system_configuration_data,
+)
+from .transfer_tools import transfer_directory as transfer_directory
+from .packaging_tools import calculate_directory_checksum as calculate_directory_checksum
+
+def _generate_session_name(acquisition_path: Path) -> str:
+    """Generates a session name using the last modification time of a zstack.mat or MotionEstimator.me file.
+
+    This worker function uses one of the motion estimation files stored in each Tyche 'acquisition' subfolder to
+    generate a modern Sun lab timestamp-based session name. This is used to translate the original Tyche session naming
+    pattern into the pattern used by all modern Sun lab projects and pipelines.
+
+    Args:
+        acquisition_path: The absolute path to the target acquisition folder. These folders are found under the 'day'
+            folders for each animal, e.g.: Tyche-A7/2022_01_03/1.
+
+    Returns:
+        The modernized session name.
+    """
+
+def _reorganize_data(session_data: SessionData, source_root: Path) -> bool:
+    """Reorganizes and moves the session's data from the source folder in the old Tyche data hierarchy to the raw_data
+    folder in the newly created modern hierarchy.
+
+    This worker function is used to physically rearrange the data from the original Tyche data structure to the
+    new data structure. It both moves the existing files to their new destinations and renames certain files to match
+    the modern naming convention used in the Sun lab.
+
+    Args:
+        session_data: The initialized SessionData instance managing the 'ascended' (modernized) session data hierarchy.
+        source_root: The absolute path to the old Tyche data hierarchy folder that stores session's data.
+
+    Returns:
+        True if the ascension process was successfully completed. False if the process encountered missing data or
+        otherwise did not go as expected. When the method returns False, the runtime function requests user intervention
+        to finalize the process manually.
+    """
+
+def ascend_tyche_data(root_directory: Path) -> None:
+    """Reformats the old Tyche data to use the modern Sun lab layout and metadata files.
+
+    This function is used to convert old Tyche data to the modern data management standard. This is used to make the
+    data compatible with the modern Sun lab data workflows.
+
+    Notes:
+        This function is statically written to work with the raw Tyche dataset featured in the OSM manuscript:
+        https://www.nature.com/articles/s41586-024-08548-w. Additionally, it assumes that the dataset has been
+        preprocessed with the early Sun lab mesoscope compression pipeline. The function will not work for any other
+        project or data hierarchy.
+
+        As part of its runtime, the function automatically transfers the ascended session data to the BioHPC server.
+        Since transferring the data over the network is the bottleneck of this pipeline, it runs in a single-threaded
+        mode and is constrained by the communication channel between the local machine and the BioHPC server. Calling
+        this function for a large number of sessions will result in a long processing time due to the network data
+        transfer.
+
+        Since SessionData can only be created on a PC that has a valid acquisition system config, this function will
+        only work on a machine that is part of an active Sun lab acquisition system.
+
+    Args:
+        root_directory: The directory that stores one or more Tyche animal folders. This can be conceptualized as the
+            root directory for the Tyche project.
+    """
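A minimal usage sketch for the new single-argument `ascend_tyche_data` signature, assuming the calling machine holds a valid Sun lab acquisition-system configuration; the legacy data path below is illustrative, not part of the diff:

```python
# Sketch only: converts every Tyche animal/day/acquisition folder found under the
# (hypothetical) legacy root and transfers the resulting raw_data to the BioHPC server.
# Output and server roots now come from the acquisition system config, not arguments.
from pathlib import Path

from sl_shared_assets.tools.ascension_tools import ascend_tyche_data

ascend_tyche_data(root_directory=Path("/data/legacy/Tyche"))
```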
@@ -10,6 +10,19 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
 from tqdm import tqdm
 import xxhash
 
+# Defines a 'blacklist' set of files. Primarily, this lit contains the service files that may change after the session
+# data has been acquired. Therefore, it does not make sense to include them in the checksum, as they do not reflect the
+# data that should remain permanently unchanged. Note, make sure all service files are added to this set!
+_excluded_files = {
+    "ax_checksum.txt",
+    "ubiquitin.bin",
+    "telomere.bin",
+    "single_day_suite2p.bin",
+    "multi_day_suite2p.bin",
+    "behavior.bin",
+    "dlc.bin",
+}
+
 
 def _calculate_file_checksum(base_directory: Path, file_path: Path) -> tuple[str, bytes]:
     """Calculates xxHash3-128 checksum for a single file and its path relative to the base directory.
@@ -89,7 +102,7 @@ def calculate_directory_checksum(
     files = sorted(
         path
         for path in directory.rglob("*")
-        if path.is_file() and path.stem != "ax_checksum" and path.suffix != ".txt"  # Excludes checksum files
+        if path.is_file() and f"{path.stem}{path.suffix}" not in _excluded_files  # Excludes service files
     )
 
     # Precreates the directory checksum
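The new filter compares each file's full name (stem plus suffix) against the `_excluded_files` set instead of the old ax_checksum/.txt heuristic. A self-contained sketch of the same check, using an illustrative subset of the set and made-up paths:

```python
# Sketch: service files are skipped by full file name; data files pass through.
from pathlib import Path

_excluded_files = {"ax_checksum.txt", "telomere.bin", "ubiquitin.bin"}  # illustrative subset

candidates = [Path("raw_data/telomere.bin"), Path("raw_data/frame_000001.tiff")]
included = [p for p in candidates if f"{p.stem}{p.suffix}" not in _excluded_files]
# Only the .tiff frame survives the filter; the telomere.bin marker is excluded.
print(included)
```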
@@ -0,0 +1,56 @@
+from pathlib import Path
+
+from _typeshed import Incomplete
+
+_excluded_files: Incomplete
+
+def _calculate_file_checksum(base_directory: Path, file_path: Path) -> tuple[str, bytes]:
+    """Calculates xxHash3-128 checksum for a single file and its path relative to the base directory.
+
+    This function is passed to parallel workers used by the calculate_directory_hash() method that iteratively
+    calculates the checksum for all files inside a directory. Each call to this function returns the checksum for the
+    target file, which includes both the contents of the file and its path relative to the base directory.
+
+    Args:
+        base_directory: The path to the base (root) directory which is being checksummed by the main
+            'calculate_directory_checksum' function.
+        file_path: The absolute path to the target file.
+
+    Returns:
+        A tuple with two elements. The first element is the path to the file relative to the base directory. The second
+        element is the xxHash3-128 checksum that covers the relative path and the contents of the file.
+    """
+
+def calculate_directory_checksum(
+    directory: Path, num_processes: int | None = None, batch: bool = False, save_checksum: bool = True
+) -> str:
+    """Calculates xxHash3-128 checksum for the input directory, which includes the data of all contained files and
+    the directory structure information.
+
+    This function is used to generate a checksum for the raw_data directory of each experiment or training session.
+    Checksums are used to verify the session data integrity during transmission between the PC that acquired the data
+    and long-term storage locations, such as the Synology NAS or the BioHPC server. The function can be configured to
+    write the generated checksum as a hexadecimal string to the ax_checksum.txt file stored at the highest level of the
+    input directory.
+
+    Note:
+        This method uses multiprocessing to efficiently parallelize checksum calculation for multiple files. In
+        combination with xxHash3, this achieves a significant speedup over more common checksums, such as MD5 and
+        SHA256. Note that xxHash3 is not suitable for security purposes and is only used to ensure data integrity.
+
+        The method notifies the user about the checksum calculation process via the terminal.
+
+        The returned checksum accounts for both the contents of each file and the layout of the input directory
+        structure.
+
+    Args:
+        directory: The Path to the directory to be checksummed.
+        num_processes: The number of CPU processes to use for parallelizing checksum calculation. If set to None, the
+            function defaults to using (logical CPU count - 4).
+        batch: Determines whether the function is called as part of batch-processing multiple directories. This is used
+            to optimize progress reporting to avoid cluttering the terminal.
+        save_checksum: Determines whether the checksum should be saved (written to) a .txt file.
+
+    Returns:
+        The xxHash3-128 checksum for the input directory as a hexadecimal string.
+    """
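A hedged usage sketch for the stubbed API above, assuming a session's raw_data folder at an illustrative path:

```python
# Sketch: checksumming a session's raw_data folder. The directory path is illustrative;
# with save_checksum=True the hex digest is also written to ax_checksum.txt in that folder.
from pathlib import Path

from sl_shared_assets.tools.packaging_tools import calculate_directory_checksum

checksum = calculate_directory_checksum(
    directory=Path("/data/Tyche/A7/2022-01-03-12-00-00-000000/raw_data"),
    num_processes=None,  # defaults to (logical CPU count - 4)
    batch=False,
    save_checksum=True,
)
print(f"xxHash3-128: {checksum}")
```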
@@ -0,0 +1,164 @@
+"""This module provides tools for managing the data of any Sun lab project. Tools from this module extend the
+functionality of SessionData class via a convenient API that allows working with the data of multiple sessions making
+up a given project."""
+
+from pathlib import Path
+
+import polars as pl
+
+from ..data_classes import SessionData
+from .packaging_tools import calculate_directory_checksum
+
+
+def generate_project_manifest(
+    raw_project_directory: Path, output_directory: Path, processed_project_directory: Path | None = None
+) -> None:
+    """Builds and saves the project manifest .feather file under the specified output directory.
+
+    This function evaluates the input project directory and builds the 'manifest' file for the project. The file
+    includes the descriptive information about every session stored inside the input project folder and the state of
+    session's data processing (which processing pipelines have been applied to each session). The file will be created
+    under the 'output_path' directory and use the following name pattern: {ProjectName}}_manifest.feather.
+
+    Notes:
+        The manifest file is primarily used to capture and move project state information between machines, typically
+        in the context of working with data stored on a remote compute server or cluster. However, it can also be used
+        on a local machine, since an up-to-date manifest file is required to run most data processing pipelines in the
+        lab regardless of the runtime context.
+
+    Args:
+        raw_project_directory: The path to the root project directory used to store raw session data.
+        output_directory: The path to the directory where to save the generated manifest file.
+        processed_project_directory: The path to the root project directory used to store processed session data if it
+            is different from the 'raw_project_directory'. Typically, this would be the case on remote compute server(s)
+            and not on local machines.
+    """
+    # Finds all raw data directories
+    session_directories = [directory.parent for directory in raw_project_directory.rglob("raw_data")]
+
+    # Precreates the 'manifest' dictionary structure
+    manifest: dict[str, list[str | bool]] = {
+        "animal": [],  # Animal IDs.
+        "session": [],  # Session names.
+        "type": [],  # Type of the session (e.g., Experiment, Training, etc.).
+        "raw_data": [],  # Server-side raw_data folder path.
+        "processed_data": [],  # Server-side processed_data folder path.
+        "complete": [],  # Determines if the session data is complete. Incomplete sessions are excluded from processing.
+        "single_day_suite2p": [],  # Determines whether the session has been processed with the single-day s2p pipeline.
+        "multi_day_suite2p": [],  # Determines whether the session has been processed with the multi-day s2p pipeline.
+        "behavior": [],  # Determines whether the session has been processed with the behavior extraction pipeline.
+        "dlc": [],  # Determines whether the session has been processed with the DeepLabCut pipeline.
+    }
+
+    # Loops over each session of every animal in the project and extracts session ID information and information
+    # about which processing steps have been successfully applied to the session.
+    for directory in session_directories:
+        # Instantiates the SessionData instance to resolve the paths to all session's data files and locations.
+        session_data = SessionData.load(
+            session_path=directory, processed_data_root=processed_project_directory, make_processed_data_directory=False
+        )
+
+        # Fills the manifest dictionary with data for the processed session:
+
+        # Extracts ID and data path information from the SessionData instance
+        manifest["animal"].append(session_data.animal_id)
+        manifest["session"].append(session_data.session_name)
+        manifest["type"].append(session_data.session_type)
+        manifest["raw_data"].append(str(session_data.raw_data.raw_data_path))
+        manifest["processed_data"].append(str(session_data.processed_data.processed_data_path))
+
+        # If the session raw_data folder contains the telomere.bin file, marks the session as complete.
+        manifest["complete"].append(session_data.raw_data.telomere_path.exists())
+
+        # If the session is incomplete, marks all processing steps as FALSE, as automatic processing is disabled for
+        # incomplete sessions.
+        if not manifest["complete"][-1]:
+            manifest["single_day_suite2p"].append(False)
+            manifest["multi_day_suite2p"].append(False)
+            manifest["behavior"].append(False)
+            manifest["dlc"].append(False)
+            continue  # Cycles to the next session
+
+        # If the session processed_data folder contains the single-day suite2p.bin file, marks the single-day suite2p
+        # processing step as complete.
+        manifest["single_day_suite2p"].append(session_data.processed_data.single_day_suite2p_bin_path.exists())
+
+        # If the session processed_data folder contains the multi-day suite2p.bin file, marks the multi-day suite2p
+        # processing step as complete.
+        manifest["multi_day_suite2p"].append(session_data.processed_data.multi_day_suite2p_bin_path.exists())
+
+        # If the session processed_data folder contains the behavior.bin file, marks the behavior processing step as
+        # complete.
+        manifest["behavior"].append(session_data.processed_data.behavior_data_path.exists())
+
+        # If the session processed_data folder contains the dlc.bin file, marks the dlc processing step as
+        # complete.
+        manifest["dlc"].append(session_data.processed_data.dlc_bin_path.exists())
+
+    # Converts the manifest dictionary to a Polars Dataframe
+    schema = {
+        "animal": pl.String,
+        "session": pl.String,
+        "raw_data": pl.String,
+        "processed_data": pl.String,
+        "type": pl.String,
+        "complete": pl.Boolean,
+        "single_day_suite2p": pl.Boolean,
+        "multi_day_suite2p": pl.Boolean,
+        "behavior": pl.Boolean,
+        "dlc": pl.Boolean,
+    }
+    df = pl.DataFrame(manifest, schema=schema)
+
+    # Sorts the DataFrame by animal and then session. Since we assign animal IDs sequentially and 'name' sessions based
+    # on acquisition timestamps, the sort order is chronological.
+    sorted_df = df.sort(["animal", "session"])
+
+    # Saves the generated manifest to the project-specific manifest .feather file for further processing.
+    sorted_df.write_ipc(
+        file=output_directory.joinpath(f"{raw_project_directory.stem}_manifest.feather"), compression="lz4"
+    )
+
+
+def verify_session_checksum(session_path: Path) -> bool:
+    """Verifies the integrity of the session's raw data by generating the checksum of the raw_data directory and
+    comparing it against the checksum stored in the ax_checksum.txt file.
+
+    Primarily, this function is used to verify data integrity after transferring it from a local PC to the remote
+    server for long-term storage. This function is designed to do nothing if the checksum matches and to remove the
+    'telomere.bin' marker file if it does not.
+
+    Notes:
+        Removing the telomere.bin marker file from session's raw_data folder marks the session as incomplete, excluding
+        it from all further automatic processing.
+
+    Args:
+        session_path: The path to the session directory to be verified. Note, the input session directory must contain
+            the 'raw_data' subdirectory.
+
+    Returns:
+        True if the checksum matches, False otherwise.
+    """
+
+    # Loads session data layout
+    session_data = SessionData.load(session_path=session_path)
+
+    # Re-calculates the checksum for the raw_data directory
+    calculated_checksum = calculate_directory_checksum(
+        directory=session_data.raw_data.raw_data_path, batch=False, save_checksum=False
+    )
+
+    # Loads the checksum stored inside the ax_checksum.txt file
+    with open(session_data.raw_data.checksum_path, "r") as f:
+        stored_checksum = f.read().strip()
+
+    # If the two checksums do not match, this likely indicates data corruption.
+    if stored_checksum != calculated_checksum:
+        # If the telomere.bin file exists, removes this file. This automatically marks the session as incomplete for
+        # all other Sun lab runtimes. The presence of the telomere.bin file after integrity verification is used as a
+        # heuristic for determining whether the session has passed the verification process.
+        if session_data.raw_data.telomere_path.exists():
+            session_data.raw_data.telomere_path.unlink()
+        return False
+
+    return True
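Because the manifest is written as an Arrow IPC (.feather) file, downstream tooling can load it back with polars. A sketch of reading it and selecting sessions that still need single-day suite2p processing; the file path is illustrative, while the column names follow the schema defined in generate_project_manifest above:

```python
# Sketch: consuming the generated manifest with polars (path is illustrative).
import polars as pl

manifest = pl.read_ipc("/server/Tyche_manifest.feather")

# Complete sessions that have not yet been processed with the single-day suite2p pipeline.
pending = manifest.filter(pl.col("complete") & ~pl.col("single_day_suite2p"))
print(pending.select(["animal", "session", "raw_data"]))
```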
@@ -0,0 +1,48 @@
+from pathlib import Path
+
+from ..data_classes import SessionData as SessionData
+from .packaging_tools import calculate_directory_checksum as calculate_directory_checksum
+
+def generate_project_manifest(
+    raw_project_directory: Path, output_directory: Path, processed_project_directory: Path | None = None
+) -> None:
+    """Builds and saves the project manifest .feather file under the specified output directory.
+
+    This function evaluates the input project directory and builds the 'manifest' file for the project. The file
+    includes the descriptive information about every session stored inside the input project folder and the state of
+    session's data processing (which processing pipelines have been applied to each session). The file will be created
+    under the 'output_path' directory and use the following name pattern: {ProjectName}}_manifest.feather.
+
+    Notes:
+        The manifest file is primarily used to capture and move project state information between machines, typically
+        in the context of working with data stored on a remote compute server or cluster. However, it can also be used
+        on a local machine, since an up-to-date manifest file is required to run most data processing pipelines in the
+        lab regardless of the runtime context.
+
+    Args:
+        raw_project_directory: The path to the root project directory used to store raw session data.
+        output_directory: The path to the directory where to save the generated manifest file.
+        processed_project_directory: The path to the root project directory used to store processed session data if it
+            is different from the 'raw_project_directory'. Typically, this would be the case on remote compute server(s)
+            and not on local machines.
+    """
+
+def verify_session_checksum(session_path: Path) -> bool:
+    """Verifies the integrity of the session's raw data by generating the checksum of the raw_data directory and
+    comparing it against the checksum stored in the ax_checksum.txt file.
+
+    Primarily, this function is used to verify data integrity after transferring it from a local PC to the remote
+    server for long-term storage. This function is designed to do nothing if the checksum matches and to remove the
+    'telomere.bin' marker file if it does not.
+
+    Notes:
+        Removing the telomere.bin marker file from session's raw_data folder marks the session as incomplete, excluding
+        it from all further automatic processing.
+
+    Args:
+        session_path: The path to the session directory to be verified. Note, the input session directory must contain
+            the 'raw_data' subdirectory.
+
+    Returns:
+        True if the checksum matches, False otherwise.
+    """
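A usage sketch for the verification helper, assuming a session directory that was just transferred to the server (the path is illustrative):

```python
# Sketch: post-transfer integrity check. On a mismatch the function removes telomere.bin,
# which excludes the session from further automatic processing.
from pathlib import Path

from sl_shared_assets.tools.project_management_tools import verify_session_checksum

session = Path("/server/Tyche/A7/2022-01-03-12-00-00-000000")
if not verify_session_checksum(session_path=session):
    print(f"Checksum mismatch for {session}; session marked incomplete.")
```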
@@ -0,0 +1,53 @@
+from pathlib import Path
+
+from .packaging_tools import calculate_directory_checksum as calculate_directory_checksum
+
+def _transfer_file(source_file: Path, source_directory: Path, destination_directory: Path) -> None:
+    """Copies the input file from the source directory to the destination directory while preserving the file metadata.
+
+    This is a worker method used by the transfer_directory() method to move multiple files in parallel.
+
+    Notes:
+        If the file is found under a hierarchy of subdirectories inside the input source_directory, that hierarchy will
+        be preserved in the destination directory.
+
+    Args:
+        source_file: The file to be copied.
+        source_directory: The root directory where the file is located.
+        destination_directory: The destination directory where to move the file.
+    """
+
+def transfer_directory(source: Path, destination: Path, num_threads: int = 1, verify_integrity: bool = True) -> None:
+    """Copies the contents of the input directory tree from source to destination while preserving the folder
+    structure.
+
+    This function is used to assemble the experimental data from all remote machines used in the acquisition process on
+    the VRPC before the data is preprocessed. It is also used to transfer the preprocessed data from the VRPC to the
+    SynologyNAS and the Sun lab BioHPC server.
+
+    Notes:
+        This method recreates the moved directory hierarchy on the destination if the hierarchy does not exist. This is
+        done before copying the files.
+
+        The method executes a multithreading copy operation. It does not clean up the source files. That job is handed
+        to the specific preprocessing function from the sl_experiment or sl-forgery libraries that calls this function.
+
+        If the method is configured to verify transferred file integrity, it reruns the xxHash3-128 checksum calculation
+        and compares the returned checksum to the one stored in the source directory. The method assumes that all input
+        directories contain the 'ax_checksum.txt' file that stores the 'source' directory checksum at the highest level
+        of the input directory tree.
+
+    Args:
+        source: The path to the directory that needs to be moved.
+        destination: The path to the destination directory where to move the contents of the source directory.
+        num_threads: The number of threads to use for parallel file transfer. This number should be set depending on the
+            type of transfer (local or remote) and is not guaranteed to provide improved transfer performance. For local
+            transfers, setting this number above 1 will likely provide a performance boost. For remote transfers using
+            a single TCP / IP socket (such as non-multichannel SMB protocol), the number should be set to 1.
+        verify_integrity: Determines whether to perform integrity verification for the transferred files. Note,
+            integrity verification is a time-consuming process and generally would not be a concern for most runtimes.
+            Therefore, it is often fine to disable this option to optimize method runtime speed.
+
+    Raises:
+        RuntimeError: If the transferred files do not pass the xxHas3-128 checksum integrity verification.
+    """
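A hedged sketch of moving a session's raw_data folder to a filesystem-mounted server share using the stubbed transfer_directory signature; both paths are illustrative, and a single thread is used because the docstring recommends num_threads=1 for non-multichannel SMB transfers:

```python
# Sketch: transferring a raw_data directory to a mounted server share (paths illustrative).
# verify_integrity=True requires ax_checksum.txt inside the source folder.
from pathlib import Path

from sl_shared_assets.tools.transfer_tools import transfer_directory

transfer_directory(
    source=Path("/data/Tyche/A7/2022-01-03-12-00-00-000000/raw_data"),
    destination=Path("/mnt/biohpc/Tyche/A7/2022-01-03-12-00-00-000000/raw_data"),
    num_threads=1,          # single TCP socket (plain SMB), so one thread
    verify_integrity=True,  # re-hashes the copy and compares to the stored checksum
)
```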
@@ -1,10 +1,10 @@
 Metadata-Version: 2.4
 Name: sl-shared-assets
-Version: 1.0.0rc19
+Version: 1.0.0rc21
 Summary: Stores assets shared between multiple Sun (NeuroAI) lab data pipelines.
 Project-URL: Homepage, https://github.com/Sun-Lab-NBB/sl-shared-assets
 Project-URL: Documentation, https://sl-shared-assets-api-docs.netlify.app/
-Author: Ivan Kondratyev, Kushaan Gupta, Yuantao Deng
+Author: Ivan Kondratyev, Kushaan Gupta, Yuantao Deng, Natalie Yeung
 Maintainer-email: Ivan Kondratyev <ik278@cornell.edu>
 License: GNU GENERAL PUBLIC LICENSE
         Version 3, 29 June 2007
@@ -695,8 +695,10 @@ Requires-Dist: ataraxis-base-utilities<4,>=3
 Requires-Dist: ataraxis-data-structures<4,>=3.1.1
 Requires-Dist: ataraxis-time<4,>=3
 Requires-Dist: click<9,>=8
-Requires-Dist: dacite<2,>=1
+Requires-Dist: natsort<9,>=8
 Requires-Dist: paramiko<4,>=3.5.1
+Requires-Dist: polars<2,>=1
+Requires-Dist: pyarrow<21,>=20
 Requires-Dist: simple-slurm<1,>=0
 Requires-Dist: tqdm<5,>=4
 Requires-Dist: xxhash<4,>=3
@@ -717,8 +719,10 @@ Requires-Dist: types-tqdm<5,>=4; extra == 'conda'
 Provides-Extra: condarun
 Requires-Dist: appdirs<2,>=1; extra == 'condarun'
 Requires-Dist: click<9,>=8; extra == 'condarun'
-Requires-Dist: dacite<2,>=1; extra == 'condarun'
+Requires-Dist: natsort<9,>=8; extra == 'condarun'
 Requires-Dist: paramiko<4,>=3.5.1; extra == 'condarun'
+Requires-Dist: polars<2,>=1; extra == 'condarun'
+Requires-Dist: pyarrow<21,>=20; extra == 'condarun'
 Requires-Dist: tqdm<5,>=4; extra == 'condarun'
 Provides-Extra: dev
 Requires-Dist: ataraxis-automation<5,>=4; extra == 'dev'
@@ -781,6 +785,7 @@ acquisition and processing and provides the API for accessing the lab’s main c
 
 - [Dependencies](#dependencies)
 - [Installation](#installation)
+- [Usage](#usage)
 - [API Documentation](#api-documentation)
 - [Versioning](#versioning)
 - [Authors](#authors)
@@ -811,11 +816,22 @@ Use the following command to install the library using pip: ```pip install sl-sh
 
 ---
 
+## Usage
+
+All library components are intended to be used via other Sun lab libraries. Developers should study the API and CLI
+documentation below to learn how to use library components in other Sun lab libraries.
+
+---
+
 ## API Documentation
 
 See the [API documentation](https://sl-shared-assets-api-docs.netlify.app/) for the
 detailed description of the methods and classes exposed by components of this library.
 
+**Note!** The API documentation includes important information about Command-Line-Interfaces (CLIs) exposed by this
+library as part of installation into a Python environment. All users are highly encouraged to study the CLI
+documentation to learn how to use library components via the terminal.
+
 ___
 
 ## Versioning
@@ -830,6 +846,7 @@ We use [semantic versioning](https://semver.org/) for this project. For the vers
 - Ivan Kondratyev ([Inkaros](https://github.com/Inkaros))
 - Kushaan Gupta ([kushaangupta](https://github.com/kushaangupta))
 - Yuantao Deng ([YuantaoDeng](https://github.com/YuantaoDeng))
+- Natalie Yeung
 
 ___
 