sl-shared-assets 4.0.0-py3-none-any.whl → 5.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sl_shared_assets/__init__.py +45 -42
- sl_shared_assets/command_line_interfaces/__init__.py +3 -0
- sl_shared_assets/command_line_interfaces/configure.py +173 -0
- sl_shared_assets/command_line_interfaces/manage.py +226 -0
- sl_shared_assets/data_classes/__init__.py +33 -32
- sl_shared_assets/data_classes/configuration_data.py +267 -79
- sl_shared_assets/data_classes/runtime_data.py +11 -11
- sl_shared_assets/data_classes/session_data.py +226 -289
- sl_shared_assets/data_classes/surgery_data.py +6 -6
- sl_shared_assets/server/__init__.py +24 -4
- sl_shared_assets/server/job.py +6 -7
- sl_shared_assets/server/pipeline.py +570 -0
- sl_shared_assets/server/server.py +57 -25
- sl_shared_assets/tools/__init__.py +9 -8
- sl_shared_assets/tools/packaging_tools.py +14 -25
- sl_shared_assets/tools/project_management_tools.py +602 -523
- sl_shared_assets/tools/transfer_tools.py +88 -23
- {sl_shared_assets-4.0.0.dist-info → sl_shared_assets-5.0.0.dist-info}/METADATA +46 -202
- sl_shared_assets-5.0.0.dist-info/RECORD +23 -0
- sl_shared_assets-5.0.0.dist-info/entry_points.txt +3 -0
- sl_shared_assets/__init__.pyi +0 -91
- sl_shared_assets/cli.py +0 -500
- sl_shared_assets/cli.pyi +0 -106
- sl_shared_assets/data_classes/__init__.pyi +0 -75
- sl_shared_assets/data_classes/configuration_data.pyi +0 -235
- sl_shared_assets/data_classes/runtime_data.pyi +0 -157
- sl_shared_assets/data_classes/session_data.pyi +0 -379
- sl_shared_assets/data_classes/surgery_data.pyi +0 -89
- sl_shared_assets/server/__init__.pyi +0 -11
- sl_shared_assets/server/job.pyi +0 -205
- sl_shared_assets/server/server.pyi +0 -298
- sl_shared_assets/tools/__init__.pyi +0 -19
- sl_shared_assets/tools/ascension_tools.py +0 -265
- sl_shared_assets/tools/ascension_tools.pyi +0 -68
- sl_shared_assets/tools/packaging_tools.pyi +0 -58
- sl_shared_assets/tools/project_management_tools.pyi +0 -239
- sl_shared_assets/tools/transfer_tools.pyi +0 -53
- sl_shared_assets-4.0.0.dist-info/RECORD +0 -36
- sl_shared_assets-4.0.0.dist-info/entry_points.txt +0 -7
- {sl_shared_assets-4.0.0.dist-info → sl_shared_assets-5.0.0.dist-info}/WHEEL +0 -0
- {sl_shared_assets-4.0.0.dist-info → sl_shared_assets-5.0.0.dist-info}/licenses/LICENSE +0 -0
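The hunks reproduced below appear to come from sl_shared_assets/tools/project_management_tools.py, which replaces the old verify_session_checksum() / resolve_p53_marker() flow with resolve_checksum(), prepare_session(), archive_session(), and a reworked generate_project_manifest() built around the new SessionLock and ProcessingTracker primitives. As orientation only, here is a minimal sketch of how the new 5.0.0 entry points might be invoked, assuming the signatures shown in the diff; the import path, filesystem paths, and the manager_id value are hypothetical placeholders:

from pathlib import Path

# Assumed re-export from the package root; the diff defines these functions in
# sl_shared_assets/tools/project_management_tools.py.
from sl_shared_assets import (
    resolve_checksum,
    prepare_session,
    archive_session,
    generate_project_manifest,
)

manager_id = 42  # hypothetical identifier of the managing process
session = Path("/storage/SunLab/MyProject/Animal1/2024-11-05-12-30-00-000000")  # hypothetical session directory

# Verify the raw_data checksum (or rewrite it with regenerate_checksum=True).
resolve_checksum(session_path=session, manager_id=manager_id)

# Stage the session on the fast processing volume...
prepare_session(session_path=session, manager_id=manager_id, processed_data_root=Path("/fast"))

# ...and later move its processed data back to cold storage.
archive_session(session_path=session, manager_id=manager_id, processed_data_root=Path("/fast"))

# Per the diff, each call above also refreshes ProjectName_manifest.feather in a
# 'finally' block; the manifest can be rebuilt explicitly as well. Sessions live
# under root/project/animal/session, so parents[1] is the project directory.
generate_project_manifest(raw_project_directory=session.parents[1], manager_id=manager_id)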
@@ -1,6 +1,6 @@
 """This module provides tools for managing the data of any Sun lab project. Tools from this module extend the
-functionality of SessionData class via a convenient API
-
+functionality of the SessionData class via a convenient API to automate routine tasks that primarily support data
+processing pipelines."""

 from pathlib import Path
 from datetime import datetime
@@ -10,19 +10,604 @@ import polars as pl
 from filelock import FileLock
 from ataraxis_base_utilities import LogLevel, console

+from ..server import TrackerFileNames, ProcessingTracker
 from ..data_classes import (
     SessionData,
+    SessionLock,
     SessionTypes,
-    TrackerFileNames,
     RunTrainingDescriptor,
     LickTrainingDescriptor,
     WindowCheckingDescriptor,
     MesoscopeExperimentDescriptor,
-    get_processing_tracker,
 )
+from .transfer_tools import delete_directory, transfer_directory
 from .packaging_tools import calculate_directory_checksum


+def resolve_checksum(
+    session_path: Path,
+    manager_id: int,
+    processed_data_root: None | Path = None,
+    reset_tracker: bool = False,
+    regenerate_checksum: bool = False,
+) -> None:
+    """Verifies the integrity of the session's data by generating the checksum of the raw_data directory and comparing
+    it against the checksum stored in the ax_checksum.txt file.
+
+    Primarily, this function is used to verify data integrity after transferring it from the data acquisition system PC
+    to the remote server for long-term storage.
+
+    Notes:
+        Any session that does not successfully pass checksum verification (or recreation) is automatically excluded
+        from all further automatic processing steps.
+
+        Since version 5.0.0, this function also supports recalculating and overwriting the checksum stored inside the
+        ax_checksum.txt file. This allows this function to re-checksum session data, which is helpful if the
+        experimenter deliberately alters the session's data post-acquisition (for example, to comply with new data
+        storage guidelines).
+
+    Args:
+        session_path: The path to the session directory to be processed.
+        manager_id: The unique identifier of the manager process that manages the runtime.
+        processed_data_root: The path to the root directory used to store the processed data from all Sun lab projects,
+            if different from the 'session_path' root.
+        reset_tracker: Determines whether to reset the tracker file before executing the runtime. This allows
+            recovering from deadlocked runtimes, but otherwise should not be used to ensure runtime safety.
+        regenerate_checksum: Determines whether to update the checksum stored in the ax_checksum.txt file before
+            carrying out the verification. In this case, the verification necessarily succeeds and the session's
+            reference checksum is changed to reflect the current state of the session data.
+    """
+
+    # Loads session data layout. If configured to do so, also creates the processed data hierarchy
+    session_data = SessionData.load(
+        session_path=session_path,
+        processed_data_root=processed_data_root,
+    )
+
+    # Acquires the exclusive session data access lock.
+    lock = SessionLock(file_path=session_data.tracking_data.session_lock_path)
+    lock.acquire(manager_id=manager_id)
+
+    # Initializes the ProcessingTracker instance
+    tracker = ProcessingTracker(
+        file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.CHECKSUM)
+    )
+
+    # If requested, reset the tracker to the default state before starting the checksum resolution process.
+    if reset_tracker:
+        tracker.abort()
+
+    # Updates the tracker data to communicate that the process has started. This automatically clears the previous
+    # processing status stored in the file.
+    tracker.start(manager_id=manager_id)
+    try:
+        console.echo(
+            message=f"Resolving the data integrity checksum for session '{session_data.session_name}'...",
+            level=LogLevel.INFO,
+        )
+
+        # Regenerates the checksum for the raw_data directory. Note, if the 'regenerate_checksum' flag is True, this
+        # guarantees that the check below succeeds as the function replaces the checksum in the ax_checksum.txt file
+        # with the newly calculated value.
+        calculated_checksum = calculate_directory_checksum(
+            directory=session_data.raw_data.raw_data_path, batch=False, save_checksum=regenerate_checksum
+        )
+
+        # Loads the checksum stored inside the ax_checksum.txt file
+        with session_data.raw_data.checksum_path.open() as f:
+            stored_checksum = f.read().strip()
+
+        # If the two checksums do not match, this likely indicates data corruption.
+        if stored_checksum != calculated_checksum:
+            tracker.error(manager_id=manager_id)
+            console.echo(
+                message=f"Session '{session_data.session_name}' raw data integrity: Compromised.", level=LogLevel.ERROR
+            )
+
+        else:
+            # Sets the tracker to indicate that the runtime completed successfully.
+            tracker.stop(manager_id=manager_id)
+            console.echo(
+                message=f"Session '{session_data.session_name}' raw data integrity: Verified.", level=LogLevel.SUCCESS
+            )
+
+    finally:
+        # If the code reaches this section while the tracker indicates that the processing is still running,
+        # this means that the runtime encountered an error.
+        if tracker.is_running:
+            tracker.error(manager_id=manager_id)
+
+        # Updates or generates the manifest file inside the root raw data project directory
+        generate_project_manifest(
+            raw_project_directory=session_data.raw_data.root_path.joinpath(session_data.project_name),
+            processed_data_root=processed_data_root,
+            manager_id=manager_id,
+        )
+
+
+def prepare_session(
+    session_path: Path,
+    manager_id: int,
+    processed_data_root: Path | None,
+    reset_tracker: bool = False,
+) -> None:
+    """Prepares the target session for data processing and dataset integration.
+
+    This function is primarily designed to be used on remote compute servers that use different data volumes for
+    storage and processing. Since storage volumes are often slow, the session data needs to be copied to the fast
+    volume before executing processing pipelines. Typically, this function is used exactly once during each session's
+    life cycle: when it is first transferred to the remote compute server.
+
+    Args:
+        session_path: The path to the session directory to be processed.
+        manager_id: The unique identifier of the manager process that manages the runtime.
+        processed_data_root: The path to the root directory used to store the processed data from all Sun lab projects,
+            if different from the 'session_path' root.
+        reset_tracker: Determines whether to reset the tracker file before executing the runtime. This allows
+            recovering from deadlocked runtimes, but otherwise should not be used to ensure runtime safety.
+
+    Notes:
+        This function inverses the result of running the archive_session() function.
+    """
+    # Resolves the data hierarchy for the processed session
+    session_data = SessionData.load(
+        session_path=session_path,
+        processed_data_root=processed_data_root,
+    )
+
+    # Acquires the exclusive session data access lock.
+    lock = SessionLock(file_path=session_data.tracking_data.session_lock_path)
+    lock.acquire(manager_id=manager_id)
+
+    # Initializes the ProcessingTracker instances for preparation and archiving pipelines.
+    preparation_tracker = ProcessingTracker(
+        file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.PREPARATION)
+    )
+    archiving_tracker = ProcessingTracker(
+        file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.ARCHIVING)
+    )
+
+    # Explicitly prevents colliding with ongoing archiving runtimes.
+    if archiving_tracker.is_running:
+        message = (
+            f"Unable to prepare the session '{session_data.session_name}' for data processing, as it is currently "
+            f"being archived. Abort the archiving process or wait for it to complete before retrying."
+        )
+        console.error(message=message, error=RuntimeError)
+
+    # Resets the preparation tracker, if requested.
+    if reset_tracker:
+        preparation_tracker.abort()
+
+    # Starts the preparation runtime
+    preparation_tracker.start(manager_id=manager_id)
+    try:
+        console.echo(
+            message=f"Preparing session '{session_data.session_name}' for data processing...", level=LogLevel.INFO
+        )
+
+        # If the session uses different roots for 'raw' and 'source' data, copies raw_data folder to the path
+        # specified by the 'source_data'.
+        if session_data.raw_data.root_path != session_data.source_data.root_path:
+            console.echo(
+                message=f"Copying the 'raw_data' directory to the working volume as the 'source_data' directory...",
+                level=LogLevel.INFO,
+            )
+            transfer_directory(
+                source=session_data.raw_data.raw_data_path,
+                destination=session_data.source_data.raw_data_path,
+                num_threads=0,
+                verify_integrity=False,
+                remove_source=False,
+            )
+
+        # If the session contains archived processed data, restores the data to the working root.
+        if (
+            session_data.archived_data.root_path != session_data.processed_data.root_path
+            and archiving_tracker.is_complete
+            and session_data.archived_data.processed_data_path.exists()
+        ):
+            console.echo(
+                message=(
+                    f"Transferring the 'archived_data' directory to the working volume as the 'processed_data' "
+                    f"directory..."
+                ),
+                level=LogLevel.INFO,
+            )
+            transfer_directory(
+                source=session_data.archived_data.processed_data_path,
+                destination=session_data.processed_data.processed_data_path,
+                num_threads=0,
+                verify_integrity=False,
+                remove_source=True,
+            )
+
+        # Preparation is complete
+        preparation_tracker.stop(manager_id=manager_id)
+        archiving_tracker.abort()  # Resets the state of the archiving tracker, as the session is no longer archived.
+        console.echo(
+            message=f"Session '{session_data.session_name}': Prepared for data processing.", level=LogLevel.SUCCESS
+        )
+
+    finally:
+        # If the code reaches this section while the tracker indicates that the processing is still running,
+        # this means that the runtime encountered an error.
+        if preparation_tracker.is_running:
+            preparation_tracker.error(manager_id=manager_id)
+
+        # Updates or generates the manifest file inside the root raw data project directory
+        generate_project_manifest(
+            raw_project_directory=session_data.raw_data.root_path.joinpath(session_data.project_name),
+            processed_data_root=processed_data_root,
+            manager_id=manager_id,
+        )
+
+
+def archive_session(
+    session_path: Path,
+    manager_id: int,
+    reset_tracker: bool = False,
+    processed_data_root: Path | None = None,
+) -> None:
+    """Prepares the target session for long-term (cold) storage.
+
+    This function is primarily designed to be used on remote compute servers that use different data volumes for
+    storage and processing. It should be called for sessions that are no longer frequently processed or accessed to move
+    all session data to the (slow) storage volume and free up the fast processing volume for working with other data.
+    Typically, this function is used exactly once during each session's life cycle: when the session's project is
+    officially concluded.
+
+    Args:
+        session_path: The path to the session directory to be processed.
+        manager_id: The unique identifier of the manager process that manages the runtime.
+        processed_data_root: The path to the root directory used to store the processed data from all Sun lab projects,
+            if different from the 'session_path' root.
+        reset_tracker: Determines whether to reset the tracker file before executing the runtime. This allows
+            recovering from deadlocked runtimes, but otherwise should not be used to ensure runtime safety.
+
+    Notes:
+        This function inverses the result of running the prepare_session() function.
+    """
+    # Resolves the data hierarchy for the processed session
+    session_data = SessionData.load(
+        session_path=session_path,
+        processed_data_root=processed_data_root,
+    )
+
+    # Acquires the exclusive session data access lock.
+    lock = SessionLock(file_path=session_data.tracking_data.session_lock_path)
+    lock.acquire(manager_id=manager_id)
+
+    # Initializes the ProcessingTracker instances for preparation and archiving pipelines.
+    preparation_tracker = ProcessingTracker(
+        file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.PREPARATION)
+    )
+    archiving_tracker = ProcessingTracker(
+        file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.ARCHIVING)
+    )
+
+    # Explicitly prevents colliding with ongoing preparation runtimes.
+    if preparation_tracker.is_running:
+        message = (
+            f"Unable to archive the session '{session_data.session_name}' for long-term storage, as it is currently "
+            f"being prepared for data processing. Abort the preparation process or wait for it to complete before "
+            f"retrying."
+        )
+        console.error(message=message, error=RuntimeError)
+
+    # Resets the archiving tracker, if requested.
+    if reset_tracker:
+        archiving_tracker.abort()
+
+    # Starts the archiving runtime.
+    archiving_tracker.start(manager_id=manager_id)
+    try:
+        console.echo(message=f"Arching session '{session_data.session_name}'...", level=LogLevel.INFO)
+
+        # If the 'processed_data' root is different from the 'archived_data' root, transfers the 'processed_data'
+        # directory to the paths specified by 'archived_data'.
+        if (
+            session_data.processed_data.root_path != session_data.archived_data.root_path
+            and session_data.processed_data.processed_data_path.exists()
+        ):
+            console.echo(
+                message=(
+                    f"Transferring (archiving) the 'processed_data' directory to the storage volume as the "
+                    f"'archived_data' directory..."
+                ),
+                level=LogLevel.INFO,
+            )
+            transfer_directory(
+                source=session_data.processed_data.processed_data_path,
+                destination=session_data.archived_data.processed_data_path,
+                num_threads=0,
+                verify_integrity=False,
+                remove_source=True,
+            )
+
+        # Also ensures that the 'source_data' folder is removed from the working volume.
+        if session_data.raw_data.root_path != session_data.source_data.root_path:
+            console.echo(
+                message=f"Removing the redundant 'source_data' directory from the working volume...",
+                level=LogLevel.INFO,
+            )
+            delete_directory(session_data.source_data.raw_data_path)
+
+        # Archiving is complete
+        archiving_tracker.stop(manager_id=manager_id)
+        preparation_tracker.abort()  # Resets the preparation tracker, as the session is no longer prepared.
+        console.echo(message=f"Session '{session_data.session_name}': Archived.", level=LogLevel.SUCCESS)
+
+    finally:
+        # If the code reaches this section while the tracker indicates that the processing is still running,
+        # this means that the runtime encountered an error.
+        if archiving_tracker.is_running:
+            archiving_tracker.error(manager_id=manager_id)
+
+        # Updates or generates the manifest file inside the root raw data project directory
+        generate_project_manifest(
+            raw_project_directory=session_data.raw_data.root_path.joinpath(session_data.project_name),
+            processed_data_root=processed_data_root,
+            manager_id=manager_id,
+        )
+
+
+def generate_project_manifest(
+    raw_project_directory: Path,
+    manager_id: int,
+    processed_data_root: Path | None = None,
+) -> None:
+    """Builds and saves the project manifest .feather file under the specified output directory.
+
+    This function evaluates the input project directory and builds the 'manifest' file for the project. The file
+    includes the descriptive information about every session stored inside the input project folder and the state of
+    the session's data processing (which processing pipelines have been applied to each session). The file is created
+    under the input raw project directory and uses the following name pattern: ProjectName_manifest.feather.
+
+    Notes:
+        The manifest file is primarily used to capture and move project state information between machines, typically
+        in the context of working with data stored on a remote compute server or cluster.
+
+    Args:
+        raw_project_directory: The path to the root project directory used to store raw session data.
+        manager_id: The unique identifier of the manager process that manages the runtime.
+        processed_data_root: The path to the root directory (volume) used to store processed data for all Sun lab
+            projects if it is different from the parent of the 'raw_project_directory'.
+    """
+
+    if not raw_project_directory.exists():
+        message = (
+            f"Unable to generate the project manifest file for the requested project {raw_project_directory.stem}. "
+            f"The specified project directory does not exist."
+        )
+        console.error(message=message, error=FileNotFoundError)
+
+    # Finds all session directories for the target project
+    session_directories = [directory.parent for directory in raw_project_directory.rglob("raw_data")]
+
+    if len(session_directories) == 0:
+        message = (
+            f"Unable to generate the project manifest file for the requested project {raw_project_directory.stem}. The "
+            f"project does not contain any raw session data. To generate the manifest file, the project must contain "
+            f"the data for at least one session."
+        )
+        console.error(message=message, error=FileNotFoundError)
+
+    # Precreates the 'manifest' dictionary structure
+    manifest: dict[str, list[str | bool | datetime | int]] = {
+        "animal": [],  # Animal IDs.
+        "session": [],  # Session names.
+        "date": [],  # Session names stored as timezone-aware date-time objects in EST.
+        "type": [],  # Type of the session (e.g., mesoscope experiment, run training, etc.).
+        "system": [],  # Acquisition system used to acquire the session (e.g. mesoscope-vr, etc.).
+        "notes": [],  # The experimenter notes about the session.
+        # Determines whether the session data is complete (ran for the intended duration and has all expected data).
+        "complete": [],
+        # Determines whether the session data integrity has been verified upon transfer to a storage machine.
+        "integrity": [],
+        # Determines whether the session's data has been prepared for data processing.
+        "prepared": [],
+        # Determines whether the session has been processed with the single-day s2p pipeline.
+        "suite2p": [],
+        # Determines whether the session has been processed with the behavior extraction pipeline.
+        "behavior": [],
+        # Determines whether the session has been processed with the DeepLabCut pipeline.
+        "video": [],
+        # Determines whether the session's data has been archived for long-term storage.
+        "archived": [],
+    }
+
+    # Resolves the path to the manifest .feather file to be created and the .lock file to ensure only a single process
+    # can be working on the manifest file at the same time.
+    manifest_path = raw_project_directory.joinpath(f"{raw_project_directory.stem}_manifest.feather")
+    manifest_lock = manifest_path.with_suffix(manifest_path.suffix + ".lock")
+
+    # Also instantiates the processing tracker for the manifest file in the same directory. Note, unlike for most other
+    # runtimes, the tracker is NOT used to limit the ability of other processes to run the manifest generation. That
+    # job is handled to the manifest lock file. Instead, the tracker is used to communicate whether the manifest
+    # generation runs as expected or encounters an error.
+    runtime_tracker = ProcessingTracker(file_path=raw_project_directory.joinpath(TrackerFileNames.MANIFEST))
+
+    # Since the exclusivity of the data manifest generation runtime is enforced through the manifest .lock file, this
+    # runtime always resets the processing tracker file.
+    runtime_tracker.abort()
+
+    # Acquires the lock file, ensuring only this specific process can work with the manifest data.
+    lock = FileLock(str(manifest_lock))
+    with lock.acquire(timeout=20.0):
+        # Starts the manifest generation process.
+        runtime_tracker.start(manager_id=manager_id)
+        try:
+            # Loops over each session of every animal in the project and extracts session ID information and
+            # information about which processing steps have been successfully applied to the session.
+            for directory in session_directories:
+                # Skips processing directories without files (sessions with empty raw_data directories)
+                if len([file for file in directory.joinpath("raw_data").glob("*")]) == 0:
+                    continue
+
+                # Instantiates the SessionData instance to resolve the paths to all session's data files and locations.
+                session_data = SessionData.load(
+                    session_path=directory,
+                    processed_data_root=processed_data_root,
+                )
+
+                # Extracts ID and data path information from the SessionData instance
+                manifest["animal"].append(session_data.animal_id)
+                manifest["session"].append(session_data.session_name)
+                manifest["type"].append(session_data.session_type)
+                manifest["system"].append(session_data.acquisition_system)
+
+                # Parses session name into the date-time object to simplify working with date-time data in the future
+                date_time_components = session_data.session_name.split("-")
+                date_time = datetime(
+                    year=int(date_time_components[0]),
+                    month=int(date_time_components[1]),
+                    day=int(date_time_components[2]),
+                    hour=int(date_time_components[3]),
+                    minute=int(date_time_components[4]),
+                    second=int(date_time_components[5]),
+                    microsecond=int(date_time_components[6]),
+                    tzinfo=pytz.UTC,
+                )
+
+                # Converts from UTC to EST / EDT for user convenience
+                eastern = pytz.timezone("America/New_York")
+                date_time = date_time.astimezone(eastern)
+                manifest["date"].append(date_time)
+
+                # Depending on the session type, instantiates the appropriate descriptor instance and uses it to read
+                # the experimenter notes
+                if session_data.session_type == SessionTypes.LICK_TRAINING:
+                    descriptor: LickTrainingDescriptor = LickTrainingDescriptor.from_yaml(  # type: ignore
+                        file_path=session_data.raw_data.session_descriptor_path
+                    )
+                    manifest["notes"].append(descriptor.experimenter_notes)
+                elif session_data.session_type == SessionTypes.RUN_TRAINING:
+                    descriptor: RunTrainingDescriptor = RunTrainingDescriptor.from_yaml(  # type: ignore
+                        file_path=session_data.raw_data.session_descriptor_path
+                    )
+                    manifest["notes"].append(descriptor.experimenter_notes)
+                elif session_data.session_type == SessionTypes.MESOSCOPE_EXPERIMENT:
+                    descriptor: MesoscopeExperimentDescriptor = MesoscopeExperimentDescriptor.from_yaml(  # type: ignore
+                        file_path=session_data.raw_data.session_descriptor_path
+                    )
+                    manifest["notes"].append(descriptor.experimenter_notes)
+                elif session_data.session_type == SessionTypes.WINDOW_CHECKING:
+                    # sl-experiment version 3.0.0 added session descriptors to Window Checking runtimes. Since the file
+                    # does not exist in prior versions, this section is written to statically handle the discrepancy.
+                    try:
+                        descriptor: WindowCheckingDescriptor = WindowCheckingDescriptor.from_yaml(  # type: ignore
+                            file_path=session_data.raw_data.session_descriptor_path
+                        )
+                        manifest["notes"].append(descriptor.experimenter_notes)
+                    except Exception:
+                        manifest["notes"].append("N/A")
+                else:
+                    # Raises an error if an unsupported session type is encountered.
+                    message = (
+                        f"Unsupported session type '{session_data.session_type}' encountered for session "
+                        f"'{directory.stem}' when generating the manifest file for the project "
+                        f"{raw_project_directory.stem}. Currently, only the following session types are supported: "
+                        f"{tuple(SessionTypes)}."
+                    )
+                    console.error(message=message, error=ValueError)
+                    raise ValueError(message)  # Fallback to appease mypy, should not be reachable
+
+                # If the session raw_data folder contains the telomere.bin file, marks the session as complete.
+                manifest["complete"].append(session_data.raw_data.telomere_path.exists())
+
+                # Data integrity verification status
+                tracker = ProcessingTracker(
+                    file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.CHECKSUM)
+                )
+                manifest["integrity"].append(tracker.is_complete)
+
+                # If the session is incomplete or unverified, marks all processing steps as FALSE, as automatic
+                # processing is disabled for incomplete sessions and, therefore, it could not have been processed.
+                if not manifest["complete"][-1] or not manifest["integrity"][-1]:
+                    manifest["suite2p"].append(False)
+                    manifest["behavior"].append(False)
+                    manifest["video"].append(False)
+                    manifest["prepared"].append(False)
+                    manifest["archived"].append(False)
+                    continue  # Cycles to the next session
+
+                # Session data preparation (for processing) status.
+                tracker = ProcessingTracker(
+                    file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.PREPARATION)
+                )
+                manifest["prepared"].append(tracker.is_complete)
+
+                # Suite2p (single-day) processing status.
+                tracker = ProcessingTracker(
+                    file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.SUITE2P)
+                )
+                manifest["suite2p"].append(tracker.is_complete)
+
+                # Behavior data processing status.
+                tracker = ProcessingTracker(
+                    file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.BEHAVIOR)
+                )
+                manifest["behavior"].append(tracker.is_complete)
+
+                # DeepLabCut (video) processing status.
+                tracker = ProcessingTracker(
+                    file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.VIDEO)
+                )
+                manifest["video"].append(tracker.is_complete)
+
+                # Session data archiving status.
+                tracker = ProcessingTracker(
+                    file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.ARCHIVING)
+                )
+                manifest["archived"].append(tracker.is_complete)
+
+            # If all animal IDs are integer-convertible, stores them as numbers to promote proper sorting.
+            # Otherwise, stores them as strings. The latter options are primarily kept for compatibility with Tyche
+            # data.
+            animal_type: type[pl.UInt64] | type[pl.String]
+            if all([str(animal).isdigit() for animal in manifest["animal"]]):
+                # Converts all strings to integers
+                manifest["animal"] = [int(animal) for animal in manifest["animal"]]  # type: ignore
+                animal_type = pl.UInt64  # Uint64 for future proofing
+            else:
+                animal_type = pl.String
+
+            # Converts the manifest dictionary to a Polars Dataframe.
+            schema = {
+                "animal": animal_type,
+                "date": pl.Datetime,
+                "session": pl.String,
+                "type": pl.String,
+                "system": pl.String,
+                "notes": pl.String,
+                "complete": pl.UInt8,
+                "integrity": pl.UInt8,
+                "prepared": pl.UInt8,
+                "suite2p": pl.UInt8,
+                "behavior": pl.UInt8,
+                "video": pl.UInt8,
+                "archived": pl.UInt8,
+            }
+            df = pl.DataFrame(manifest, schema=schema, strict=False)
+
+            # Sorts the DataFrame by animal and then session. Since animal IDs are monotonically increasing according to
+            # Sun lab standards and session 'names' are based on acquisition timestamps, the sort order is
+            # chronological.
+            sorted_df = df.sort(["animal", "session"])
+
+            # Saves the generated manifest to the project-specific manifest .feather file for further processing.
+            sorted_df.write_ipc(file=manifest_path, compression="lz4")
+
+            # The processing is now complete.
+            runtime_tracker.stop(manager_id=manager_id)
+
+        finally:
+            # If the tracker indicates that the processing is still running, the runtime has encountered an error.
+            if runtime_tracker.is_running:
+                tracker.error(manager_id=manager_id)
+
+
 class ProjectManifest:
     """Wraps the contents of a Sun lab project manifest .feather file and exposes methods for visualizing and
     working with the data stored inside the file.
@@ -77,12 +662,14 @@ class ProjectManifest:
             "date",
             "session",
             "type",
+            "system",
             "complete",
             "integrity",
+            "prepared",
             "suite2p",
             "behavior",
             "video",
-            "dataset",
+            "archived",
         ]

         # Retrieves the data
@@ -119,7 +706,7 @@ class ProjectManifest:
         """

         # Pre-selects the columns to display
-        df = self._data.select(["animal", "date", "session", "type", "notes"])
+        df = self._data.select(["animal", "date", "session", "type", "system", "notes"])

         # Optionally filters the data for the target animal
         if animal is not None:
@@ -137,8 +724,8 @@ class ProjectManifest:
             set_tbl_cols=-1,
             set_tbl_hide_column_data_types=True,
             set_tbl_cell_alignment="LEFT",
-            set_tbl_width_chars=
-            set_fmt_str_lengths=
+            set_tbl_width_chars=100,  # Wider columns for notes
+            set_fmt_str_lengths=800,  # Allows very long strings for notes
         ):
             print(df)

@@ -148,14 +735,16 @@ class ProjectManifest:

         This provides a tuple of all animal IDs participating in the target project.
         """
-
+
+        # If animal IDs are stored as integers, converts them to string to support consistent return types.
+        return tuple(
+            [str(animal) for animal in self._data.select("animal").unique().sort("animal").to_series().to_list()]
+        )

     def _get_filtered_sessions(
         self,
         animal: str | int | None = None,
         exclude_incomplete: bool = True,
-        dataset_ready_only: bool = False,
-        not_dataset_ready_only: bool = False,
     ) -> tuple[str, ...]:
         """This worker method is used to get a list of sessions with optional filtering.

@@ -166,11 +755,6 @@ class ProjectManifest:
                 animals.
             exclude_incomplete: Determines whether to exclude sessions not marked as 'complete' from the output
                 list.
-            dataset_ready_only: Determines whether to exclude sessions not marked as 'dataset' integration ready from
-                the output list. Enabling this option only shows sessions that can be integrated into a dataset.
-            not_dataset_ready_only: The opposite of 'dataset_ready_only'. Determines whether to exclude sessions marked
-                as 'dataset' integration ready from the output list. Note, when both this and 'dataset_ready_only' are
-                enabled, the 'dataset_ready_only' option takes precedence.

         Returns:
             The tuple of session IDs matching the filter criteria.
@@ -198,12 +782,6 @@ class ProjectManifest:
         if exclude_incomplete:
             data = data.filter(pl.col("complete") == 1)

-        # Optionally filters sessions based on their readiness for dataset integration.
-        if dataset_ready_only:  # Dataset-ready option always takes precedence
-            data = data.filter(pl.col("dataset") == 1)
-        elif not_dataset_ready_only:
-            data = data.filter(pl.col("dataset") == 0)
-
         # Formats and returns session IDs to the caller
         sessions = data.select("session").sort("session").to_series().to_list()
         return tuple(sessions)
@@ -221,8 +799,6 @@ class ProjectManifest:
         self,
         animal: str | int | None = None,
         exclude_incomplete: bool = True,
-        dataset_ready_only: bool = False,
-        not_dataset_ready_only: bool = False,
     ) -> tuple[str, ...]:
         """Returns requested session IDs based on selected filtering criteria.

@@ -234,11 +810,6 @@ class ProjectManifest:
                 animals.
             exclude_incomplete: Determines whether to exclude sessions not marked as 'complete' from the output
                 list.
-            dataset_ready_only: Determines whether to exclude sessions not marked as 'dataset' integration ready from
-                the output list. Enabling this option only shows sessions that can be integrated into a dataset.
-            not_dataset_ready_only: The opposite of 'dataset_ready_only'. Determines whether to exclude sessions marked
-                as 'dataset' integration ready from the output list. Note, when both this and 'dataset_ready_only' are
-                enabled, the 'dataset_ready_only' option takes precedence.

         Returns:
             The tuple of session IDs matching the filter criteria.
@@ -249,8 +820,6 @@ class ProjectManifest:
         return self._get_filtered_sessions(
             animal=animal,
             exclude_incomplete=exclude_incomplete,
-            dataset_ready_only=dataset_ready_only,
-            not_dataset_ready_only=not_dataset_ready_only,
         )

     def get_session_info(self, session: str) -> pl.DataFrame:
@@ -263,500 +832,10 @@ class ProjectManifest:
|
|
|
263
832
|
session: The ID of the session for which to retrieve the data.
|
|
264
833
|
|
|
265
834
|
Returns:
|
|
266
|
-
A Polars DataFrame with the following columns: 'animal', 'date', 'notes', 'session', 'type', '
|
|
267
|
-
'
|
|
835
|
+
A Polars DataFrame with the following columns: 'animal', 'date', 'notes', 'session', 'type', 'system',
|
|
836
|
+
'complete', 'integrity', 'suite2p', 'behavior', 'video', 'archived'.
|
|
268
837
|
"""
|
|
269
838
|
|
|
270
839
|
df = self._data
|
|
271
840
|
df = df.filter(pl.col("session").eq(session))
|
|
272
841
|
return df
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
def generate_project_manifest(
|
|
276
|
-
raw_project_directory: Path, output_directory: Path, processed_data_root: Path | None = None
|
|
277
|
-
) -> None:
|
|
278
|
-
"""Builds and saves the project manifest .feather file under the specified output directory.
|
|
279
|
-
|
|
280
|
-
This function evaluates the input project directory and builds the 'manifest' file for the project. The file
|
|
281
|
-
includes the descriptive information about every session stored inside the input project folder and the state of
|
|
282
|
-
the session's data processing (which processing pipelines have been applied to each session). The file will be
|
|
283
|
-
created under the 'output_path' directory and use the following name pattern: ProjectName_manifest.feather.
|
|
284
|
-
|
|
285
|
-
Notes:
|
|
286
|
-
The manifest file is primarily used to capture and move project state information between machines, typically
|
|
287
|
-
in the context of working with data stored on a remote compute server or cluster. However, it can also be used
|
|
288
|
-
on a local machine, since an up-to-date manifest file is required to run most data processing pipelines in the
|
|
289
|
-
lab regardless of the runtime context.
|
|
290
|
-
|
|
291
|
-
Args:
|
|
292
|
-
raw_project_directory: The path to the root project directory used to store raw session data.
|
|
293
|
-
output_directory: The path to the directory where to save the generated manifest file.
|
|
294
|
-
processed_data_root: The path to the root directory (volume) used to store processed data for all Sun lab
|
|
295
|
-
projects if it is different from the parent of the 'raw_project_directory'. Typically, this would be the
|
|
296
|
-
case on remote compute server(s) and not on local machines.
|
|
297
|
-
"""
|
|
298
|
-
|
|
299
|
-
if not raw_project_directory.exists():
|
|
300
|
-
message = (
|
|
301
|
-
f"Unable to generate the project manifest file for the requested project {raw_project_directory.stem}. The "
|
|
302
|
-
f"specified project directory does not exist."
|
|
303
|
-
)
|
|
304
|
-
console.error(message=message, error=FileNotFoundError)
|
|
305
|
-
|
|
306
|
-
# Finds all session directories
|
|
307
|
-
session_directories = [directory.parent for directory in raw_project_directory.rglob("raw_data")]
|
|
308
|
-
|
|
309
|
-
if len(session_directories) == 0:
|
|
310
|
-
message = (
|
|
311
|
-
f"Unable to generate the project manifest file for the requested project {raw_project_directory.stem}. The "
|
|
312
|
-
f"project does not contain any raw session data. To generate the manifest file, the project must contain "
|
|
313
|
-
f"at least one valid experiment or training session."
|
|
314
|
-
)
|
|
315
|
-
console.error(message=message, error=FileNotFoundError)
|
|
316
|
-
|
|
317
|
-
# Precreates the 'manifest' dictionary structure
|
|
318
|
-
manifest: dict[str, list[str | bool | datetime | int]] = {
|
|
319
|
-
"animal": [], # Animal IDs.
|
|
320
|
-
"session": [], # Session names.
|
|
321
|
-
"date": [], # Session names stored as timezone-aware date-time objects in EST.
|
|
322
|
-
"type": [], # Type of the session (e.g., Experiment, Training, etc.).
|
|
323
|
-
"notes": [], # The experimenter notes about the session.
|
|
324
|
-
# Determines whether the session data is complete (ran for the intended duration and has all expected data).
|
|
325
|
-
"complete": [],
|
|
326
|
-
# Determines whether the session data integrity has been verified upon transfer to a storage machine.
|
|
327
|
-
"integrity": [],
|
|
328
|
-
"suite2p": [], # Determines whether the session has been processed with the single-day s2p pipeline.
|
|
329
|
-
# Determines whether the session has been processed with the behavior extraction pipeline.
|
|
330
|
-
"behavior": [],
|
|
331
|
-
"video": [], # Determines whether the session has been processed with the DeepLabCut pipeline.
|
|
332
|
-
"dataset": [], # Determines whether the session's data is ready to be integrated into a dataset.
|
|
333
|
-
}
|
|
334
|
-
|
|
335
|
-
# Resolves the path to the manifest .feather file to be created and the .lock file for the generated manifest
|
|
336
|
-
manifest_path = output_directory.joinpath(f"{raw_project_directory.stem}_manifest.feather")
|
|
337
|
-
manifest_lock = manifest_path.with_suffix(manifest_path.suffix + ".lock")
|
|
338
|
-
|
|
339
|
-
# Acquires the lock
|
|
340
|
-
lock = FileLock(str(manifest_lock))
|
|
341
|
-
with lock.acquire(timeout=20.0):
|
|
342
|
-
# Loops over each session of every animal in the project and extracts session ID information and information
|
|
343
|
-
# about which processing steps have been successfully applied to the session.
|
|
344
|
-
for directory in session_directories:
|
|
345
|
-
# Skips processing directories without files (sessions with empty raw-data directories)
|
|
346
|
-
if len([file for file in directory.joinpath("raw_data").glob("*")]) == 0:
|
|
347
|
-
continue
|
|
348
|
-
|
|
349
|
-
# Instantiates the SessionData instance to resolve the paths to all session's data files and locations.
|
|
350
|
-
session_data = SessionData.load(
|
|
351
|
-
session_path=directory,
|
|
352
|
-
processed_data_root=processed_data_root,
|
|
353
|
-
make_processed_data_directory=False,
|
|
354
|
-
)
|
|
355
|
-
|
|
356
|
-
# Fills the manifest dictionary with data for the processed session:
|
|
357
|
-
|
|
358
|
-
# Extracts ID and data path information from the SessionData instance
|
|
359
|
-
manifest["animal"].append(session_data.animal_id)
|
|
360
|
-
manifest["session"].append(session_data.session_name)
|
|
361
|
-
manifest["type"].append(session_data.session_type)
|
|
362
|
-
|
|
363
|
-
# Parses session name into the date-time object to simplify working with date-time data in the future
|
|
364
|
-
date_time_components = session_data.session_name.split("-")
|
|
365
|
-
date_time = datetime(
|
|
366
|
-
year=int(date_time_components[0]),
|
|
367
|
-
month=int(date_time_components[1]),
|
|
368
|
-
day=int(date_time_components[2]),
|
|
369
|
-
hour=int(date_time_components[3]),
|
|
370
|
-
minute=int(date_time_components[4]),
|
|
371
|
-
second=int(date_time_components[5]),
|
|
372
|
-
microsecond=int(date_time_components[6]),
|
|
373
|
-
tzinfo=pytz.UTC,
|
|
374
|
-
)
|
|
375
|
-
|
|
376
|
-
# Converts from UTC to EST / EDT for user convenience
|
|
377
|
-
eastern = pytz.timezone("America/New_York")
|
|
378
|
-
date_time = date_time.astimezone(eastern)
|
|
379
|
-
manifest["date"].append(date_time)
|
|
380
|
-
|
|
381
|
-
# Depending on the session type, instantiates the appropriate descriptor instance and uses it to read the
|
|
382
|
-
# experimenter notes
|
|
383
|
-
if session_data.session_type == SessionTypes.LICK_TRAINING:
|
|
384
|
-
descriptor: LickTrainingDescriptor = LickTrainingDescriptor.from_yaml( # type: ignore
|
|
385
|
-
file_path=session_data.raw_data.session_descriptor_path
|
|
386
|
-
)
|
|
387
|
-
manifest["notes"].append(descriptor.experimenter_notes)
|
|
388
|
-
elif session_data.session_type == SessionTypes.RUN_TRAINING:
|
|
389
|
-
descriptor: RunTrainingDescriptor = RunTrainingDescriptor.from_yaml( # type: ignore
|
|
390
|
-
file_path=session_data.raw_data.session_descriptor_path
|
|
391
|
-
)
|
|
392
|
-
manifest["notes"].append(descriptor.experimenter_notes)
|
|
393
|
-
elif session_data.session_type == SessionTypes.MESOSCOPE_EXPERIMENT:
|
|
394
|
-
descriptor: MesoscopeExperimentDescriptor = MesoscopeExperimentDescriptor.from_yaml( # type: ignore
|
|
395
|
-
file_path=session_data.raw_data.session_descriptor_path
|
|
396
|
-
)
|
|
397
|
-
manifest["notes"].append(descriptor.experimenter_notes)
|
|
398
|
-
elif session_data.session_type == SessionTypes.WINDOW_CHECKING:
|
|
399
|
-
# sl-experiment version 3.0.0 added session descriptors to Window Checking runtimes. Since the file
|
|
400
|
-
# does not exist in prior versions, this section is written to statically handle the discrepancy.
|
|
401
|
-
try:
|
|
402
|
-
descriptor: WindowCheckingDescriptor = WindowCheckingDescriptor.from_yaml( # type: ignore
|
|
403
|
-
file_path=session_data.raw_data.session_descriptor_path
|
|
404
|
-
)
|
|
405
|
-
manifest["notes"].append(descriptor.experimenter_notes)
|
|
406
|
-
except Exception:
|
|
407
|
-
manifest["notes"].append("N/A")
|
|
408
|
-
else:
|
|
409
|
-
manifest["notes"].append("N/A")
|
|
410
|
-
|
|
411
|
-
# If the session raw_data folder contains the telomere.bin file, marks the session as complete.
|
|
412
|
-
manifest["complete"].append(session_data.raw_data.telomere_path.exists())
|
|
413
|
-
|
|
414
|
-
# Data verification status
|
|
415
|
-
tracker = get_processing_tracker(
|
|
416
|
-
root=session_data.raw_data.raw_data_path, file_name=TrackerFileNames.INTEGRITY
|
|
417
|
-
)
|
|
418
|
-
manifest["integrity"].append(tracker.is_complete)
|
|
419
|
-
|
|
420
|
-
# If the session is incomplete or unverified, marks all processing steps as FALSE, as automatic processing
|
|
421
|
-
# is disabled for incomplete sessions. If the session is unverified, the case is even more severe, as its
|
|
422
|
-
# data may be corrupted.
|
|
423
|
-
if not manifest["complete"][-1] or not manifest["integrity"][-1]:
|
|
424
|
-
manifest["suite2p"].append(False)
|
|
425
|
-
manifest["dataset"].append(False)
|
|
426
|
-
manifest["behavior"].append(False)
|
|
427
|
-
manifest["video"].append(False)
|
|
428
|
-
continue # Cycles to the next session
|
|
429
|
-
|
|
430
|
-
# Suite2p (single-day) processing status.
|
|
431
|
-
tracker = get_processing_tracker(
|
|
432
|
-
file_name=TrackerFileNames.SUITE2P, root=session_data.processed_data.processed_data_path
|
|
433
|
-
)
|
|
434
|
-
manifest["suite2p"].append(tracker.is_complete)
|
|
435
|
-
|
|
436
|
-
# Behavior data processing status.
|
|
437
|
-
tracker = get_processing_tracker(
|
|
438
|
-
file_name=TrackerFileNames.BEHAVIOR, root=session_data.processed_data.processed_data_path
|
|
439
|
-
)
|
|
440
|
-
manifest["behavior"].append(tracker.is_complete)
|
|
441
|
-
|
|
442
|
-
# DeepLabCut (video) processing status.
|
|
443
|
-
tracker = get_processing_tracker(
|
|
444
|
-
file_name=TrackerFileNames.VIDEO, root=session_data.processed_data.processed_data_path
|
|
445
|
-
)
|
|
446
|
-
manifest["video"].append(tracker.is_complete)
|
|
447
|
-
|
|
448
|
-
# Tracks whether the session's data is currently in the processing or dataset integration mode.
|
|
449
|
-
manifest["dataset"].append(session_data.processed_data.p53_path.exists())
|
|
450
|
-
|
|
451
|
-
# If all animal IDs are integer-convertible, stores them as numbers to promote proper sorting. Otherwise, stores
|
|
452
|
-
# them as strings. The latter options are primarily kept for compatibility with Tyche data
|
|
453
|
-
animal_type: type[pl.UInt64] | type[pl.String]
|
|
454
|
-
if all([str(animal).isdigit() for animal in manifest["animal"]]):
|
|
455
|
-
# Converts all strings to integers
|
|
456
|
-
manifest["animal"] = [int(animal) for animal in manifest["animal"]] # type: ignore
|
|
457
|
-
animal_type = pl.UInt64 # Uint64 for future proofing
|
|
458
|
-
else:
|
|
459
|
-
animal_type = pl.String
|
|
460
|
-
|
|
461
|
-
# Converts the manifest dictionary to a Polars Dataframe.
|
|
462
|
-
schema = {
|
|
463
|
-
"animal": animal_type,
|
|
464
|
-
"date": pl.Datetime,
|
|
465
|
-
"session": pl.String,
|
|
466
|
-
"type": pl.String,
|
|
467
|
-
"notes": pl.String,
|
|
468
|
-
"complete": pl.UInt8,
|
|
469
|
-
"integrity": pl.UInt8,
|
|
470
|
-
"suite2p": pl.UInt8,
|
|
471
|
-
"dataset": pl.UInt8,
|
|
472
|
-
"behavior": pl.UInt8,
|
|
473
|
-
"video": pl.UInt8,
|
|
474
|
-
}
|
|
475
|
-
df = pl.DataFrame(manifest, schema=schema, strict=False)
|
|
476
|
-
|
|
477
|
-
# Sorts the DataFrame by animal and then session. Since we assign animal IDs sequentially and 'name' sessions
|
|
478
|
-
# based on acquisition timestamps, the sort order is chronological.
|
|
479
|
-
sorted_df = df.sort(["animal", "session"])
|
|
480
|
-
|
|
481
|
-
# Saves the generated manifest to the project-specific manifest .feather file for further processing.
|
|
482
|
-
sorted_df.write_ipc(
|
|
483
|
-
file=output_directory.joinpath(f"{raw_project_directory.stem}_manifest.feather"), compression="lz4"
|
|
484
|
-
)
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
def verify_session_checksum(
|
|
488
|
-
session_path: Path,
|
|
489
|
-
manager_id: int,
|
|
490
|
-
create_processed_data_directory: bool = True,
|
|
491
|
-
processed_data_root: None | Path = None,
|
|
492
|
-
update_manifest: bool = False,
|
|
493
|
-
) -> None:
|
|
494
|
-
"""Verifies the integrity of the session's raw data by generating the checksum of the raw_data directory and
|
|
495
|
-
comparing it against the checksum stored in the ax_checksum.txt file.
|
|
496
|
-
|
|
497
|
-
Primarily, this function is used to verify data integrity after transferring it from a local PC to the remote
|
|
498
|
-
server for long-term storage. This function is designed to create the 'verified.bin' marker file if the checksum
|
|
499
|
-
matches and to remove the 'telomere.bin' and 'verified.bin' marker files if it does not.
|
|
500
|
-
|
|
501
|
-
Notes:
|
|
502
|
-
Removing the telomere.bin marker file from the session's raw_data folder marks the session as incomplete,
|
|
503
|
-
excluding it from all further automatic processing.
|
|
504
|
-
|
|
505
|
-
This function is also used to create the processed data hierarchy on the BioHPC server, when it is called as
|
|
506
|
-
part of the data preprocessing runtime performed by a data acquisition system.
|
|
507
|
-
|
|
508
|
-
Since version 3.1.0, this functon also supports (re) generating the processed session's project manifest file,
|
|
509
|
-
which is used to support further Sun lab data processing pipelines.
|
|
510
|
-
|
|
511
|
-
Args:
|
|
512
|
-
session_path: The path to the session directory to be verified. Note, the input session directory must contain
|
|
513
|
-
the 'raw_data' subdirectory.
|
|
514
|
-
manager_id: The xxHash-64 hash-value that specifies the unique identifier of the manager process that
|
|
515
|
-
manages the integrity verification runtime.
|
|
516
|
-
create_processed_data_directory: Determines whether to create the processed data hierarchy during runtime.
|
|
517
|
-
processed_data_root: The root directory where to store the processed data hierarchy. This path has to point to
|
|
518
|
-
the root directory where to store the processed data from all projects, and it will be automatically
|
|
519
|
-
modified to include the project name, the animal name, and the session ID.
|
|
520
|
-
update_manifest: Determines whether to update (regenerate) the project manifest file for the processed session's
|
|
521
|
-
project. This should always be enabled when working with remote compute server(s) to ensure that the
|
|
522
|
-
project manifest file contains the most actual snapshot of the project's state.
|
|
523
|
-
"""
-
-    # Loads the session data layout. If configured to do so, also creates the processed data hierarchy.
-    session_data = SessionData.load(
-        session_path=session_path,
-        processed_data_root=processed_data_root,
-        make_processed_data_directory=create_processed_data_directory,
-    )
-
-    # Initializes the ProcessingTracker instance for the integrity verification tracker file.
-    tracker = get_processing_tracker(root=session_data.raw_data.raw_data_path, file_name=TrackerFileNames.INTEGRITY)
-    console.echo(f"{tracker.file_path}")
-
-    # Updates the tracker data to communicate that the verification process has started. This automatically clears
-    # the previous 'completed' status.
-    tracker.start(manager_id=manager_id)
-    try:
-        # Re-calculates the checksum for the raw_data directory.
-        calculated_checksum = calculate_directory_checksum(
-            directory=session_data.raw_data.raw_data_path, batch=False, save_checksum=False
-        )
-
-        # Loads the checksum stored inside the ax_checksum.txt file.
-        with open(session_data.raw_data.checksum_path, "r") as f:
-            stored_checksum = f.read().strip()
-
-        # If the two checksums do not match, this likely indicates data corruption.
-        if stored_checksum != calculated_checksum:
-            # If the telomere.bin file exists, removes it. This automatically marks the session as incomplete for
-            # all other Sun lab runtimes.
-            session_data.raw_data.telomere_path.unlink(missing_ok=True)
-
-        else:
-            # Sets the tracker to indicate that the verification runtime completed successfully.
-            tracker.stop(manager_id=manager_id)
-
-    finally:
-        # If the code reaches this section while the tracker indicates that the processing is still running, the
-        # verification runtime encountered an error. Configures the tracker to indicate that this runtime finished
-        # with an error to prevent deadlocking the runtime.
-        if tracker.is_running:
-            tracker.error(manager_id=manager_id)
-
-        # If the runtime is configured to generate the project manifest file, attempts to generate and overwrite
-        # the existing manifest file for the target project.
-        if update_manifest:
-            # All sessions are stored under root/project/animal/session. Therefore, the grandparent of the session
-            # directory is the raw project directory.
-            raw_directory = session_path.parents[1]
-
-            # Generates the manifest file inside the root raw data project directory.
-            generate_project_manifest(
-                raw_project_directory=raw_directory,
-                processed_data_root=processed_data_root,
-                output_directory=raw_directory,
-            )
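The deleted implementation pairs the runtime with a ProcessingTracker so that concurrent managers can observe whether verification is running, completed, or failed. A minimal usage sketch for the function above; the session path, processed-data root, and manager ID are all illustrative placeholders, not values from the package:

from pathlib import Path

# Hypothetical session stored under the standard root/project/animal/session hierarchy.
session = Path("/server/raw/example_project/A001/2024-01-01-12-00-00-000000")

# The manager ID is normally an xxHash-64 value identifying the managing process; a fixed
# integer is used here purely for illustration.
verify_session_checksum(
    session_path=session,
    manager_id=12345,
    create_processed_data_directory=True,
    processed_data_root=Path("/server/processed"),
    update_manifest=True,
)

Note that on a checksum mismatch the function removes telomere.bin and records an error state in the integrity tracker rather than raising, so callers should inspect the tracker or the marker files instead of relying on an exception.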
-def resolve_p53_marker(
-    session_path: Path,
-    create_processed_data_directory: bool = True,
-    processed_data_root: None | Path = None,
-    remove: bool = False,
-    update_manifest: bool = False,
-) -> None:
-    """Depending on configuration, either creates or removes the p53.bin marker file for the target session.
-
-    The marker file statically determines whether the session can be targeted by data processing or dataset
-    formation pipelines.
-
-    Notes:
-        Since dataset integration relies on data processing outputs, it is essential to prevent processing pipelines
-        from altering the data while it is integrated into a dataset. The p53.bin marker solves this issue by
-        ensuring that only one type of runtime (processing or dataset integration) is allowed to work with the
-        session at a time.
-
-        For the p53.bin marker to be created, the session must not be undergoing processing. For the p53 marker to
-        be removed, the session must not be undergoing dataset integration.
-
-        Since version 3.1.0, this function also supports (re)generating the processed session's project manifest
-        file, which is used to support further Sun lab data processing pipelines.
-
-    Args:
-        session_path: The path to the session directory for which the p53.bin marker needs to be resolved. Note, the
-            input session directory must contain the 'raw_data' subdirectory.
-        create_processed_data_directory: Determines whether to create the processed data hierarchy during runtime.
-        processed_data_root: The root directory under which to store the processed data hierarchy. This path must
-            point to the root directory that stores the processed data of all projects; it is automatically extended
-            with the project name, the animal name, and the session ID.
-        remove: Determines whether this function is called to create or remove the p53.bin marker.
-        update_manifest: Determines whether to update (regenerate) the project manifest file for the processed
-            session's project. This should always be enabled when working with remote compute server(s) to ensure
-            that the project manifest file contains the most up-to-date snapshot of the project's state.
-    """
-
-    # Loads the session data layout. If configured to do so, also creates the processed data hierarchy.
-    session_data = SessionData.load(
-        session_path=session_path,
-        processed_data_root=processed_data_root,
-        make_processed_data_directory=create_processed_data_directory,
-    )
-
-    # If the p53.bin marker exists and the runtime is configured to remove it, attempts to remove the marker file.
-    if session_data.processed_data.p53_path.exists() and remove:
-        # This section deals with a unique nuance of the Sun lab processing server organization. Specifically, user
-        # accounts are not allowed to modify or create files in the data directories owned by the service accounts.
-        # In turn, this prevents user accounts from modifying the processed data directory to indicate when they
-        # are running a dataset integration pipeline on the processed data. To work around this problem, the
-        # dataset integration pipeline now creates a 'semaphore' marker for each session that is currently being
-        # integrated into a dataset. This semaphore marker is stored under the root user working directory, inside
-        # the subdirectory called 'semaphore'.
-
-        # The parent of the shared Sun lab processed data directory is the root 'working' volume. All user
-        # directories are stored under this root working directory.
-        if processed_data_root is None:
-            # If the processed data root is not provided, sets it to the great-grandparent of the session directory.
-            # This works assuming that the data is stored under: root/project/animal/session.
-            processed_data_root = session_path.parents[2]
-        working_root = processed_data_root.parent
-
-        # Loops over each user directory and checks whether a semaphore marker exists for the processed session.
-        for directory in working_root.iterdir():
-            if any(directory.joinpath("semaphore").glob(f"*{session_data.session_name}.bin")):
-                # Aborts with an error if the semaphore marker prevents the p53 marker from being removed.
-                message = (
-                    f"Unable to remove the dataset marker for the session '{session_data.session_name}' acquired "
-                    f"for the animal '{session_data.animal_id}' under the '{session_data.project_name}' project. "
-                    f"The session data is currently being integrated into a dataset by the owner of the "
-                    f"'{directory.stem}' user directory. Wait until the ongoing dataset integration is complete "
-                    f"and repeat the command that produced this error."
-                )
-                console.error(message=message, error=RuntimeError)
-
-        # If the session does not have a corresponding semaphore marker in any user directory, removes the p53
-        # marker file.
-        session_data.processed_data.p53_path.unlink()
-        message = (
-            f"Dataset marker for the session '{session_data.session_name}' acquired for the animal "
-            f"'{session_data.animal_id}' under the '{session_data.project_name}' project: Removed."
-        )
-        console.echo(message=message, level=LogLevel.SUCCESS)
-        return  # Ends remove runtime
-
-    # If the marker does not exist and the function is called in 'remove' mode, aborts the runtime early.
-    elif not session_data.processed_data.p53_path.exists() and remove:
-        message = (
-            f"Dataset marker for the session '{session_data.session_name}' acquired for the animal "
-            f"'{session_data.animal_id}' under the '{session_data.project_name}' project: Does not exist. No "
-            f"actions taken."
-        )
-        console.echo(message=message, level=LogLevel.SUCCESS)
-        return  # Ends remove runtime
-
-    # If the marker already exists and the function is called in 'create' mode, aborts the runtime early.
-    elif session_data.processed_data.p53_path.exists():
-        message = (
-            f"Dataset marker for the session '{session_data.session_name}' acquired for the animal "
-            f"'{session_data.animal_id}' under the '{session_data.project_name}' project: Already exists. No "
-            f"actions taken."
-        )
-        console.echo(message=message, level=LogLevel.SUCCESS)
-        return  # Ends create runtime
-
-    # The rest of the runtime deals with determining whether it is safe to create the marker file.
-    # Queries the type of the processed session.
-    session_type = session_data.session_type
-
-    # Window checking sessions are not designed to be integrated into datasets, so they cannot be marked with the
-    # p53.bin file. Similarly, any incomplete session is automatically excluded from dataset formation.
-    if session_type == SessionTypes.WINDOW_CHECKING or not session_data.raw_data.telomere_path.exists():
-        message = (
-            f"Unable to generate the dataset marker for the session '{session_data.session_name}' acquired for "
-            f"the animal '{session_data.animal_id}' under the '{session_data.project_name}' project, as the "
-            f"session is incomplete or is of the Window Checking type. These sessions must be manually evaluated "
-            f"and marked for dataset inclusion by the experimenter."
-        )
-        console.error(message=message, error=RuntimeError)
-
-    # Training sessions collect similar data and share processing pipeline requirements.
-    error: bool = False
-    if session_type == SessionTypes.LICK_TRAINING or session_type == SessionTypes.RUN_TRAINING:
-        # Ensures that the session is not being processed with one of the supported pipelines.
-        behavior_tracker = get_processing_tracker(
-            file_name=TrackerFileNames.BEHAVIOR, root=session_data.processed_data.processed_data_path
-        )
-        video_tracker = get_processing_tracker(
-            file_name=TrackerFileNames.VIDEO, root=session_data.processed_data.processed_data_path
-        )
-        if behavior_tracker.is_running or video_tracker.is_running:
-            # Note, training runtimes do not require suite2p processing.
-            error = True
-
-    # Mesoscope experiment sessions require additional processing with suite2p.
-    elif session_type == SessionTypes.MESOSCOPE_EXPERIMENT:
-        behavior_tracker = get_processing_tracker(
-            file_name=TrackerFileNames.BEHAVIOR, root=session_data.processed_data.processed_data_path
-        )
-        suite2p_tracker = get_processing_tracker(
-            file_name=TrackerFileNames.SUITE2P, root=session_data.processed_data.processed_data_path
-        )
-        video_tracker = get_processing_tracker(
-            file_name=TrackerFileNames.VIDEO, root=session_data.processed_data.processed_data_path
-        )
-        console.echo(f"{behavior_tracker.is_running}")
-        if behavior_tracker.is_running or video_tracker.is_running or suite2p_tracker.is_running:
-            error = True
-
-    # If the session is currently being processed by one or more pipelines, aborts with an error.
-    if error:
-        message = (
-            f"Unable to generate the dataset marker for the session '{session_data.session_name}' acquired for "
-            f"the animal '{session_data.animal_id}' under the '{session_data.project_name}' project, as it is "
-            f"currently being processed by one of the data processing pipelines. Wait until the session is fully "
-            f"processed by all pipelines and repeat the command that encountered this error."
-        )
-        console.error(message=message, error=RuntimeError)
-
-    # If the runtime reached this point, the session is eligible for dataset integration. Creates the p53.bin
-    # marker file, preventing the session from being processed again as long as the marker exists.
-    session_data.processed_data.p53_path.touch()
-    message = (
-        f"Dataset marker for the session '{session_data.session_name}' acquired for the animal "
-        f"'{session_data.animal_id}' under the '{session_data.project_name}' project: Created."
-    )
-    console.echo(message=message, level=LogLevel.SUCCESS)
-
-    # If the runtime is configured to generate the project manifest file, attempts to generate and overwrite the
-    # existing manifest file for the target project.
-    if update_manifest:
-        # All sessions are stored under root/project/animal/session. Therefore, the grandparent of the session
-        # directory is the raw project directory.
-        raw_directory = session_path.parents[1]
-
-        # Generates the manifest file inside the root raw data project directory.
-        generate_project_manifest(
-            raw_project_directory=raw_directory,
-            processed_data_root=processed_data_root,
-            output_directory=raw_directory,
-        )
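Taken together, the two deleted helpers gate each session between processing and dataset integration. A minimal sketch of the full marker workflow, reusing the illustrative paths from the previous example (none of these values come from the package):

from pathlib import Path

session = Path("/server/raw/example_project/A001/2024-01-01-12-00-00-000000")
processed_root = Path("/server/processed")

# Marks the session for dataset integration. This raises a RuntimeError if the session is
# incomplete, is a Window Checking session, or is still being processed by the behavior,
# video, or suite2p pipelines.
resolve_p53_marker(
    session_path=session,
    processed_data_root=processed_root,
    remove=False,
    update_manifest=True,
)

# Once no user directory holds a 'semaphore' marker for the session (dataset integration
# has finished), the marker can be removed to re-enable processing runtimes.
resolve_p53_marker(
    session_path=session,
    processed_data_root=processed_root,
    remove=True,
    update_manifest=True,
)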