sl-shared-assets 4.0.1__py3-none-any.whl → 5.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of sl-shared-assets has been flagged as potentially problematic.

Files changed (39)
  1. sl_shared_assets/__init__.py +45 -42
  2. sl_shared_assets/command_line_interfaces/__init__.py +3 -0
  3. sl_shared_assets/command_line_interfaces/configure.py +173 -0
  4. sl_shared_assets/command_line_interfaces/manage.py +226 -0
  5. sl_shared_assets/data_classes/__init__.py +33 -32
  6. sl_shared_assets/data_classes/configuration_data.py +267 -79
  7. sl_shared_assets/data_classes/session_data.py +226 -289
  8. sl_shared_assets/server/__init__.py +24 -4
  9. sl_shared_assets/server/job.py +6 -7
  10. sl_shared_assets/server/pipeline.py +570 -0
  11. sl_shared_assets/server/server.py +57 -25
  12. sl_shared_assets/tools/__init__.py +9 -8
  13. sl_shared_assets/tools/packaging_tools.py +14 -25
  14. sl_shared_assets/tools/project_management_tools.py +602 -523
  15. sl_shared_assets/tools/transfer_tools.py +88 -23
  16. {sl_shared_assets-4.0.1.dist-info → sl_shared_assets-5.0.0.dist-info}/METADATA +46 -203
  17. sl_shared_assets-5.0.0.dist-info/RECORD +23 -0
  18. sl_shared_assets-5.0.0.dist-info/entry_points.txt +3 -0
  19. sl_shared_assets/__init__.pyi +0 -91
  20. sl_shared_assets/cli.py +0 -501
  21. sl_shared_assets/cli.pyi +0 -106
  22. sl_shared_assets/data_classes/__init__.pyi +0 -75
  23. sl_shared_assets/data_classes/configuration_data.pyi +0 -235
  24. sl_shared_assets/data_classes/runtime_data.pyi +0 -157
  25. sl_shared_assets/data_classes/session_data.pyi +0 -379
  26. sl_shared_assets/data_classes/surgery_data.pyi +0 -89
  27. sl_shared_assets/server/__init__.pyi +0 -11
  28. sl_shared_assets/server/job.pyi +0 -205
  29. sl_shared_assets/server/server.pyi +0 -298
  30. sl_shared_assets/tools/__init__.pyi +0 -19
  31. sl_shared_assets/tools/ascension_tools.py +0 -265
  32. sl_shared_assets/tools/ascension_tools.pyi +0 -68
  33. sl_shared_assets/tools/packaging_tools.pyi +0 -58
  34. sl_shared_assets/tools/project_management_tools.pyi +0 -239
  35. sl_shared_assets/tools/transfer_tools.pyi +0 -53
  36. sl_shared_assets-4.0.1.dist-info/RECORD +0 -36
  37. sl_shared_assets-4.0.1.dist-info/entry_points.txt +0 -7
  38. {sl_shared_assets-4.0.1.dist-info → sl_shared_assets-5.0.0.dist-info}/WHEEL +0 -0
  39. {sl_shared_assets-4.0.1.dist-info → sl_shared_assets-5.0.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,6 @@
  """This module provides tools for managing the data of any Sun lab project. Tools from this module extend the
- functionality of SessionData class via a convenient API that allows working with the data of multiple sessions making
- up a given project."""
+ functionality of the SessionData class via a convenient API to automate routine tasks that primarily support data
+ processing pipelines."""

  from pathlib import Path
  from datetime import datetime
@@ -10,19 +10,604 @@ import polars as pl
  from filelock import FileLock
  from ataraxis_base_utilities import LogLevel, console

+ from ..server import TrackerFileNames, ProcessingTracker
  from ..data_classes import (
  SessionData,
+ SessionLock,
  SessionTypes,
- TrackerFileNames,
  RunTrainingDescriptor,
  LickTrainingDescriptor,
  WindowCheckingDescriptor,
  MesoscopeExperimentDescriptor,
- get_processing_tracker,
  )
+ from .transfer_tools import delete_directory, transfer_directory
  from .packaging_tools import calculate_directory_checksum


+ def resolve_checksum(
28
+ session_path: Path,
29
+ manager_id: int,
30
+ processed_data_root: None | Path = None,
31
+ reset_tracker: bool = False,
32
+ regenerate_checksum: bool = False,
33
+ ) -> None:
34
+ """Verifies the integrity of the session's data by generating the checksum of the raw_data directory and comparing
35
+ it against the checksum stored in the ax_checksum.txt file.
36
+
37
+ Primarily, this function is used to verify data integrity after transferring it from the data acquisition system PC
38
+ to the remote server for long-term storage.
39
+
40
+ Notes:
41
+ Any session that does not successfully pass checksum verification (or recreation) is automatically excluded
42
+ from all further automatic processing steps.
43
+
44
+ Since version 5.0.0, this function also supports recalculating and overwriting the checksum stored inside the
45
+ ax_checksum.txt file. This allows this function to re-checksum session data, which is helpful if the
46
+ experimenter deliberately alters the session's data post-acquisition (for example, to comply with new data
47
+ storage guidelines).
48
+
49
+ Args:
50
+ session_path: The path to the session directory to be processed.
51
+ manager_id: The unique identifier of the manager process that manages the runtime.
52
+ processed_data_root: The path to the root directory used to store the processed data from all Sun lab projects,
53
+ if different from the 'session_path' root.
54
+ reset_tracker: Determines whether to reset the tracker file before executing the runtime. This allows
55
+ recovering from deadlocked runtimes, but otherwise should not be used to ensure runtime safety.
56
+ regenerate_checksum: Determines whether to update the checksum stored in the ax_checksum.txt file before
57
+ carrying out the verification. In this case, the verification necessarily succeeds and the session's
58
+ reference checksum is changed to reflect the current state of the session data.
59
+ """
60
+
61
+ # Loads session data layout. If configured to do so, also creates the processed data hierarchy
62
+ session_data = SessionData.load(
63
+ session_path=session_path,
64
+ processed_data_root=processed_data_root,
65
+ )
66
+
67
+ # Acquires the exclusive session data access lock.
68
+ lock = SessionLock(file_path=session_data.tracking_data.session_lock_path)
69
+ lock.acquire(manager_id=manager_id)
70
+
71
+ # Initializes the ProcessingTracker instance
72
+ tracker = ProcessingTracker(
73
+ file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.CHECKSUM)
74
+ )
75
+
76
+ # If requested, reset the tracker to the default state before starting the checksum resolution process.
77
+ if reset_tracker:
78
+ tracker.abort()
79
+
80
+ # Updates the tracker data to communicate that the process has started. This automatically clears the previous
81
+ # processing status stored in the file.
82
+ tracker.start(manager_id=manager_id)
83
+ try:
84
+ console.echo(
85
+ message=f"Resolving the data integrity checksum for session '{session_data.session_name}'...",
86
+ level=LogLevel.INFO,
87
+ )
88
+
89
+ # Regenerates the checksum for the raw_data directory. Note, if the 'regenerate_checksum' flag is True, this
90
+ # guarantees that the check below succeeds as the function replaces the checksum in the ax_checksum.txt file
91
+ # with the newly calculated value.
92
+ calculated_checksum = calculate_directory_checksum(
93
+ directory=session_data.raw_data.raw_data_path, batch=False, save_checksum=regenerate_checksum
94
+ )
95
+
96
+ # Loads the checksum stored inside the ax_checksum.txt file
97
+ with session_data.raw_data.checksum_path.open() as f:
98
+ stored_checksum = f.read().strip()
99
+
100
+ # If the two checksums do not match, this likely indicates data corruption.
101
+ if stored_checksum != calculated_checksum:
102
+ tracker.error(manager_id=manager_id)
103
+ console.echo(
104
+ message=f"Session '{session_data.session_name}' raw data integrity: Compromised.", level=LogLevel.ERROR
105
+ )
106
+
107
+ else:
108
+ # Sets the tracker to indicate that the runtime completed successfully.
109
+ tracker.stop(manager_id=manager_id)
110
+ console.echo(
111
+ message=f"Session '{session_data.session_name}' raw data integrity: Verified.", level=LogLevel.SUCCESS
112
+ )
113
+
114
+ finally:
115
+ # If the code reaches this section while the tracker indicates that the processing is still running,
116
+ # this means that the runtime encountered an error.
117
+ if tracker.is_running:
118
+ tracker.error(manager_id=manager_id)
119
+
120
+ # Updates or generates the manifest file inside the root raw data project directory
121
+ generate_project_manifest(
122
+ raw_project_directory=session_data.raw_data.root_path.joinpath(session_data.project_name),
123
+ processed_data_root=processed_data_root,
124
+ manager_id=manager_id,
125
+ )
126
+
127
+
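Editor's note, for orientation only: the compare step performed by resolve_checksum() can be sketched as below. The hash_directory() helper and the sha256 algorithm are hypothetical stand-ins for packaging_tools.calculate_directory_checksum(), which also handles batching and can rewrite the reference ax_checksum.txt file itself.

    from pathlib import Path
    import hashlib

    def hash_directory(directory: Path) -> str:
        """Illustrative stand-in for calculate_directory_checksum(): hashes file paths and contents in a stable order."""
        digest = hashlib.sha256()
        for file in sorted(directory.rglob("*")):
            if file.is_file():
                digest.update(file.relative_to(directory).as_posix().encode())
                digest.update(file.read_bytes())
        return digest.hexdigest()

    def verify_against_reference(raw_data: Path, reference_file: Path) -> bool:
        """Compares a freshly computed checksum against the value stored in a reference file such as ax_checksum.txt."""
        calculated = hash_directory(raw_data)
        stored = reference_file.read_text().strip()
        return calculated == stored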
128
+ def prepare_session(
129
+ session_path: Path,
130
+ manager_id: int,
131
+ processed_data_root: Path | None,
132
+ reset_tracker: bool = False,
133
+ ) -> None:
134
+ """Prepares the target session for data processing and dataset integration.
135
+
136
+ This function is primarily designed to be used on remote compute servers that use different data volumes for
137
+ storage and processing. Since storage volumes are often slow, the session data needs to be copied to the fast
138
+ volume before executing processing pipelines. Typically, this function is used exactly once during each session's
139
+ life cycle: when it is first transferred to the remote compute server.
140
+
141
+ Args:
142
+ session_path: The path to the session directory to be processed.
143
+ manager_id: The unique identifier of the manager process that manages the runtime.
144
+ processed_data_root: The path to the root directory used to store the processed data from all Sun lab projects,
145
+ if different from the 'session_path' root.
146
+ reset_tracker: Determines whether to reset the tracker file before executing the runtime. This allows
147
+ recovering from deadlocked runtimes, but otherwise should not be used to ensure runtime safety.
148
+
149
+ Notes:
150
+ This function reverses the result of running the archive_session() function.
151
+ """
152
+ # Resolves the data hierarchy for the processed session
153
+ session_data = SessionData.load(
154
+ session_path=session_path,
155
+ processed_data_root=processed_data_root,
156
+ )
157
+
158
+ # Acquires the exclusive session data access lock.
159
+ lock = SessionLock(file_path=session_data.tracking_data.session_lock_path)
160
+ lock.acquire(manager_id=manager_id)
161
+
162
+ # Initializes the ProcessingTracker instances for preparation and archiving pipelines.
163
+ preparation_tracker = ProcessingTracker(
164
+ file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.PREPARATION)
165
+ )
166
+ archiving_tracker = ProcessingTracker(
167
+ file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.ARCHIVING)
168
+ )
169
+
170
+ # Explicitly prevents colliding with ongoing archiving runtimes.
171
+ if archiving_tracker.is_running:
172
+ message = (
173
+ f"Unable to prepare the session '{session_data.session_name}' for data processing, as it is currently "
174
+ f"being archived. Abort the archiving process or wait for it to complete before retrying."
175
+ )
176
+ console.error(message=message, error=RuntimeError)
177
+
178
+ # Resets the preparation tracker, if requested.
179
+ if reset_tracker:
180
+ preparation_tracker.abort()
181
+
182
+ # Starts the preparation runtime
183
+ preparation_tracker.start(manager_id=manager_id)
184
+ try:
185
+ console.echo(
186
+ message=f"Preparing session '{session_data.session_name}' for data processing...", level=LogLevel.INFO
187
+ )
188
+
189
+ # If the session uses different roots for 'raw' and 'source' data, copies raw_data folder to the path
190
+ # specified by the 'source_data'.
191
+ if session_data.raw_data.root_path != session_data.source_data.root_path:
192
+ console.echo(
193
+ message=f"Copying the 'raw_data' directory to the working volume as the 'source_data' directory...",
194
+ level=LogLevel.INFO,
195
+ )
196
+ transfer_directory(
197
+ source=session_data.raw_data.raw_data_path,
198
+ destination=session_data.source_data.raw_data_path,
199
+ num_threads=0,
200
+ verify_integrity=False,
201
+ remove_source=False,
202
+ )
203
+
204
+ # If the session contains archived processed data, restores the data to the working root.
205
+ if (
206
+ session_data.archived_data.root_path != session_data.processed_data.root_path
207
+ and archiving_tracker.is_complete
208
+ and session_data.archived_data.processed_data_path.exists()
209
+ ):
210
+ console.echo(
211
+ message=(
212
+ f"Transferring the 'archived_data' directory to the working volume as the 'processed_data' "
213
+ f"directory..."
214
+ ),
215
+ level=LogLevel.INFO,
216
+ )
217
+ transfer_directory(
218
+ source=session_data.archived_data.processed_data_path,
219
+ destination=session_data.processed_data.processed_data_path,
220
+ num_threads=0,
221
+ verify_integrity=False,
222
+ remove_source=True,
223
+ )
224
+
225
+ # Preparation is complete
226
+ preparation_tracker.stop(manager_id=manager_id)
227
+ archiving_tracker.abort() # Resets the state of the archiving tracker, as the session is no longer archived.
228
+ console.echo(
229
+ message=f"Session '{session_data.session_name}': Prepared for data processing.", level=LogLevel.SUCCESS
230
+ )
231
+
232
+ finally:
233
+ # If the code reaches this section while the tracker indicates that the processing is still running,
234
+ # this means that the runtime encountered an error.
235
+ if preparation_tracker.is_running:
236
+ preparation_tracker.error(manager_id=manager_id)
237
+
238
+ # Updates or generates the manifest file inside the root raw data project directory
239
+ generate_project_manifest(
240
+ raw_project_directory=session_data.raw_data.root_path.joinpath(session_data.project_name),
241
+ processed_data_root=processed_data_root,
242
+ manager_id=manager_id,
243
+ )
244
+
245
+
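Editor's note: resolve_checksum(), prepare_session(), and archive_session() share one locking and tracking skeleton. The sketch below condenses it, using only calls that appear in this diff (SessionData.load, SessionLock.acquire, ProcessingTracker.start/stop/error); the actual pipeline work and the tracker file name are placeholders, and the real runtimes additionally check for colliding trackers and regenerate the project manifest.

    from pathlib import Path

    from sl_shared_assets.data_classes import SessionData, SessionLock
    from sl_shared_assets.server import ProcessingTracker

    def run_tracked_step(session_path: Path, manager_id: int, tracker_file_name: str) -> None:
        """Minimal sketch of the shared locking / tracking pattern."""
        session_data = SessionData.load(session_path=session_path, processed_data_root=None)

        # One exclusive lock per session prevents concurrent manager processes from touching the same data.
        lock = SessionLock(file_path=session_data.tracking_data.session_lock_path)
        lock.acquire(manager_id=manager_id)

        tracker = ProcessingTracker(
            file_path=session_data.tracking_data.tracking_data_path.joinpath(tracker_file_name)
        )
        tracker.start(manager_id=manager_id)  # Marks the step as running and clears the previous status.
        try:
            ...  # The actual pipeline step (checksum resolution, transfer, etc.) goes here.
            tracker.stop(manager_id=manager_id)  # Marks the step as successfully completed.
        finally:
            # Reaching this point with the tracker still marked as running means the step raised an error.
            if tracker.is_running:
                tracker.error(manager_id=manager_id)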
246
+ def archive_session(
247
+ session_path: Path,
248
+ manager_id: int,
249
+ reset_tracker: bool = False,
250
+ processed_data_root: Path | None = None,
251
+ ) -> None:
252
+ """Prepares the target session for long-term (cold) storage.
253
+
254
+ This function is primarily designed to be used on remote compute servers that use different data volumes for
255
+ storage and processing. It should be called for sessions that are no longer frequently processed or accessed to move
256
+ all session data to the (slow) storage volume and free up the fast processing volume for working with other data.
257
+ Typically, this function is used exactly once during each session's life cycle: when the session's project is
258
+ officially concluded.
259
+
260
+ Args:
261
+ session_path: The path to the session directory to be processed.
262
+ manager_id: The unique identifier of the manager process that manages the runtime.
263
+ processed_data_root: The path to the root directory used to store the processed data from all Sun lab projects,
264
+ if different from the 'session_path' root.
265
+ reset_tracker: Determines whether to reset the tracker file before executing the runtime. This allows
266
+ recovering from deadlocked runtimes, but otherwise should not be used to ensure runtime safety.
267
+
268
+ Notes:
269
+ This function reverses the result of running the prepare_session() function.
270
+ """
271
+ # Resolves the data hierarchy for the processed session
272
+ session_data = SessionData.load(
273
+ session_path=session_path,
274
+ processed_data_root=processed_data_root,
275
+ )
276
+
277
+ # Acquires the exclusive session data access lock.
278
+ lock = SessionLock(file_path=session_data.tracking_data.session_lock_path)
279
+ lock.acquire(manager_id=manager_id)
280
+
281
+ # Initializes the ProcessingTracker instances for preparation and archiving pipelines.
282
+ preparation_tracker = ProcessingTracker(
283
+ file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.PREPARATION)
284
+ )
285
+ archiving_tracker = ProcessingTracker(
286
+ file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.ARCHIVING)
287
+ )
288
+
289
+ # Explicitly prevents colliding with ongoing preparation runtimes.
290
+ if preparation_tracker.is_running:
291
+ message = (
292
+ f"Unable to archive the session '{session_data.session_name}' for long-term storage, as it is currently "
293
+ f"being prepared for data processing. Abort the preparation process or wait for it to complete before "
294
+ f"retrying."
295
+ )
296
+ console.error(message=message, error=RuntimeError)
297
+
298
+ # Resets the archiving tracker, if requested.
299
+ if reset_tracker:
300
+ archiving_tracker.abort()
301
+
302
+ # Starts the archiving runtime.
303
+ archiving_tracker.start(manager_id=manager_id)
304
+ try:
305
+ console.echo(message=f"Archiving session '{session_data.session_name}'...", level=LogLevel.INFO)
306
+
307
+ # If the 'processed_data' root is different from the 'archived_data' root, transfers the 'processed_data'
308
+ # directory to the paths specified by 'archived_data'.
309
+ if (
310
+ session_data.processed_data.root_path != session_data.archived_data.root_path
311
+ and session_data.processed_data.processed_data_path.exists()
312
+ ):
313
+ console.echo(
314
+ message=(
315
+ f"Transferring (archiving) the 'processed_data' directory to the storage volume as the "
316
+ f"'archived_data' directory..."
317
+ ),
318
+ level=LogLevel.INFO,
319
+ )
320
+ transfer_directory(
321
+ source=session_data.processed_data.processed_data_path,
322
+ destination=session_data.archived_data.processed_data_path,
323
+ num_threads=0,
324
+ verify_integrity=False,
325
+ remove_source=True,
326
+ )
327
+
328
+ # Also ensures that the 'source_data' folder is removed from the working volume.
329
+ if session_data.raw_data.root_path != session_data.source_data.root_path:
330
+ console.echo(
331
+ message=f"Removing the redundant 'source_data' directory from the working volume...",
332
+ level=LogLevel.INFO,
333
+ )
334
+ delete_directory(session_data.source_data.raw_data_path)
335
+
336
+ # Archiving is complete
337
+ archiving_tracker.stop(manager_id=manager_id)
338
+ preparation_tracker.abort() # Resets the preparation tracker, as the session is no longer prepared.
339
+ console.echo(message=f"Session '{session_data.session_name}': Archived.", level=LogLevel.SUCCESS)
340
+
341
+ finally:
342
+ # If the code reaches this section while the tracker indicates that the processing is still running,
343
+ # this means that the runtime encountered an error.
344
+ if archiving_tracker.is_running:
345
+ archiving_tracker.error(manager_id=manager_id)
346
+
347
+ # Updates or generates the manifest file inside the root raw data project directory
348
+ generate_project_manifest(
349
+ raw_project_directory=session_data.raw_data.root_path.joinpath(session_data.project_name),
350
+ processed_data_root=processed_data_root,
351
+ manager_id=manager_id,
352
+ )
353
+
354
+
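Editor's note, illustration only: at the filesystem level, the transfer_directory(..., remove_source=True) calls used by prepare_session() and archive_session() amount to a copy onto the destination volume followed by a delete on the source volume. A standard-library sketch (the real helper in transfer_tools adds multithreading and optional integrity verification):

    import shutil
    from pathlib import Path

    def move_directory(source: Path, destination: Path) -> None:
        """Copies a directory tree onto the destination volume, then frees the space on the source volume."""
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copytree(source, destination, dirs_exist_ok=True)
        shutil.rmtree(source)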
355
+ def generate_project_manifest(
356
+ raw_project_directory: Path,
357
+ manager_id: int,
358
+ processed_data_root: Path | None = None,
359
+ ) -> None:
360
+ """Builds and saves the project manifest .feather file under the specified output directory.
361
+
362
+ This function evaluates the input project directory and builds the 'manifest' file for the project. The file
363
+ includes the descriptive information about every session stored inside the input project folder and the state of
364
+ the session's data processing (which processing pipelines have been applied to each session). The file is created
365
+ under the input raw project directory and uses the following name pattern: ProjectName_manifest.feather.
366
+
367
+ Notes:
368
+ The manifest file is primarily used to capture and move project state information between machines, typically
369
+ in the context of working with data stored on a remote compute server or cluster.
370
+
371
+ Args:
372
+ raw_project_directory: The path to the root project directory used to store raw session data.
373
+ manager_id: The unique identifier of the manager process that manages the runtime.
374
+ processed_data_root: The path to the root directory (volume) used to store processed data for all Sun lab
375
+ projects if it is different from the parent of the 'raw_project_directory'.
376
+ """
377
+
378
+ if not raw_project_directory.exists():
379
+ message = (
380
+ f"Unable to generate the project manifest file for the requested project {raw_project_directory.stem}. "
381
+ f"The specified project directory does not exist."
382
+ )
383
+ console.error(message=message, error=FileNotFoundError)
384
+
385
+ # Finds all session directories for the target project
386
+ session_directories = [directory.parent for directory in raw_project_directory.rglob("raw_data")]
387
+
388
+ if len(session_directories) == 0:
389
+ message = (
390
+ f"Unable to generate the project manifest file for the requested project {raw_project_directory.stem}. The "
391
+ f"project does not contain any raw session data. To generate the manifest file, the project must contain "
392
+ f"the data for at least one session."
393
+ )
394
+ console.error(message=message, error=FileNotFoundError)
395
+
396
+ # Precreates the 'manifest' dictionary structure
397
+ manifest: dict[str, list[str | bool | datetime | int]] = {
398
+ "animal": [], # Animal IDs.
399
+ "session": [], # Session names.
400
+ "date": [], # Session names stored as timezone-aware date-time objects in EST.
401
+ "type": [], # Type of the session (e.g., mesoscope experiment, run training, etc.).
402
+ "system": [], # Acquisition system used to acquire the session (e.g. mesoscope-vr, etc.).
403
+ "notes": [], # The experimenter notes about the session.
404
+ # Determines whether the session data is complete (ran for the intended duration and has all expected data).
405
+ "complete": [],
406
+ # Determines whether the session data integrity has been verified upon transfer to a storage machine.
407
+ "integrity": [],
408
+ # Determines whether the session's data has been prepared for data processing.
409
+ "prepared": [],
410
+ # Determines whether the session has been processed with the single-day s2p pipeline.
411
+ "suite2p": [],
412
+ # Determines whether the session has been processed with the behavior extraction pipeline.
413
+ "behavior": [],
414
+ # Determines whether the session has been processed with the DeepLabCut pipeline.
415
+ "video": [],
416
+ # Determines whether the session's data has been archived for long-term storage.
417
+ "archived": [],
418
+ }
419
+
420
+ # Resolves the path to the manifest .feather file to be created and the .lock file to ensure only a single process
421
+ # can be working on the manifest file at the same time.
422
+ manifest_path = raw_project_directory.joinpath(f"{raw_project_directory.stem}_manifest.feather")
423
+ manifest_lock = manifest_path.with_suffix(manifest_path.suffix + ".lock")
424
+
425
+ # Also instantiates the processing tracker for the manifest file in the same directory. Note, unlike for most other
426
+ # runtimes, the tracker is NOT used to limit the ability of other processes to run the manifest generation. That
427
+ # job is handled by the manifest lock file. Instead, the tracker is used to communicate whether the manifest
428
+ # generation runs as expected or encounters an error.
429
+ runtime_tracker = ProcessingTracker(file_path=raw_project_directory.joinpath(TrackerFileNames.MANIFEST))
430
+
431
+ # Since the exclusivity of the data manifest generation runtime is enforced through the manifest .lock file, this
432
+ # runtime always resets the processing tracker file.
433
+ runtime_tracker.abort()
434
+
435
+ # Acquires the lock file, ensuring only this specific process can work with the manifest data.
436
+ lock = FileLock(str(manifest_lock))
437
+ with lock.acquire(timeout=20.0):
438
+ # Starts the manifest generation process.
439
+ runtime_tracker.start(manager_id=manager_id)
440
+ try:
441
+ # Loops over each session of every animal in the project and extracts session ID information and
442
+ # information about which processing steps have been successfully applied to the session.
443
+ for directory in session_directories:
444
+ # Skips processing directories without files (sessions with empty raw_data directories)
445
+ if len([file for file in directory.joinpath("raw_data").glob("*")]) == 0:
446
+ continue
447
+
448
+ # Instantiates the SessionData instance to resolve the paths to all session's data files and locations.
449
+ session_data = SessionData.load(
450
+ session_path=directory,
451
+ processed_data_root=processed_data_root,
452
+ )
453
+
454
+ # Extracts ID and data path information from the SessionData instance
455
+ manifest["animal"].append(session_data.animal_id)
456
+ manifest["session"].append(session_data.session_name)
457
+ manifest["type"].append(session_data.session_type)
458
+ manifest["system"].append(session_data.acquisition_system)
459
+
460
+ # Parses session name into the date-time object to simplify working with date-time data in the future
461
+ date_time_components = session_data.session_name.split("-")
462
+ date_time = datetime(
463
+ year=int(date_time_components[0]),
464
+ month=int(date_time_components[1]),
465
+ day=int(date_time_components[2]),
466
+ hour=int(date_time_components[3]),
467
+ minute=int(date_time_components[4]),
468
+ second=int(date_time_components[5]),
469
+ microsecond=int(date_time_components[6]),
470
+ tzinfo=pytz.UTC,
471
+ )
472
+
473
+ # Converts from UTC to EST / EDT for user convenience
474
+ eastern = pytz.timezone("America/New_York")
475
+ date_time = date_time.astimezone(eastern)
476
+ manifest["date"].append(date_time)
477
+
478
+ # Depending on the session type, instantiates the appropriate descriptor instance and uses it to read
479
+ # the experimenter notes
480
+ if session_data.session_type == SessionTypes.LICK_TRAINING:
481
+ descriptor: LickTrainingDescriptor = LickTrainingDescriptor.from_yaml( # type: ignore
482
+ file_path=session_data.raw_data.session_descriptor_path
483
+ )
484
+ manifest["notes"].append(descriptor.experimenter_notes)
485
+ elif session_data.session_type == SessionTypes.RUN_TRAINING:
486
+ descriptor: RunTrainingDescriptor = RunTrainingDescriptor.from_yaml( # type: ignore
487
+ file_path=session_data.raw_data.session_descriptor_path
488
+ )
489
+ manifest["notes"].append(descriptor.experimenter_notes)
490
+ elif session_data.session_type == SessionTypes.MESOSCOPE_EXPERIMENT:
491
+ descriptor: MesoscopeExperimentDescriptor = MesoscopeExperimentDescriptor.from_yaml( # type: ignore
492
+ file_path=session_data.raw_data.session_descriptor_path
493
+ )
494
+ manifest["notes"].append(descriptor.experimenter_notes)
495
+ elif session_data.session_type == SessionTypes.WINDOW_CHECKING:
496
+ # sl-experiment version 3.0.0 added session descriptors to Window Checking runtimes. Since the file
497
+ # does not exist in prior versions, this section is written to statically handle the discrepancy.
498
+ try:
499
+ descriptor: WindowCheckingDescriptor = WindowCheckingDescriptor.from_yaml( # type: ignore
500
+ file_path=session_data.raw_data.session_descriptor_path
501
+ )
502
+ manifest["notes"].append(descriptor.experimenter_notes)
503
+ except Exception:
504
+ manifest["notes"].append("N/A")
505
+ else:
506
+ # Raises an error if an unsupported session type is encountered.
507
+ message = (
508
+ f"Unsupported session type '{session_data.session_type}' encountered for session "
509
+ f"'{directory.stem}' when generating the manifest file for the project "
510
+ f"{raw_project_directory.stem}. Currently, only the following session types are supported: "
511
+ f"{tuple(SessionTypes)}."
512
+ )
513
+ console.error(message=message, error=ValueError)
514
+ raise ValueError(message) # Fallback to appease mypy, should not be reachable
515
+
516
+ # If the session raw_data folder contains the telomere.bin file, marks the session as complete.
517
+ manifest["complete"].append(session_data.raw_data.telomere_path.exists())
518
+
519
+ # Data integrity verification status
520
+ tracker = ProcessingTracker(
521
+ file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.CHECKSUM)
522
+ )
523
+ manifest["integrity"].append(tracker.is_complete)
524
+
525
+ # If the session is incomplete or unverified, marks all processing steps as FALSE, as automatic
526
+ # processing is disabled for incomplete sessions and, therefore, it could not have been processed.
527
+ if not manifest["complete"][-1] or not manifest["integrity"][-1]:
528
+ manifest["suite2p"].append(False)
529
+ manifest["behavior"].append(False)
530
+ manifest["video"].append(False)
531
+ manifest["prepared"].append(False)
532
+ manifest["archived"].append(False)
533
+ continue # Cycles to the next session
534
+
535
+ # Session data preparation (for processing) status.
536
+ tracker = ProcessingTracker(
537
+ file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.PREPARATION)
538
+ )
539
+ manifest["prepared"].append(tracker.is_complete)
540
+
541
+ # Suite2p (single-day) processing status.
542
+ tracker = ProcessingTracker(
543
+ file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.SUITE2P)
544
+ )
545
+ manifest["suite2p"].append(tracker.is_complete)
546
+
547
+ # Behavior data processing status.
548
+ tracker = ProcessingTracker(
549
+ file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.BEHAVIOR)
550
+ )
551
+ manifest["behavior"].append(tracker.is_complete)
552
+
553
+ # DeepLabCut (video) processing status.
554
+ tracker = ProcessingTracker(
555
+ file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.VIDEO)
556
+ )
557
+ manifest["video"].append(tracker.is_complete)
558
+
559
+ # Session data archiving status.
560
+ tracker = ProcessingTracker(
561
+ file_path=session_data.tracking_data.tracking_data_path.joinpath(TrackerFileNames.ARCHIVING)
562
+ )
563
+ manifest["archived"].append(tracker.is_complete)
564
+
565
+ # If all animal IDs are integer-convertible, stores them as numbers to promote proper sorting.
566
+ # Otherwise, stores them as strings. The latter options are primarily kept for compatibility with Tyche
567
+ # data.
568
+ animal_type: type[pl.UInt64] | type[pl.String]
569
+ if all([str(animal).isdigit() for animal in manifest["animal"]]):
570
+ # Converts all strings to integers
571
+ manifest["animal"] = [int(animal) for animal in manifest["animal"]] # type: ignore
572
+ animal_type = pl.UInt64 # Uint64 for future proofing
573
+ else:
574
+ animal_type = pl.String
575
+
576
+ # Converts the manifest dictionary to a Polars Dataframe.
577
+ schema = {
578
+ "animal": animal_type,
579
+ "date": pl.Datetime,
580
+ "session": pl.String,
581
+ "type": pl.String,
582
+ "system": pl.String,
583
+ "notes": pl.String,
584
+ "complete": pl.UInt8,
585
+ "integrity": pl.UInt8,
586
+ "prepared": pl.UInt8,
587
+ "suite2p": pl.UInt8,
588
+ "behavior": pl.UInt8,
589
+ "video": pl.UInt8,
590
+ "archived": pl.UInt8,
591
+ }
592
+ df = pl.DataFrame(manifest, schema=schema, strict=False)
593
+
594
+ # Sorts the DataFrame by animal and then session. Since animal IDs are monotonically increasing according to
595
+ # Sun lab standards and session 'names' are based on acquisition timestamps, the sort order is
596
+ # chronological.
597
+ sorted_df = df.sort(["animal", "session"])
598
+
599
+ # Saves the generated manifest to the project-specific manifest .feather file for further processing.
600
+ sorted_df.write_ipc(file=manifest_path, compression="lz4")
601
+
602
+ # The processing is now complete.
603
+ runtime_tracker.stop(manager_id=manager_id)
604
+
605
+ finally:
606
+ # If the tracker indicates that the processing is still running, the runtime has encountered an error.
607
+ if runtime_tracker.is_running:
608
+ runtime_tracker.error(manager_id=manager_id)
609
+
610
+
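Editor's note: a self-contained sketch of the manifest mechanics used above, with an abbreviated column set. A FileLock serializes access to the .feather file, session names are parsed into timezone-aware datetimes, and polars writes the sorted table with lz4 compression. The helper names are hypothetical; the real schema is the one defined in generate_project_manifest().

    from datetime import datetime
    from pathlib import Path

    import polars as pl
    import pytz
    from filelock import FileLock

    def session_name_to_datetime(session_name: str) -> datetime:
        """Parses a 'YYYY-MM-DD-HH-MM-SS-microsecond' session name into an America/New_York datetime."""
        parts = [int(part) for part in session_name.split("-")]
        stamp = datetime(*parts[:6], microsecond=parts[6], tzinfo=pytz.UTC)
        return stamp.astimezone(pytz.timezone("America/New_York"))

    def write_manifest(manifest_path: Path, animals: list[int], sessions: list[str]) -> None:
        """Writes a simplified manifest table, guarded by the same .lock-file convention used above."""
        lock = FileLock(str(manifest_path.with_suffix(manifest_path.suffix + ".lock")))
        with lock.acquire(timeout=20.0):
            df = pl.DataFrame(
                {
                    "animal": animals,
                    "session": sessions,
                    "date": [session_name_to_datetime(name) for name in sessions],
                },
                schema={"animal": pl.UInt64, "session": pl.String, "date": pl.Datetime},
                strict=False,
            )
            df.sort(["animal", "session"]).write_ipc(file=manifest_path, compression="lz4")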
  class ProjectManifest:
  """Wraps the contents of a Sun lab project manifest .feather file and exposes methods for visualizing and
  working with the data stored inside the file.
@@ -77,12 +662,14 @@ class ProjectManifest:
  "date",
  "session",
  "type",
+ "system",
  "complete",
  "integrity",
+ "prepared",
  "suite2p",
  "behavior",
  "video",
- "dataset",
+ "archived",
  ]

  # Retrieves the data
@@ -119,7 +706,7 @@ class ProjectManifest:
  """

  # Pre-selects the columns to display
- df = self._data.select(["animal", "date", "session", "type", "notes"])
+ df = self._data.select(["animal", "date", "session", "type", "system", "notes"])

  # Optionally filters the data for the target animal
  if animal is not None:
@@ -137,8 +724,8 @@ class ProjectManifest:
  set_tbl_cols=-1,
  set_tbl_hide_column_data_types=True,
  set_tbl_cell_alignment="LEFT",
- set_tbl_width_chars=250, # Wider columns for notes
- set_fmt_str_lengths=600, # Allows very long strings for notes
+ set_tbl_width_chars=100, # Wider columns for notes
+ set_fmt_str_lengths=800, # Allows very long strings for notes
  ):
  print(df)

@@ -148,14 +735,16 @@ class ProjectManifest:

  This provides a tuple of all animal IDs participating in the target project.
  """
- return tuple(self._data.select("animal").unique().sort("animal").to_series().to_list())
+
+ # If animal IDs are stored as integers, converts them to string to support consistent return types.
+ return tuple(
+ [str(animal) for animal in self._data.select("animal").unique().sort("animal").to_series().to_list()]
+ )

  def _get_filtered_sessions(
  self,
  animal: str | int | None = None,
  exclude_incomplete: bool = True,
- dataset_ready_only: bool = False,
- not_dataset_ready_only: bool = False,
  ) -> tuple[str, ...]:
  """This worker method is used to get a list of sessions with optional filtering.

@@ -166,11 +755,6 @@ class ProjectManifest:
  animals.
  exclude_incomplete: Determines whether to exclude sessions not marked as 'complete' from the output
  list.
- dataset_ready_only: Determines whether to exclude sessions not marked as 'dataset' integration ready from
- the output list. Enabling this option only shows sessions that can be integrated into a dataset.
- not_dataset_ready_only: The opposite of 'dataset_ready_only'. Determines whether to exclude sessions marked
- as 'dataset' integration ready from the output list. Note, when both this and 'dataset_ready_only' are
- enabled, the 'dataset_ready_only' option takes precedence.

  Returns:
  The tuple of session IDs matching the filter criteria.
@@ -198,12 +782,6 @@ class ProjectManifest:
  if exclude_incomplete:
  data = data.filter(pl.col("complete") == 1)

- # Optionally filters sessions based on their readiness for dataset integration.
- if dataset_ready_only: # Dataset-ready option always takes precedence
- data = data.filter(pl.col("dataset") == 1)
- elif not_dataset_ready_only:
- data = data.filter(pl.col("dataset") == 0)
-
  # Formats and returns session IDs to the caller
  sessions = data.select("session").sort("session").to_series().to_list()
  return tuple(sessions)
@@ -221,8 +799,6 @@ class ProjectManifest:
  self,
  animal: str | int | None = None,
  exclude_incomplete: bool = True,
- dataset_ready_only: bool = False,
- not_dataset_ready_only: bool = False,
  ) -> tuple[str, ...]:
  """Returns requested session IDs based on selected filtering criteria.

@@ -234,11 +810,6 @@ class ProjectManifest:
  animals.
  exclude_incomplete: Determines whether to exclude sessions not marked as 'complete' from the output
  list.
- dataset_ready_only: Determines whether to exclude sessions not marked as 'dataset' integration ready from
- the output list. Enabling this option only shows sessions that can be integrated into a dataset.
- not_dataset_ready_only: The opposite of 'dataset_ready_only'. Determines whether to exclude sessions marked
- as 'dataset' integration ready from the output list. Note, when both this and 'dataset_ready_only' are
- enabled, the 'dataset_ready_only' option takes precedence.

  Returns:
  The tuple of session IDs matching the filter criteria.
@@ -249,8 +820,6 @@ class ProjectManifest:
  return self._get_filtered_sessions(
  animal=animal,
  exclude_incomplete=exclude_incomplete,
- dataset_ready_only=dataset_ready_only,
- not_dataset_ready_only=not_dataset_ready_only,
  )

  def get_session_info(self, session: str) -> pl.DataFrame:
@@ -263,500 +832,10 @@ class ProjectManifest:
  session: The ID of the session for which to retrieve the data.

  Returns:
- A Polars DataFrame with the following columns: 'animal', 'date', 'notes', 'session', 'type', 'complete',
- 'intensity_verification', 'suite2p', 'behavior', 'video', 'dataset'.
+ A Polars DataFrame with the following columns: 'animal', 'date', 'notes', 'session', 'type', 'system',
+ 'complete', 'integrity', 'suite2p', 'behavior', 'video', 'archived'.
  """

  df = self._data
  df = df.filter(pl.col("session").eq(session))
  return df
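Editor's note: downstream code can also inspect the generated manifest directly with polars, mirroring the filtering that ProjectManifest performs internally. The project name below is hypothetical; the file follows the ProjectName_manifest.feather pattern described above.

    from pathlib import Path

    import polars as pl

    manifest = pl.read_ipc(Path("MyProject_manifest.feather"))

    # Complete, integrity-verified sessions that have not yet been processed with the suite2p pipeline.
    pending = manifest.filter(
        (pl.col("complete") == 1) & (pl.col("integrity") == 1) & (pl.col("suite2p") == 0)
    )
    print(pending.select(["animal", "session", "date"]).sort(["animal", "session"]))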
273
-
274
-
275
- def generate_project_manifest(
276
- raw_project_directory: Path, output_directory: Path, processed_data_root: Path | None = None
277
- ) -> None:
278
- """Builds and saves the project manifest .feather file under the specified output directory.
279
-
280
- This function evaluates the input project directory and builds the 'manifest' file for the project. The file
281
- includes the descriptive information about every session stored inside the input project folder and the state of
282
- the session's data processing (which processing pipelines have been applied to each session). The file will be
283
- created under the 'output_path' directory and use the following name pattern: ProjectName_manifest.feather.
284
-
285
- Notes:
286
- The manifest file is primarily used to capture and move project state information between machines, typically
287
- in the context of working with data stored on a remote compute server or cluster. However, it can also be used
288
- on a local machine, since an up-to-date manifest file is required to run most data processing pipelines in the
289
- lab regardless of the runtime context.
290
-
291
- Args:
292
- raw_project_directory: The path to the root project directory used to store raw session data.
293
- output_directory: The path to the directory where to save the generated manifest file.
294
- processed_data_root: The path to the root directory (volume) used to store processed data for all Sun lab
295
- projects if it is different from the parent of the 'raw_project_directory'. Typically, this would be the
296
- case on remote compute server(s) and not on local machines.
297
- """
298
-
299
- if not raw_project_directory.exists():
300
- message = (
301
- f"Unable to generate the project manifest file for the requested project {raw_project_directory.stem}. The "
302
- f"specified project directory does not exist."
303
- )
304
- console.error(message=message, error=FileNotFoundError)
305
-
306
- # Finds all session directories
307
- session_directories = [directory.parent for directory in raw_project_directory.rglob("raw_data")]
308
-
309
- if len(session_directories) == 0:
310
- message = (
311
- f"Unable to generate the project manifest file for the requested project {raw_project_directory.stem}. The "
312
- f"project does not contain any raw session data. To generate the manifest file, the project must contain "
313
- f"at least one valid experiment or training session."
314
- )
315
- console.error(message=message, error=FileNotFoundError)
316
-
317
- # Precreates the 'manifest' dictionary structure
318
- manifest: dict[str, list[str | bool | datetime | int]] = {
319
- "animal": [], # Animal IDs.
320
- "session": [], # Session names.
321
- "date": [], # Session names stored as timezone-aware date-time objects in EST.
322
- "type": [], # Type of the session (e.g., Experiment, Training, etc.).
323
- "notes": [], # The experimenter notes about the session.
324
- # Determines whether the session data is complete (ran for the intended duration and has all expected data).
325
- "complete": [],
326
- # Determines whether the session data integrity has been verified upon transfer to a storage machine.
327
- "integrity": [],
328
- "suite2p": [], # Determines whether the session has been processed with the single-day s2p pipeline.
329
- # Determines whether the session has been processed with the behavior extraction pipeline.
330
- "behavior": [],
331
- "video": [], # Determines whether the session has been processed with the DeepLabCut pipeline.
332
- "dataset": [], # Determines whether the session's data is ready to be integrated into a dataset.
333
- }
334
-
335
- # Resolves the path to the manifest .feather file to be created and the .lock file for the generated manifest
336
- manifest_path = output_directory.joinpath(f"{raw_project_directory.stem}_manifest.feather")
337
- manifest_lock = manifest_path.with_suffix(manifest_path.suffix + ".lock")
338
-
339
- # Acquires the lock
340
- lock = FileLock(str(manifest_lock))
341
- with lock.acquire(timeout=20.0):
342
- # Loops over each session of every animal in the project and extracts session ID information and information
343
- # about which processing steps have been successfully applied to the session.
344
- for directory in session_directories:
345
- # Skips processing directories without files (sessions with empty raw-data directories)
346
- if len([file for file in directory.joinpath("raw_data").glob("*")]) == 0:
347
- continue
348
-
349
- # Instantiates the SessionData instance to resolve the paths to all session's data files and locations.
350
- session_data = SessionData.load(
351
- session_path=directory,
352
- processed_data_root=processed_data_root,
353
- make_processed_data_directory=False,
354
- )
355
-
356
- # Fills the manifest dictionary with data for the processed session:
357
-
358
- # Extracts ID and data path information from the SessionData instance
359
- manifest["animal"].append(session_data.animal_id)
360
- manifest["session"].append(session_data.session_name)
361
- manifest["type"].append(session_data.session_type)
362
-
363
- # Parses session name into the date-time object to simplify working with date-time data in the future
364
- date_time_components = session_data.session_name.split("-")
365
- date_time = datetime(
366
- year=int(date_time_components[0]),
367
- month=int(date_time_components[1]),
368
- day=int(date_time_components[2]),
369
- hour=int(date_time_components[3]),
370
- minute=int(date_time_components[4]),
371
- second=int(date_time_components[5]),
372
- microsecond=int(date_time_components[6]),
373
- tzinfo=pytz.UTC,
374
- )
375
-
376
- # Converts from UTC to EST / EDT for user convenience
377
- eastern = pytz.timezone("America/New_York")
378
- date_time = date_time.astimezone(eastern)
379
- manifest["date"].append(date_time)
380
-
381
- # Depending on the session type, instantiates the appropriate descriptor instance and uses it to read the
382
- # experimenter notes
383
- if session_data.session_type == SessionTypes.LICK_TRAINING:
384
- descriptor: LickTrainingDescriptor = LickTrainingDescriptor.from_yaml( # type: ignore
385
- file_path=session_data.raw_data.session_descriptor_path
386
- )
387
- manifest["notes"].append(descriptor.experimenter_notes)
388
- elif session_data.session_type == SessionTypes.RUN_TRAINING:
389
- descriptor: RunTrainingDescriptor = RunTrainingDescriptor.from_yaml( # type: ignore
390
- file_path=session_data.raw_data.session_descriptor_path
391
- )
392
- manifest["notes"].append(descriptor.experimenter_notes)
393
- elif session_data.session_type == SessionTypes.MESOSCOPE_EXPERIMENT:
394
- descriptor: MesoscopeExperimentDescriptor = MesoscopeExperimentDescriptor.from_yaml( # type: ignore
395
- file_path=session_data.raw_data.session_descriptor_path
396
- )
397
- manifest["notes"].append(descriptor.experimenter_notes)
398
- elif session_data.session_type == SessionTypes.WINDOW_CHECKING:
399
- # sl-experiment version 3.0.0 added session descriptors to Window Checking runtimes. Since the file
400
- # does not exist in prior versions, this section is written to statically handle the discrepancy.
401
- try:
402
- descriptor: WindowCheckingDescriptor = WindowCheckingDescriptor.from_yaml( # type: ignore
403
- file_path=session_data.raw_data.session_descriptor_path
404
- )
405
- manifest["notes"].append(descriptor.experimenter_notes)
406
- except Exception:
407
- manifest["notes"].append("N/A")
408
- else:
409
- manifest["notes"].append("N/A")
410
-
411
- # If the session raw_data folder contains the telomere.bin file, marks the session as complete.
412
- manifest["complete"].append(session_data.raw_data.telomere_path.exists())
413
-
414
- # Data verification status
415
- tracker = get_processing_tracker(
416
- root=session_data.raw_data.raw_data_path, file_name=TrackerFileNames.INTEGRITY
417
- )
418
- manifest["integrity"].append(tracker.is_complete)
419
-
420
- # If the session is incomplete or unverified, marks all processing steps as FALSE, as automatic processing
421
- # is disabled for incomplete sessions. If the session is unverified, the case is even more severe, as its
422
- # data may be corrupted.
423
- if not manifest["complete"][-1] or not manifest["integrity"][-1]:
424
- manifest["suite2p"].append(False)
425
- manifest["dataset"].append(False)
426
- manifest["behavior"].append(False)
427
- manifest["video"].append(False)
428
- continue # Cycles to the next session
429
-
430
- # Suite2p (single-day) processing status.
431
- tracker = get_processing_tracker(
432
- file_name=TrackerFileNames.SUITE2P, root=session_data.processed_data.processed_data_path
433
- )
434
- manifest["suite2p"].append(tracker.is_complete)
435
-
436
- # Behavior data processing status.
437
- tracker = get_processing_tracker(
438
- file_name=TrackerFileNames.BEHAVIOR, root=session_data.processed_data.processed_data_path
439
- )
440
- manifest["behavior"].append(tracker.is_complete)
441
-
442
- # DeepLabCut (video) processing status.
443
- tracker = get_processing_tracker(
444
- file_name=TrackerFileNames.VIDEO, root=session_data.processed_data.processed_data_path
445
- )
446
- manifest["video"].append(tracker.is_complete)
447
-
448
- # Tracks whether the session's data is currently in the processing or dataset integration mode.
449
- manifest["dataset"].append(session_data.processed_data.p53_path.exists())
450
-
451
- # If all animal IDs are integer-convertible, stores them as numbers to promote proper sorting. Otherwise, stores
452
- # them as strings. The latter options are primarily kept for compatibility with Tyche data
453
- animal_type: type[pl.UInt64] | type[pl.String]
454
- if all([str(animal).isdigit() for animal in manifest["animal"]]):
455
- # Converts all strings to integers
456
- manifest["animal"] = [int(animal) for animal in manifest["animal"]] # type: ignore
457
- animal_type = pl.UInt64 # Uint64 for future proofing
458
- else:
459
- animal_type = pl.String
460
-
461
- # Converts the manifest dictionary to a Polars Dataframe.
462
- schema = {
463
- "animal": animal_type,
464
- "date": pl.Datetime,
465
- "session": pl.String,
466
- "type": pl.String,
467
- "notes": pl.String,
468
- "complete": pl.UInt8,
469
- "integrity": pl.UInt8,
470
- "suite2p": pl.UInt8,
471
- "dataset": pl.UInt8,
472
- "behavior": pl.UInt8,
473
- "video": pl.UInt8,
474
- }
475
- df = pl.DataFrame(manifest, schema=schema, strict=False)
476
-
477
- # Sorts the DataFrame by animal and then session. Since we assign animal IDs sequentially and 'name' sessions
478
- # based on acquisition timestamps, the sort order is chronological.
479
- sorted_df = df.sort(["animal", "session"])
480
-
481
- # Saves the generated manifest to the project-specific manifest .feather file for further processing.
482
- sorted_df.write_ipc(
483
- file=output_directory.joinpath(f"{raw_project_directory.stem}_manifest.feather"), compression="lz4"
484
- )
485
-
486
-
487
- def verify_session_checksum(
488
- session_path: Path,
489
- manager_id: int,
490
- create_processed_data_directory: bool = True,
491
- processed_data_root: None | Path = None,
492
- update_manifest: bool = False,
493
- ) -> None:
494
- """Verifies the integrity of the session's raw data by generating the checksum of the raw_data directory and
495
- comparing it against the checksum stored in the ax_checksum.txt file.
496
-
497
- Primarily, this function is used to verify data integrity after transferring it from a local PC to the remote
498
- server for long-term storage. This function is designed to create the 'verified.bin' marker file if the checksum
499
- matches and to remove the 'telomere.bin' and 'verified.bin' marker files if it does not.
500
-
501
- Notes:
502
- Removing the telomere.bin marker file from the session's raw_data folder marks the session as incomplete,
503
- excluding it from all further automatic processing.
504
-
505
- This function is also used to create the processed data hierarchy on the BioHPC server, when it is called as
506
- part of the data preprocessing runtime performed by a data acquisition system.
507
-
508
- Since version 3.1.0, this functon also supports (re) generating the processed session's project manifest file,
509
- which is used to support further Sun lab data processing pipelines.
510
-
511
- Args:
512
- session_path: The path to the session directory to be verified. Note, the input session directory must contain
513
- the 'raw_data' subdirectory.
514
- manager_id: The xxHash-64 hash-value that specifies the unique identifier of the manager process that
515
- manages the integrity verification runtime.
516
- create_processed_data_directory: Determines whether to create the processed data hierarchy during runtime.
517
- processed_data_root: The root directory where to store the processed data hierarchy. This path has to point to
518
- the root directory where to store the processed data from all projects, and it will be automatically
519
- modified to include the project name, the animal name, and the session ID.
520
- update_manifest: Determines whether to update (regenerate) the project manifest file for the processed session's
521
- project. This should always be enabled when working with remote compute server(s) to ensure that the
522
- project manifest file contains the most actual snapshot of the project's state.
523
- """
524
-
525
- # Loads session data layout. If configured to do so, also creates the processed data hierarchy
526
- session_data = SessionData.load(
527
- session_path=session_path,
528
- processed_data_root=processed_data_root,
529
- make_processed_data_directory=create_processed_data_directory,
530
- )
531
-
532
- # Initializes the ProcessingTracker instance for the verification tracker file
533
- tracker = get_processing_tracker(root=session_data.raw_data.raw_data_path, file_name=TrackerFileNames.INTEGRITY)
534
- console.echo(f"{tracker.file_path}")
535
-
536
- # Updates the tracker data to communicate that the verification process has started. This automatically clears
537
- # the previous 'completed' status.
538
- tracker.start(manager_id=manager_id)
539
- try:
540
- # Re-calculates the checksum for the raw_data directory
541
- calculated_checksum = calculate_directory_checksum(
542
- directory=session_data.raw_data.raw_data_path, batch=False, save_checksum=False
543
- )
544
-
545
- # Loads the checksum stored inside the ax_checksum.txt file
546
- with open(session_data.raw_data.checksum_path, "r") as f:
547
- stored_checksum = f.read().strip()
548
-
549
- # If the two checksums do not match, this likely indicates data corruption.
550
- if stored_checksum != calculated_checksum:
551
- # If the telomere.bin file exists, removes this file. This automatically marks the session as incomplete for
552
- # all other Sun lab runtimes.
553
- session_data.raw_data.telomere_path.unlink(missing_ok=True)
554
-
555
- else:
556
- # Sets the tracker to indicate that the verification runtime completed successfully.
557
- tracker.stop(manager_id=manager_id)
558
-
559
- finally:
560
- # If the code reaches this section while the tracker indicates that the processing is still running,
561
- # this means that the verification runtime encountered an error. Configures the tracker to indicate that this
562
- # runtime finished with an error to prevent deadlocking the runtime.
563
- if tracker.is_running:
564
- tracker.error(manager_id=manager_id)
565
-
566
- # If the runtime is configured to generate the project manifest file, attempts to generate and overwrite the
567
- # existing manifest file for the target project.
568
- if update_manifest:
569
- # All sessions are stored under root/project/animal/session. Therefore, the grandparent of the session is
570
- # the raw project directory.
571
- raw_directory = session_path.parents[1]
572
-
573
- # Generates the manifest file inside the root raw data project directory
574
- generate_project_manifest(
575
- raw_project_directory=session_path.parents[1],
576
- processed_data_root=processed_data_root,
577
- output_directory=raw_directory,
578
- )
579
-
580
-
581
- def resolve_p53_marker(
582
- session_path: Path,
583
- create_processed_data_directory: bool = True,
584
- processed_data_root: None | Path = None,
585
- remove: bool = False,
586
- update_manifest: bool = False,
587
- ) -> None:
588
- """Depending on configuration, either creates or removes the p53.bin marker file for the target session.
589
-
590
- The marker file statically determines whether the session can be targeted by data processing or dataset formation
591
- pipelines.
592
-
593
- Notes:
594
- Since dataset integration relies on data processing outputs, it is essential to prevent processing pipelines
595
- from altering the data while it is integrated into a dataset. The p53.bin marker solves this issue by ensuring
596
- that only one type of runtimes (processing or dataset integration) is allowed to work with the session.
597
-
598
- For the p53.bin marker to be created, the session must not be undergoing processing. For the p53 marker
599
- to be removed, the session must not be undergoing dataset integration.
600
-
601
- Since version 3.1.0, this functon also supports (re)generating the processed session's project manifest file,
602
- which is used to support further Sun lab data processing pipelines.
603
-
604
- Args:
605
- session_path: The path to the session directory for which the p53.bin marker needs to be resolved. Note, the
606
- input session directory must contain the 'raw_data' subdirectory.
607
- create_processed_data_directory: Determines whether to create the processed data hierarchy during runtime.
608
- processed_data_root: The root directory where to store the processed data hierarchy. This path has to point to
609
- the root directory where to store the processed data from all projects, and it will be automatically
610
- modified to include the project name, the animal name, and the session ID.
611
- remove: Determines whether this function is called to create or remove the p53.bin marker.
612
- update_manifest: Determines whether to update (regenerate) the project manifest file for the processed session's
613
- project. This should always be enabled when working with remote compute server(s) to ensure that the
614
- project manifest file contains the most actual snapshot of the project's state.
615
- """
616
-
617
- # Loads session data layout. If configured to do so, also creates the processed data hierarchy
618
- session_data = SessionData.load(
619
- session_path=session_path,
620
- processed_data_root=processed_data_root,
621
- make_processed_data_directory=create_processed_data_directory,
622
- )
623
-
624
- # If the p53.bin marker exists and the runtime is configured to remove it, attempts to remove the marker file.
625
- if session_data.processed_data.p53_path.exists() and remove:
626
- # This section deals with a unique nuance related to the Sun lab processing server organization. Specifically,
627
- # the user accounts are not allowed to modify or create files in the data directories owned by the service
628
- # accounts. In turn, this prevents user accounts from modifying the processed data directory to indicate when
629
- # they are running a dataset integration pipeline on the processed data. To work around this problem, the
630
- # dataset integration pipeline now creates a 'semaphore' marker for each session that is currently being
631
- # integrated into a dataset. This semaphore marker is stored under the root user working directory, inside the
632
- # subdirectory called 'semaphore'.
633
-
634
- # The parent of the shared sun-lab processed data directory is the root 'working' volume. All user directories
635
- # are stored under this root working directory.
636
- if processed_data_root is None:
637
- # If the processed data root is not provided, sets it to the great-grandparent of the session directory.
638
- # This works assuming that the data is stored under: root/project/animal/session.
639
- processed_data_root = session_path.parents[2]
640
- working_root = processed_data_root.parent
641
-
642
- # Loops over each user directory and checks whether a semaphore marker exists for the processed session.
643
- for directory in working_root.iterdir():
644
- if (
645
- len([marker for marker in directory.joinpath("semaphore").glob(f"*{session_data.session_name}.bin")])
646
- > 0
647
- ):
648
- # Aborts with an error if the semaphore marker prevents the p53 marker from being removed.
649
- message = (
650
- f"Unable to remove the dataset marker for the session' {session_data.session_name}' acquired "
651
- f"for the animal '{session_data.animal_id}' under the '{session_data.project_name}' project. "
652
- f"The session data is currently being integrated into a dataset by the owner the "
653
- f"'{directory.stem}' user directory. Wait until the ongoing dataset integration is complete and "
654
- f"repeat the command that produced this error."
655
- )
656
- console.error(message=message, error=RuntimeError)
657
-
658
- # If the session does not have a corresponding semaphore marker in any user directories, removes the p53 marker
659
- # file.
660
- session_data.processed_data.p53_path.unlink()
661
- message = (
662
- f"Dataset marker for the session '{session_data.session_name}' acquired for the animal "
663
- f"'{session_data.animal_id}' under the '{session_data.project_name}' project: Removed."
664
- )
665
- console.echo(message=message, level=LogLevel.SUCCESS)
666
- return # Ends remove runtime
667
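The semaphore convention relied on by the removal branch above can be expressed on its own: every user directory under the working root may hold a 'semaphore' subdirectory containing one marker per session currently being integrated into a dataset, and each marker's name ends with the session name followed by '.bin'. A standalone sketch of that scan, with a hypothetical working-root path:

```python
from pathlib import Path

def session_has_semaphore(working_root: Path, session_name: str) -> bool:
    """Returns True if any user directory under the working root contains a semaphore
    marker for the session, indicating an ongoing dataset integration runtime."""
    for user_directory in working_root.iterdir():
        semaphore_directory = user_directory.joinpath("semaphore")
        # Semaphore markers are named to end with '{session_name}.bin'.
        if semaphore_directory.is_dir() and any(semaphore_directory.glob(f"*{session_name}.bin")):
            return True
    return False

# Example (hypothetical layout): /working/<user>/semaphore/...2024-01-01-120000.bin
# session_has_semaphore(Path("/working"), "2024-01-01-120000")
```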
-
668
- # If the marker does not exist and the function is called in 'remove' mode, aborts the runtime early
669
- elif not session_data.processed_data.p53_path.exists() and remove:
670
- message = (
671
- f"Dataset marker for the session '{session_data.session_name}' acquired for the animal "
672
- f"'{session_data.animal_id}' under the '{session_data.project_name}' project: Does not exist. No actions "
673
- f"taken."
674
- )
675
- console.echo(message=message, level=LogLevel.SUCCESS)
676
- return # Ends remove runtime
677
-
678
- elif session_data.processed_data.p53_path.exists():
679
- message = (
680
- f"Dataset marker for the session '{session_data.session_name}' acquired for the animal "
681
- f"'{session_data.animal_id}' under the '{session_data.project_name}' project: Already exists. No actions "
682
- f"taken."
683
- )
684
- console.echo(message=message, level=LogLevel.SUCCESS)
685
- return # Ends create runtime
686
-
687
- # The rest of the runtime deals with determining whether it is safe to create the marker file.
688
- # Queries the type of the processed session
689
- session_type = session_data.session_type
690
-
691
- # Window checking sessions are not designed to be integrated into datasets, so they cannot be marked with the
692
- # p53.bin file. Similarly, any incomplete session is automatically excluded from dataset formation.
693
- if session_type == SessionTypes.WINDOW_CHECKING or not session_data.raw_data.telomere_path.exists():
694
- message = (
695
- f"Unable to generate the dataset marker for the session '{session_data.session_name}' acquired for the "
696
- f"animal '{session_data.animal_id}' under the '{session_data.project_name}' project, as the session is "
697
- f"incomplete or is of Window Checking type. These sessions must be manually evaluated and marked for "
698
- f"dataset inclusion by the experimenter. "
699
- )
700
- console.error(message=message, error=RuntimeError)
701
-
702
- # Training sessions collect similar data and share processing pipeline requirements
703
- error: bool = False
704
- if session_type == SessionTypes.LICK_TRAINING or session_type == SessionTypes.RUN_TRAINING:
705
- # Ensures that the session is not being processed with one of the supported pipelines.
706
- behavior_tracker = get_processing_tracker(
707
- file_name=TrackerFileNames.BEHAVIOR, root=session_data.processed_data.processed_data_path
708
- )
709
- video_tracker = get_processing_tracker(
710
- file_name=TrackerFileNames.VIDEO, root=session_data.processed_data.processed_data_path
711
- )
712
- if behavior_tracker.is_running or video_tracker.is_running:
713
- # Note, training runtimes do not require suite2p processing.
714
- error = True
715
-
716
- # Mesoscope experiment sessions require additional processing with suite2p
717
- elif session_type == SessionTypes.MESOSCOPE_EXPERIMENT:
718
- behavior_tracker = get_processing_tracker(
719
- file_name=TrackerFileNames.BEHAVIOR, root=session_data.processed_data.processed_data_path
720
- )
721
- suite2p_tracker = get_processing_tracker(
722
- file_name=TrackerFileNames.SUITE2P, root=session_data.processed_data.processed_data_path
723
- )
724
- video_tracker = get_processing_tracker(
725
- file_name=TrackerFileNames.VIDEO, root=session_data.processed_data.processed_data_path
726
- )
727
- console.echo(f"{behavior_tracker.is_running}")
728
- if behavior_tracker.is_running or video_tracker.is_running or suite2p_tracker.is_running:
729
- error = True
730
-
731
- # If the session is currently being processed by one or more pipelines, aborts with an error.
732
- if error:
733
- message = (
734
- f"Unable to generate the dataset marker for the session '{session_data.session_name}' acquired for the "
735
- f"animal '{session_data.animal_id}' under the '{session_data.project_name}' project, as it is "
736
- f"currently being processed by one of the data processing pipelines. Wait until the session is fully "
737
- f"processed by all pipelines and repeat the command that encountered this error."
738
- )
739
- console.error(message=message, error=RuntimeError)
740
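The eligibility checks above reduce to a single condition: none of the processing pipelines tracked for the session type may be running. A compact sketch of that condition, assuming the names used by the code above (get_processing_tracker, TrackerFileNames, SessionTypes) are importable from the top-level package, which is an assumption, and that trackers expose the is_running flag:

```python
from pathlib import Path

# Import paths are assumptions; the names themselves are used by the code above.
from sl_shared_assets import SessionTypes, TrackerFileNames, get_processing_tracker

def session_is_busy(session_type: SessionTypes, processed_data_path: Path) -> bool:
    """Returns True if any relevant processing pipeline is currently running for the session."""
    # Behavior and video processing apply to training and mesoscope experiment sessions alike.
    tracker_names = [TrackerFileNames.BEHAVIOR, TrackerFileNames.VIDEO]
    # Mesoscope experiment sessions additionally require suite2p processing.
    if session_type == SessionTypes.MESOSCOPE_EXPERIMENT:
        tracker_names.append(TrackerFileNames.SUITE2P)
    return any(
        get_processing_tracker(file_name=name, root=processed_data_path).is_running
        for name in tracker_names
    )
```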
-
741
- # If the runtime reached this point, the session is eligible for dataset integration. Creates the p53.bin marker
742
- # file, preventing the session from being processed again as long as the marker exists.
743
- session_data.processed_data.p53_path.touch()
744
- message = (
745
- f"Dataset marker for the session '{session_data.session_name}' acquired for the animal "
746
- f"'{session_data.animal_id}' under the '{session_data.project_name}' project: Created."
747
- )
748
- console.echo(message=message, level=LogLevel.SUCCESS)
749
-
750
- # If the runtime is configured to generate the project manifest file, attempts to generate and overwrite the
751
- # existing manifest file for the target project.
752
- if update_manifest:
753
- # All sessions are stored under root/project/animal/session. Therefore, the grandparent of the session is
754
- # the raw project directory.
755
- raw_directory = session_path.parents[1]
756
-
757
- # Generates the manifest file inside the root raw data project directory
758
- generate_project_manifest(
759
- raw_project_directory=raw_directory,
760
- processed_data_root=processed_data_root,
761
- output_directory=raw_directory,
762
- )
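
Throughout this module, related directories are derived from the fixed root/project/animal/session layout rather than passed around explicitly: the parents of a session path yield the project directory and the data root, and the parent of the shared processed-data root is the working volume that holds user directories. A small illustration with a hypothetical path, mirroring the fallbacks used above:

```python
from pathlib import Path

session_path = Path("/data/raw/my_project/mouse_01/2024-01-01-120000")  # hypothetical

animal_directory = session_path.parent       # .../my_project/mouse_01
project_directory = session_path.parents[1]  # .../my_project (the raw project directory)
data_root = session_path.parents[2]          # /data/raw (fallback processed-data root when none is given)
working_root = data_root.parent              # /data (volume holding the user working directories)
```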