sl-shared-assets 4.0.1__py3-none-any.whl → 5.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sl-shared-assets might be problematic. Click here for more details.
- sl_shared_assets/__init__.py +45 -42
- sl_shared_assets/command_line_interfaces/__init__.py +3 -0
- sl_shared_assets/command_line_interfaces/configure.py +173 -0
- sl_shared_assets/command_line_interfaces/manage.py +226 -0
- sl_shared_assets/data_classes/__init__.py +33 -32
- sl_shared_assets/data_classes/configuration_data.py +267 -79
- sl_shared_assets/data_classes/session_data.py +226 -289
- sl_shared_assets/server/__init__.py +24 -4
- sl_shared_assets/server/job.py +6 -7
- sl_shared_assets/server/pipeline.py +570 -0
- sl_shared_assets/server/server.py +57 -25
- sl_shared_assets/tools/__init__.py +9 -8
- sl_shared_assets/tools/packaging_tools.py +14 -25
- sl_shared_assets/tools/project_management_tools.py +602 -523
- sl_shared_assets/tools/transfer_tools.py +88 -23
- {sl_shared_assets-4.0.1.dist-info → sl_shared_assets-5.0.0.dist-info}/METADATA +46 -203
- sl_shared_assets-5.0.0.dist-info/RECORD +23 -0
- sl_shared_assets-5.0.0.dist-info/entry_points.txt +3 -0
- sl_shared_assets/__init__.pyi +0 -91
- sl_shared_assets/cli.py +0 -501
- sl_shared_assets/cli.pyi +0 -106
- sl_shared_assets/data_classes/__init__.pyi +0 -75
- sl_shared_assets/data_classes/configuration_data.pyi +0 -235
- sl_shared_assets/data_classes/runtime_data.pyi +0 -157
- sl_shared_assets/data_classes/session_data.pyi +0 -379
- sl_shared_assets/data_classes/surgery_data.pyi +0 -89
- sl_shared_assets/server/__init__.pyi +0 -11
- sl_shared_assets/server/job.pyi +0 -205
- sl_shared_assets/server/server.pyi +0 -298
- sl_shared_assets/tools/__init__.pyi +0 -19
- sl_shared_assets/tools/ascension_tools.py +0 -265
- sl_shared_assets/tools/ascension_tools.pyi +0 -68
- sl_shared_assets/tools/packaging_tools.pyi +0 -58
- sl_shared_assets/tools/project_management_tools.pyi +0 -239
- sl_shared_assets/tools/transfer_tools.pyi +0 -53
- sl_shared_assets-4.0.1.dist-info/RECORD +0 -36
- sl_shared_assets-4.0.1.dist-info/entry_points.txt +0 -7
- {sl_shared_assets-4.0.1.dist-info → sl_shared_assets-5.0.0.dist-info}/WHEEL +0 -0
- {sl_shared_assets-4.0.1.dist-info → sl_shared_assets-5.0.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -7,12 +7,10 @@ libraries use these classes to work with all lab-generated data."""
|
|
|
7
7
|
|
|
8
8
|
import copy
|
|
9
9
|
from enum import StrEnum
|
|
10
|
-
from random import randint
|
|
11
10
|
import shutil as sh
|
|
12
11
|
from pathlib import Path
|
|
13
12
|
from dataclasses import field, dataclass
|
|
14
13
|
|
|
15
|
-
from xxhash import xxh3_64
|
|
16
14
|
from filelock import FileLock
|
|
17
15
|
from ataraxis_base_utilities import LogLevel, console, ensure_directory_exists
|
|
18
16
|
from ataraxis_data_structures import YamlConfig
|
|
@@ -48,26 +46,6 @@ class SessionTypes(StrEnum):
|
|
|
48
46
|
activity data."""
|
|
49
47
|
|
|
50
48
|
|
|
51
|
-
class TrackerFileNames(StrEnum):
|
|
52
|
-
"""Defines a set of processing tacker .yaml files supported by various Sun lab data preprocessing, processing, and
|
|
53
|
-
dataset formation pipelines.
|
|
54
|
-
|
|
55
|
-
This enumeration standardizes the names for all processing tracker files used in the lab. It is designed to be used
|
|
56
|
-
via the get_processing_tracker() function to generate ProcessingTracker instances.
|
|
57
|
-
"""
|
|
58
|
-
|
|
59
|
-
BEHAVIOR = "behavior_processing_tracker.yaml"
|
|
60
|
-
"""This file is used to track the state of the behavior log processing pipeline."""
|
|
61
|
-
SUITE2P = "suite2p_processing_tracker.yaml"
|
|
62
|
-
"""This file is used to track the state of the single-day suite2p processing pipeline."""
|
|
63
|
-
DATASET = "dataset_formation_tracker.yaml"
|
|
64
|
-
"""This file is used to track the state of the dataset formation pipeline."""
|
|
65
|
-
VIDEO = "video_processing_tracker.yaml"
|
|
66
|
-
"""This file is used to track the state of the video (DeepLabCut) processing pipeline."""
|
|
67
|
-
INTEGRITY = "integrity_verification_tracker.yaml"
|
|
68
|
-
"""This file is used to track the state of the data integrity verification pipeline."""
|
|
69
|
-
|
|
70
|
-
|
|
71
49
|
@dataclass()
|
|
72
50
|
class RawData:
|
|
73
51
|
"""Stores the paths to the directories and files that make up the 'raw_data' session-specific directory.
|
|
@@ -155,6 +133,10 @@ class RawData:
|
|
|
155
133
|
runtime initialization. Since runtime initialization is a complex process that may encounter a runtime error, the
|
|
156
134
|
marker is used to discover sessions that failed to initialize. Since uninitialized sessions by definition do not
|
|
157
135
|
contain any valuable data, they are marked for immediate deletion from all managed destinations."""
|
|
136
|
+
root_path: Path = Path()
|
|
137
|
+
"""Stores the path to the root directory of the volume that stores raw data from all Sun lab projects. Primarily,
|
|
138
|
+
this is necessary for pipelines working with the data on the remote compute server to efficiently move it between
|
|
139
|
+
storage and working (processing) volumes."""
|
|
158
140
|
|
|
159
141
|
def resolve_paths(self, root_directory_path: Path) -> None:
|
|
160
142
|
"""Resolves all paths managed by the class instance based on the input root directory path.
|
|
@@ -186,6 +168,10 @@ class RawData:
|
|
|
186
168
|
self.ubiquitin_path = self.raw_data_path.joinpath("ubiquitin.bin")
|
|
187
169
|
self.nk_path = self.raw_data_path.joinpath("nk.bin")
|
|
188
170
|
|
|
171
|
+
# Infers the path to the root raw data directory under which the session's project is stored. This assumes that
|
|
172
|
+
# the raw_data directory is found under root/project/animal/session_id/raw_data
|
|
173
|
+
self.root_path = root_directory_path.parents[3]
|
|
174
|
+
|
|
189
175
|
def make_directories(self) -> None:
|
|
190
176
|
"""Ensures that all major subdirectories and the root directory exist, creating any missing directories.
|
|
191
177
|
|
|
@@ -220,12 +206,10 @@ class ProcessedData:
|
|
|
220
206
|
behavior_data_path: Path = Path()
|
|
221
207
|
"""Stores the path to the directory that contains the non-video and non-brain-activity data extracted from
|
|
222
208
|
.npz log files by the sl-behavior log processing pipeline."""
|
|
223
|
-
|
|
224
|
-
"""Stores the path to the
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
processing pipelines are not allowed to work with the session, as it may be actively integrated into one or more
|
|
228
|
-
datasets."""
|
|
209
|
+
root_path: Path = Path()
|
|
210
|
+
"""Stores the path to the root directory of the volume that stores processed data from all Sun lab projects.
|
|
211
|
+
Primarily, this is necessary for pipelines working with the data on the remote compute server to efficiently move it
|
|
212
|
+
between storage and working (processing) volumes."""
|
|
229
213
|
|
|
230
214
|
def resolve_paths(self, root_directory_path: Path) -> None:
|
|
231
215
|
"""Resolves all paths managed by the class instance based on the input root directory path.
|
|
@@ -242,7 +226,10 @@ class ProcessedData:
|
|
|
242
226
|
self.camera_data_path = self.processed_data_path.joinpath("camera_data")
|
|
243
227
|
self.mesoscope_data_path = self.processed_data_path.joinpath("mesoscope_data")
|
|
244
228
|
self.behavior_data_path = self.processed_data_path.joinpath("behavior_data")
|
|
245
|
-
|
|
229
|
+
|
|
230
|
+
# Infers the path to the root processed data directory under which the session's project is stored. This
|
|
231
|
+
# assumes that the processed_data directory is found under root/project/animal/session_id/processed_data
|
|
232
|
+
self.root_path = root_directory_path.parents[3]
|
|
246
233
|
|
|
247
234
|
def make_directories(self) -> None:
|
|
248
235
|
"""Ensures that all major subdirectories and the root directory exist, creating any missing directories.
|
|
@@ -256,6 +243,48 @@ class ProcessedData:
|
|
|
256
243
|
ensure_directory_exists(self.behavior_data_path)
|
|
257
244
|
|
|
258
245
|
|
|
246
|
+
@dataclass()
|
|
247
|
+
class TrackingData:
|
|
248
|
+
"""Stores the paths to the directories and files that make up the 'tracking_data' session-specific directory.
|
|
249
|
+
|
|
250
|
+
The 'tracking_data' directory was added in version 5.0.0 to store the ProcessingTracker instance data and .lock
|
|
251
|
+
files for pipelines and tasks used to work with session data after acquisition.
|
|
252
|
+
"""
|
|
253
|
+
|
|
254
|
+
tracking_data_path: Path = Path()
|
|
255
|
+
"""Stores the path to the root tracking_data directory of the session. This directory stores the .yaml
|
|
256
|
+
ProcessingTracker files and the .lock FileLock files that jointly ensure that the session's data is accessed in a
|
|
257
|
+
process- and thread-safe way while being processed by multiple different processes and pipelines."""
|
|
258
|
+
session_lock_path: Path = Path()
|
|
259
|
+
"""Stores the path to the session_lock.yaml file for the session. This file is used to ensure that only a single
|
|
260
|
+
manager process has exclusive access to the session's data on the remote compute server. This ensures that multiple
|
|
261
|
+
data processing pipelines can safely run for the same session without compromising session data integrity. This
|
|
262
|
+
file is intended to be used through the SessionLock class."""
|
|
263
|
+
|
|
264
|
+
def resolve_paths(self, root_directory_path: Path) -> None:
|
|
265
|
+
"""Resolves all paths managed by the class instance based on the input root directory path.
|
|
266
|
+
|
|
267
|
+
This method is called each time the (wrapper) SessionData class is instantiated to regenerate the managed path
|
|
268
|
+
hierarchy on any machine that instantiates the class.
|
|
269
|
+
|
|
270
|
+
Args:
|
|
271
|
+
root_directory_path: The path to the session's tracking_data directory. Typically, this path is assembled
|
|
272
|
+
using the following hierarchy: root/project/animal/session_id/tracking_data
|
|
273
|
+
"""
|
|
274
|
+
# Generates the managed paths
|
|
275
|
+
self.tracking_data_path = root_directory_path
|
|
276
|
+
self.session_lock_path = self.tracking_data_path.joinpath("session_lock.yaml")
|
|
277
|
+
|
|
278
|
+
def make_directories(self) -> None:
|
|
279
|
+
"""Ensures that all major subdirectories and the root directory exist, creating any missing directories.
|
|
280
|
+
|
|
281
|
+
This method is called each time the (wrapper) SessionData class is instantiated and allowed to generate
|
|
282
|
+
missing data directories.
|
|
283
|
+
"""
|
|
284
|
+
|
|
285
|
+
ensure_directory_exists(self.tracking_data_path)
|
|
286
|
+
|
|
287
|
+
|
|
259
288
|
@dataclass
|
|
260
289
|
class SessionData(YamlConfig):
|
|
261
290
|
"""Stores and manages the data layout of a single Sun lab data acquisition session.
|
|
@@ -297,21 +326,44 @@ class SessionData(YamlConfig):
|
|
|
297
326
|
"""Stores the version of the sl-experiment library that was used to acquire the session data."""
|
|
298
327
|
raw_data: RawData = field(default_factory=lambda: RawData())
|
|
299
328
|
"""Stores absolute paths to all directories and files that jointly make the session's raw data hierarchy. This
|
|
300
|
-
|
|
301
|
-
|
|
329
|
+
hierarchy is initially resolved by the acquisition system that acquires the session and used to store all data
|
|
330
|
+
acquired during the session runtime."""
|
|
302
331
|
processed_data: ProcessedData = field(default_factory=lambda: ProcessedData())
|
|
303
332
|
"""Stores absolute paths to all directories and files that jointly make the session's processed data hierarchy.
|
|
304
|
-
|
|
305
|
-
|
|
333
|
+
Processed data encompasses all data generated from the raw data as part of data processing."""
|
|
334
|
+
source_data: RawData = field(default_factory=lambda: RawData())
|
|
335
|
+
"""Stores absolute paths to the same data as the 'raw_data' field, but with all paths resolved relative to the
|
|
336
|
+
'processed_data' root. On systems that use the same root for processed and raw data, the source and raw directories
|
|
337
|
+
are identical. On systems that use different root directories for processed and raw data, the source and raw
|
|
338
|
+
directories are different. This is used to optimize data processing on the remote compute server by temporarily
|
|
339
|
+
copying all session data to the fast processed data volume."""
|
|
340
|
+
archived_data: ProcessedData = field(default_factory=lambda: ProcessedData())
|
|
341
|
+
"""Similar to the 'source_data' field, stores the absolute path to the same data as the 'processed_data' field, but
|
|
342
|
+
with all paths resolved relative to the 'raw_data' root. This path is used as part of the session data archiving
|
|
343
|
+
process to collect all session data (raw and processed) on the slow 'storage' volume of the remote compute server.
|
|
344
|
+
"""
|
|
345
|
+
tracking_data: TrackingData = field(default_factory=lambda: TrackingData())
|
|
346
|
+
"""Stores absolute paths to all directories and files that jointly make the session's tracking data hierarchy. This
|
|
347
|
+
hierarchy is used during all stages of data processing to track the processing progress and ensure only a single
|
|
348
|
+
manager process can modify the session's data at any given time, ensuring access safety."""
|
|
306
349
|
|
|
307
350
|
def __post_init__(self) -> None:
|
|
308
|
-
"""Ensures raw_data and
|
|
351
|
+
"""Ensures raw_data, processed_data, and source_data are always instances of RawData and ProcessedData."""
|
|
309
352
|
if not isinstance(self.raw_data, RawData):
|
|
310
353
|
self.raw_data = RawData()
|
|
311
354
|
|
|
312
355
|
if not isinstance(self.processed_data, ProcessedData):
|
|
313
356
|
self.processed_data = ProcessedData()
|
|
314
357
|
|
|
358
|
+
if not isinstance(self.source_data, RawData):
|
|
359
|
+
self.raw_data = RawData()
|
|
360
|
+
|
|
361
|
+
if not isinstance(self.archived_data, ProcessedData):
|
|
362
|
+
self.archived_data = ProcessedData()
|
|
363
|
+
|
|
364
|
+
if not isinstance(self.tracking_data, TrackingData):
|
|
365
|
+
self.raw_data = RawData()
|
|
366
|
+
|
|
315
367
|
@classmethod
|
|
316
368
|
def create(
|
|
317
369
|
cls,
|
|
@@ -415,6 +467,22 @@ class SessionData(YamlConfig):
|
|
|
415
467
|
processed_data = ProcessedData()
|
|
416
468
|
processed_data.resolve_paths(root_directory_path=session_path.joinpath("processed_data"))
|
|
417
469
|
|
|
470
|
+
# Added in version 5.0.0. While source data is not used when the session is created (and is set to the same
|
|
471
|
+
# directory as raw_data), it is created here for completeness.
|
|
472
|
+
source_data = RawData()
|
|
473
|
+
source_data.resolve_paths(root_directory_path=session_path.joinpath("source_data"))
|
|
474
|
+
|
|
475
|
+
# Added in version 5.0.0. While archived data is not used when the session is created (and is set to the same
|
|
476
|
+
# directory as processed_data), it is created here for completeness.
|
|
477
|
+
archived_data = ProcessedData()
|
|
478
|
+
archived_data.resolve_paths(root_directory_path=session_path.joinpath("archived_data"))
|
|
479
|
+
|
|
480
|
+
# Similar to source_data, tracking data uses the same root as raw_data and is not used during data acquisition.
|
|
481
|
+
# Tracking data is used during data processing on the remote compute server(s) to ensure multiple pipelines
|
|
482
|
+
# can work with the session's data without collision.
|
|
483
|
+
tracking_data = TrackingData()
|
|
484
|
+
tracking_data.resolve_paths(root_directory_path=session_path.joinpath("tracking_data"))
|
|
485
|
+
|
|
418
486
|
# Packages the sections generated above into a SessionData instance
|
|
419
487
|
# noinspection PyArgumentList
|
|
420
488
|
instance = SessionData(
|
|
@@ -424,6 +492,7 @@ class SessionData(YamlConfig):
|
|
|
424
492
|
session_type=session_type,
|
|
425
493
|
acquisition_system=acquisition_system.name,
|
|
426
494
|
raw_data=raw_data,
|
|
495
|
+
source_data=source_data,
|
|
427
496
|
processed_data=processed_data,
|
|
428
497
|
experiment_name=experiment_name,
|
|
429
498
|
python_version=python_version,
|
|
@@ -460,7 +529,6 @@ class SessionData(YamlConfig):
|
|
|
460
529
|
cls,
|
|
461
530
|
session_path: Path,
|
|
462
531
|
processed_data_root: Path | None = None,
|
|
463
|
-
make_processed_data_directory: bool = False,
|
|
464
532
|
) -> "SessionData":
|
|
465
533
|
"""Loads the SessionData instance from the target session's session_data.yaml file.
|
|
466
534
|
|
|
@@ -478,55 +546,85 @@ class SessionData(YamlConfig):
|
|
|
478
546
|
provide the path to the root project directory (directory that stores all Sun lab projects) on that
|
|
479
547
|
drive. The method will automatically resolve the project/animal/session/processed_data hierarchy using
|
|
480
548
|
this root path. If raw and processed data are kept on the same drive, keep this set to None.
|
|
481
|
-
make_processed_data_directory: Determines whether this method should create the processed_data directory if
|
|
482
|
-
it does not exist.
|
|
483
549
|
|
|
484
550
|
Returns:
|
|
485
551
|
An initialized SessionData instance for the session whose data is stored at the provided path.
|
|
486
552
|
|
|
487
553
|
Raises:
|
|
488
|
-
FileNotFoundError: If
|
|
554
|
+
FileNotFoundError: If multiple or no 'session_data.yaml' file instances are found under the input session
|
|
555
|
+
path directory.
|
|
489
556
|
|
|
490
557
|
"""
|
|
491
|
-
# To properly initialize the SessionData instance, the provided path should contain
|
|
492
|
-
#
|
|
493
|
-
|
|
494
|
-
if
|
|
558
|
+
# To properly initialize the SessionData instance, the provided path should contain a single session_data.yaml
|
|
559
|
+
# file at any hierarchy level.
|
|
560
|
+
session_data_files = [file for file in session_path.rglob("*session_data.yaml")]
|
|
561
|
+
if len(session_data_files) != 1:
|
|
495
562
|
message = (
|
|
496
|
-
f"Unable to load the SessionData class for the target session
|
|
497
|
-
f"
|
|
498
|
-
f"
|
|
499
|
-
f"
|
|
563
|
+
f"Unable to load the SessionData class for the target session. Expected a single session_data.yaml "
|
|
564
|
+
f"file to be located under the directory tree specified by the input path: {session_path}. Instead, "
|
|
565
|
+
f"encountered {len(session_data_files)} candidate files. This indicates that the input path does not "
|
|
566
|
+
f"point to a valid session directory."
|
|
500
567
|
)
|
|
501
568
|
console.error(message=message, error=FileNotFoundError)
|
|
502
569
|
|
|
503
|
-
#
|
|
570
|
+
# If a single candidate is found (as expected), extracts it from the list and uses it to resolve the
|
|
571
|
+
# session data hierarchy.
|
|
572
|
+
session_data_path = session_data_files.pop()
|
|
573
|
+
|
|
574
|
+
# Loads class data from the .yaml file
|
|
504
575
|
instance: SessionData = cls.from_yaml(file_path=session_data_path) # type: ignore
|
|
505
576
|
|
|
506
577
|
# The method assumes that the 'donor' .yaml file is always stored inside the raw_data directory of the session
|
|
507
|
-
# to be processed.
|
|
508
|
-
#
|
|
509
|
-
#
|
|
510
|
-
local_root = session_path.parents[2]
|
|
511
|
-
|
|
512
|
-
# RAW DATA
|
|
513
|
-
new_root = local_root.joinpath(instance.project_name, instance.animal_id, instance.session_name, "raw_data")
|
|
514
|
-
instance.raw_data.resolve_paths(root_directory_path=new_root)
|
|
578
|
+
# to be processed. In turn, that directory is expected to be found under the path root/project/animal/session.
|
|
579
|
+
# The code below uses this heuristic to discover the raw data root based on the session data file path.
|
|
580
|
+
local_root = session_data_path.parents[4] # Raw data root session directory
|
|
515
581
|
|
|
516
582
|
# Unless a different root is provided for processed data, it uses the same root as raw_data.
|
|
517
583
|
if processed_data_root is None:
|
|
518
584
|
processed_data_root = local_root
|
|
519
585
|
|
|
520
|
-
#
|
|
586
|
+
# RAW DATA
|
|
587
|
+
instance.raw_data.resolve_paths(
|
|
588
|
+
root_directory_path=local_root.joinpath(
|
|
589
|
+
instance.project_name, instance.animal_id, instance.session_name, "raw_data"
|
|
590
|
+
)
|
|
591
|
+
)
|
|
592
|
+
|
|
593
|
+
# PROCESSED DATA
|
|
521
594
|
instance.processed_data.resolve_paths(
|
|
522
595
|
root_directory_path=processed_data_root.joinpath(
|
|
523
596
|
instance.project_name, instance.animal_id, instance.session_name, "processed_data"
|
|
524
597
|
)
|
|
525
598
|
)
|
|
526
599
|
|
|
527
|
-
#
|
|
528
|
-
|
|
529
|
-
|
|
600
|
+
# SOURCE DATA
|
|
601
|
+
instance.source_data.resolve_paths(
|
|
602
|
+
root_directory_path=processed_data_root.joinpath(
|
|
603
|
+
instance.project_name, instance.animal_id, instance.session_name, "source_data"
|
|
604
|
+
)
|
|
605
|
+
)
|
|
606
|
+
# Note, since source data is populated as part of the 'preparation' runtime, does not make the directories.
|
|
607
|
+
|
|
608
|
+
# ARCHIVED DATA
|
|
609
|
+
instance.archived_data.resolve_paths(
|
|
610
|
+
root_directory_path=local_root.joinpath(
|
|
611
|
+
instance.project_name, instance.animal_id, instance.session_name, "archived_data"
|
|
612
|
+
)
|
|
613
|
+
)
|
|
614
|
+
# Similar to source_data, archived data is populated as part of the 'archiving' pipeline, so directories for
|
|
615
|
+
# this data are not created.
|
|
616
|
+
|
|
617
|
+
# If there is no archived processed data, ensures that processed data hierarchy exists.
|
|
618
|
+
if not instance.archived_data.processed_data_path.exists():
|
|
619
|
+
instance.processed_data.make_directories() # Ensures processed data directories exist
|
|
620
|
+
|
|
621
|
+
# TRACKING DATA
|
|
622
|
+
instance.tracking_data.resolve_paths(
|
|
623
|
+
root_directory_path=local_root.joinpath(
|
|
624
|
+
instance.project_name, instance.animal_id, instance.session_name, "tracking_data"
|
|
625
|
+
)
|
|
626
|
+
)
|
|
627
|
+
instance.tracking_data.make_directories() # Ensures tracking data directories exist
|
|
530
628
|
|
|
531
629
|
# Returns the initialized SessionData instance to caller
|
|
532
630
|
return instance
|
|
@@ -557,6 +655,9 @@ class SessionData(YamlConfig):
|
|
|
557
655
|
# prevents the SessionData instance from being loaded from the disk.
|
|
558
656
|
origin.raw_data = None # type: ignore
|
|
559
657
|
origin.processed_data = None # type: ignore
|
|
658
|
+
origin.source_data = None # type: ignore
|
|
659
|
+
origin.archived_data = None # type: ignore
|
|
660
|
+
origin.tracking_data = None # type: ignore
|
|
560
661
|
|
|
561
662
|
# Converts StringEnum instances to strings
|
|
562
663
|
origin.session_type = str(origin.session_type)
|
|
@@ -567,310 +668,146 @@ class SessionData(YamlConfig):
|
|
|
567
668
|
|
|
568
669
|
|
|
569
670
|
@dataclass()
|
|
570
|
-
class
|
|
571
|
-
"""
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
Note:
|
|
579
|
-
In library version 4.0.0 the processing trackers have been refactored to work similar to 'lock' files. That is,
|
|
580
|
-
when a runtime is started, the tracker is switched into the 'running' (locked) state until it is unlocked,
|
|
581
|
-
aborted, or encounters an error. When the tracker is locked, only the same manager process as the one that
|
|
582
|
-
locked the tracker is allowed to work with session data. This feature allows executing complex processing
|
|
583
|
-
pipelines that use multiple concurrent and / or sequential processing jobs on the remote server.
|
|
584
|
-
|
|
585
|
-
This instance frequently refers to a 'manager process' in method documentation. A 'manager process' is the
|
|
586
|
-
highest-level process that manages the runtime. When the runtime is executed on remote compute servers, the
|
|
587
|
-
manager process is typically the process running on the non-server machine (user PC) that executes the remote
|
|
588
|
-
processing job on the compute server (via SSH or similar protocol). The worker process(es) that run the
|
|
589
|
-
processing job(s) on the remote compute servers are NOT considered manager processes.
|
|
671
|
+
class SessionLock(YamlConfig):
|
|
672
|
+
"""Provides thread-safe session locking to ensure exclusive access during data processing.
|
|
673
|
+
|
|
674
|
+
This class manages a lock file that tracks which manager process currently has exclusive access to a session's data.
|
|
675
|
+
It prevents race conditions when multiple manager processes attempt to modify session data simultaneously.
|
|
676
|
+
|
|
677
|
+
The lock is identified by a manager process ID, allowing distributed processing across multiple jobs while
|
|
678
|
+
maintaining data integrity.
|
|
590
679
|
"""
|
|
591
680
|
|
|
592
681
|
file_path: Path
|
|
593
|
-
"""Stores the path to the .yaml file
|
|
594
|
-
|
|
595
|
-
_complete: bool = False
|
|
596
|
-
"""Tracks whether the processing runtime managed by this tracker has finished successfully."""
|
|
597
|
-
_encountered_error: bool = False
|
|
598
|
-
"""Tracks whether the processing runtime managed by this tracker has encountered an error and has finished
|
|
599
|
-
unsuccessfully."""
|
|
600
|
-
_running: bool = False
|
|
601
|
-
"""Tracks whether the processing runtime managed by this tracker is currently running."""
|
|
682
|
+
"""Stores the absolute path to the .yaml file that stores the lock state on disk."""
|
|
683
|
+
|
|
602
684
|
_manager_id: int = -1
|
|
603
|
-
"""Stores the
|
|
604
|
-
|
|
605
|
-
support processing runtimes that are distributed over multiple separate batch jobs on the compute server. This
|
|
606
|
-
ID should be generated using the 'generate_manager_id()' function exposed by this library."""
|
|
685
|
+
"""Stores the unique identifier of the manager process that holds the lock. A value of -1 indicates no lock."""
|
|
686
|
+
|
|
607
687
|
_lock_path: str = field(init=False)
|
|
608
|
-
"""Stores the path to the .lock file
|
|
609
|
-
stored inside the tracker file."""
|
|
688
|
+
"""Stores the absolute path to the .lock file ensuring thread-safe access to the lock state."""
|
|
610
689
|
|
|
611
690
|
def __post_init__(self) -> None:
|
|
612
|
-
|
|
691
|
+
"""Initializes the lock file path based on the .yaml file path."""
|
|
613
692
|
if self.file_path is not None:
|
|
614
693
|
self._lock_path = str(self.file_path.with_suffix(self.file_path.suffix + ".lock"))
|
|
615
694
|
else:
|
|
616
695
|
self._lock_path = ""
|
|
617
696
|
|
|
618
697
|
def _load_state(self) -> None:
|
|
619
|
-
"""
|
|
698
|
+
"""Loads the current lock state from the .yaml file."""
|
|
620
699
|
if self.file_path.exists():
|
|
621
|
-
|
|
622
|
-
instance: ProcessingTracker = self.from_yaml(self.file_path) # type: ignore
|
|
623
|
-
self._complete = copy.copy(instance._complete)
|
|
624
|
-
self._encountered_error = copy.copy(instance._encountered_error)
|
|
625
|
-
self._running = copy.copy(instance._running)
|
|
700
|
+
instance: SessionLock = self.from_yaml(self.file_path) # type: ignore
|
|
626
701
|
self._manager_id = copy.copy(instance._manager_id)
|
|
627
702
|
else:
|
|
628
|
-
#
|
|
629
|
-
# and saves it to disk using the specified tracker file path.
|
|
703
|
+
# Creates a new lock file with the default state (unlocked)
|
|
630
704
|
self._save_state()
|
|
631
705
|
|
|
632
706
|
def _save_state(self) -> None:
|
|
633
|
-
"""Saves the current
|
|
634
|
-
#
|
|
635
|
-
# back.
|
|
707
|
+
"""Saves the current lock state to the .yaml file."""
|
|
708
|
+
# Creates a copy without file paths for clean serialization
|
|
636
709
|
original = copy.deepcopy(self)
|
|
637
710
|
original.file_path = None # type: ignore
|
|
638
711
|
original._lock_path = None # type: ignore
|
|
639
712
|
original.to_yaml(file_path=self.file_path)
|
|
640
713
|
|
|
641
|
-
def
|
|
642
|
-
"""
|
|
643
|
-
runtime.
|
|
644
|
-
|
|
645
|
-
Calling this method effectively 'locks' the tracked session and processing runtime combination to only be
|
|
646
|
-
accessible from the manager process that calls this method. Calling this method for an already running runtime
|
|
647
|
-
managed by the same process does not have any effect, so it is safe to call this method at the beginning of
|
|
648
|
-
each processing job that makes up the runtime.
|
|
714
|
+
def acquire(self, manager_id: int) -> None:
|
|
715
|
+
"""Acquires the session lock for exclusive access.
|
|
649
716
|
|
|
650
717
|
Args:
|
|
651
|
-
manager_id: The unique
|
|
652
|
-
tracked by this tracker file.
|
|
718
|
+
manager_id: The unique identifier of the manager process requesting the lock.
|
|
653
719
|
|
|
654
720
|
Raises:
|
|
655
|
-
TimeoutError: If the .lock file for
|
|
721
|
+
TimeoutError: If the .lock file cannot be acquired for a long period of time due to being held by another
|
|
722
|
+
process.
|
|
723
|
+
RuntimeError: If the lock is held by another process and forcing lock acquisition is disabled.
|
|
656
724
|
"""
|
|
657
|
-
# Acquires the lock
|
|
658
725
|
lock = FileLock(self._lock_path)
|
|
659
726
|
with lock.acquire(timeout=10.0):
|
|
660
|
-
# Loads tracker state from the .yaml file
|
|
661
727
|
self._load_state()
|
|
662
728
|
|
|
663
|
-
#
|
|
664
|
-
if self.
|
|
729
|
+
# Checks if the session is already locked by another process
|
|
730
|
+
if self._manager_id != -1 and self._manager_id != manager_id:
|
|
665
731
|
message = (
|
|
666
|
-
f"
|
|
667
|
-
f"
|
|
668
|
-
f"
|
|
669
|
-
f"
|
|
732
|
+
f"Cannot acquire the session lock for manager process {manager_id}. The {self.file_path.name} "
|
|
733
|
+
f"session lock file indicates The lock is currently held by the manager process "
|
|
734
|
+
f"{self._manager_id}. Call the command that produced this error with the '--reset_lock' flag "
|
|
735
|
+
f"to override this safety feature or wait for the natural lock release."
|
|
670
736
|
)
|
|
671
737
|
console.error(message=message, error=RuntimeError)
|
|
672
|
-
raise RuntimeError(message)
|
|
738
|
+
raise RuntimeError(message)
|
|
673
739
|
|
|
674
|
-
#
|
|
675
|
-
#
|
|
676
|
-
elif self._running and manager_id == self._manager_id:
|
|
677
|
-
return
|
|
678
|
-
|
|
679
|
-
# Otherwise, locks the runtime for the current manager process and updates the cached tracker data
|
|
680
|
-
self._running = True
|
|
740
|
+
# The lock is free or already owned by this manager. If the lock is free, locks the session for the current
|
|
741
|
+
# manager. If it is already owned by this manager, it does nothing.
|
|
681
742
|
self._manager_id = manager_id
|
|
682
|
-
self._complete = False
|
|
683
|
-
self._encountered_error = False
|
|
684
743
|
self._save_state()
|
|
685
744
|
|
|
686
|
-
def
|
|
687
|
-
"""
|
|
688
|
-
to complete.
|
|
689
|
-
|
|
690
|
-
This method fulfills two main purposes. First, it 'unlocks' the runtime, allowing other manager processes to
|
|
691
|
-
interface with the tracked runtime. Second, it updates the tracker file to reflect that the runtime was
|
|
692
|
-
interrupted due to an error, which is used by the manager processes to detect and handle processing failures.
|
|
745
|
+
def release(self, manager_id: int) -> None:
|
|
746
|
+
"""Releases the session lock.
|
|
693
747
|
|
|
694
748
|
Args:
|
|
695
|
-
manager_id: The unique
|
|
696
|
-
runtime tracked by this tracker file has encountered an error.
|
|
749
|
+
manager_id: The unique identifier of the manager process releasing the lock.
|
|
697
750
|
|
|
698
751
|
Raises:
|
|
699
|
-
TimeoutError: If the .lock file for
|
|
752
|
+
TimeoutError: If the .lock file cannot be acquired for a long period of time due to being held by another
|
|
753
|
+
process.
|
|
754
|
+
RuntimeError: If the lock is held by another process.
|
|
700
755
|
"""
|
|
701
756
|
lock = FileLock(self._lock_path)
|
|
702
757
|
with lock.acquire(timeout=10.0):
|
|
703
|
-
# Loads tracker state from the .yaml file
|
|
704
758
|
self._load_state()
|
|
705
759
|
|
|
706
|
-
|
|
707
|
-
if not self._running:
|
|
708
|
-
return
|
|
709
|
-
|
|
710
|
-
# Ensures that only the active manager process can report runtime errors using the tracker file
|
|
711
|
-
if manager_id != self._manager_id:
|
|
760
|
+
if self._manager_id != manager_id:
|
|
712
761
|
message = (
|
|
713
|
-
f"Unable to
|
|
714
|
-
f"
|
|
715
|
-
f"
|
|
716
|
-
f"with the runtime."
|
|
762
|
+
f"Unable to release the session lock from the manager with id {manager_id}. The "
|
|
763
|
+
f"{self.file_path.name} session lock file indicates that the lock is held by the process with "
|
|
764
|
+
f"id {self._manager_id}, preventing other processes from interfacing with the session lock."
|
|
717
765
|
)
|
|
718
766
|
console.error(message=message, error=RuntimeError)
|
|
719
767
|
raise RuntimeError(message) # Fallback to appease mypy, should not be reachable
|
|
720
768
|
|
|
721
|
-
#
|
|
722
|
-
self._running = False
|
|
769
|
+
# Releases the lock
|
|
723
770
|
self._manager_id = -1
|
|
724
|
-
self._complete = False
|
|
725
|
-
self._encountered_error = True
|
|
726
771
|
self._save_state()
|
|
727
772
|
|
|
728
|
-
def
|
|
729
|
-
"""
|
|
730
|
-
|
|
731
|
-
This method 'unlocks' the runtime, allowing other manager processes to interface with the tracked runtime. It
|
|
732
|
-
also configures the tracker file to indicate that the runtime has been completed successfully, which is used
|
|
733
|
-
by the manager processes to detect and handle processing completion.
|
|
773
|
+
def force_release(self) -> None:
|
|
774
|
+
"""Forcibly releases the lock regardless of ownership.
|
|
734
775
|
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
runtime tracked by this tracker file has been completed successfully.
|
|
776
|
+
This method should only be used for emergency recovery of deadlocked sessions. It can be called by any process
|
|
777
|
+
to unlock the session whose lock is managed by this instance.
|
|
738
778
|
|
|
739
779
|
Raises:
|
|
740
|
-
TimeoutError: If the .lock file for
|
|
780
|
+
TimeoutError: If the .lock file cannot be acquired for a long period of time due to being held by another
|
|
781
|
+
process.
|
|
741
782
|
"""
|
|
742
783
|
lock = FileLock(self._lock_path)
|
|
743
784
|
with lock.acquire(timeout=10.0):
|
|
744
|
-
# Loads tracker state from the .yaml file
|
|
745
|
-
self._load_state()
|
|
746
|
-
|
|
747
|
-
# If the runtime is not running, does not do anything
|
|
748
|
-
if not self._running:
|
|
749
|
-
return
|
|
750
|
-
|
|
751
|
-
# Ensures that only the active manager process can report runtime completion using the tracker file
|
|
752
|
-
if manager_id != self._manager_id:
|
|
753
|
-
message = (
|
|
754
|
-
f"Unable to report that the processing runtime has completed successfully from the manager process "
|
|
755
|
-
f"with id {manager_id}. The {self.file_path.name} tracker file indicates that the runtime is "
|
|
756
|
-
f"managed by the process with id {self._manager_id}, preventing other processes from interfacing "
|
|
757
|
-
f"with the runtime."
|
|
758
|
-
)
|
|
759
|
-
console.error(message=message, error=RuntimeError)
|
|
760
|
-
raise RuntimeError(message) # Fallback to appease mypy, should not be reachable
|
|
761
|
-
|
|
762
|
-
# Otherwise, marks the runtime as complete (stopped)
|
|
763
|
-
self._running = False
|
|
764
785
|
self._manager_id = -1
|
|
765
|
-
self._complete = True
|
|
766
|
-
self._encountered_error = False
|
|
767
786
|
self._save_state()
|
|
768
787
|
|
|
769
|
-
|
|
770
|
-
|
|
788
|
+
@property
|
|
789
|
+
def is_locked(self) -> bool:
|
|
790
|
+
"""Returns True if the session is currently locked by any process, False otherwise.
|
|
771
791
|
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
runtime.
|
|
792
|
+
Raises:
|
|
793
|
+
TimeoutError: If the .lock file cannot be acquired for a long period of time due to being held by another
|
|
794
|
+
process.
|
|
776
795
|
"""
|
|
777
796
|
lock = FileLock(self._lock_path)
|
|
778
797
|
with lock.acquire(timeout=10.0):
|
|
779
|
-
# Loads tracker state from the .yaml file
|
|
780
|
-
self._load_state()
|
|
781
|
-
|
|
782
|
-
# Resets the tracker file to the default state. Note, does not indicate that the runtime is complete nor
|
|
783
|
-
# that it has encountered an error.
|
|
784
|
-
self._running = False
|
|
785
|
-
self._manager_id = -1
|
|
786
|
-
self._complete = False
|
|
787
|
-
self._encountered_error = False
|
|
788
|
-
self._save_state()
|
|
789
|
-
|
|
790
|
-
@property
|
|
791
|
-
def is_complete(self) -> bool:
|
|
792
|
-
"""Returns True if the tracker wrapped by the instance indicates that the processing runtime has been completed
|
|
793
|
-
successfully and that the runtime is not currently ongoing."""
|
|
794
|
-
lock = FileLock(self._lock_path)
|
|
795
|
-
with lock.acquire(timeout=10.0):
|
|
796
|
-
# Loads tracker state from the .yaml file
|
|
797
798
|
self._load_state()
|
|
798
|
-
return self.
|
|
799
|
+
return self._manager_id != -1
|
|
799
800
|
|
|
800
801
|
@property
|
|
801
|
-
def
|
|
802
|
-
"""Returns
|
|
803
|
-
|
|
804
|
-
lock = FileLock(self._lock_path)
|
|
805
|
-
with lock.acquire(timeout=10.0):
|
|
806
|
-
# Loads tracker state from the .yaml file
|
|
807
|
-
self._load_state()
|
|
808
|
-
return self._encountered_error
|
|
802
|
+
def owner(self) -> int | None:
|
|
803
|
+
"""Returns the unique identifier of the manager process that holds the lock if the session is locked or None if
|
|
804
|
+
the session is unlocked.
|
|
809
805
|
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
806
|
+
Raises:
|
|
807
|
+
TimeoutError: If the .lock file cannot be acquired for a long period of time due to being held by another
|
|
808
|
+
process.
|
|
809
|
+
"""
|
|
814
810
|
lock = FileLock(self._lock_path)
|
|
815
811
|
with lock.acquire(timeout=10.0):
|
|
816
|
-
# Loads tracker state from the .yaml file
|
|
817
812
|
self._load_state()
|
|
818
|
-
return self.
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
def get_processing_tracker(root: Path, file_name: TrackerFileNames | str) -> ProcessingTracker:
|
|
822
|
-
"""Initializes and returns the ProcessingTracker instance that manages the data stored inside the target processing
|
|
823
|
-
tracker file.
|
|
824
|
-
|
|
825
|
-
This function uses the input root path and tracker file name to first resolve the absolute path to the .yaml data
|
|
826
|
-
cache of the target processing tracker file and then wrap the file into a ProcessingTracker instance. All Sun lab
|
|
827
|
-
libraries that use ProcessingTracker instances use this function to access the necessary trackers.
|
|
828
|
-
|
|
829
|
-
Notes:
|
|
830
|
-
If the target file does not exist, this function will create the file as part of the ProcessingTracker
|
|
831
|
-
initialization.
|
|
832
|
-
|
|
833
|
-
This function also generates the corresponding .lock file to ensure that the data inside the processing tracker
|
|
834
|
-
is accessed by a single process at a time.
|
|
835
|
-
|
|
836
|
-
Args:
|
|
837
|
-
file_name: The name of the target processing tracker file. Has to be one of the names from the TrackerFileNames
|
|
838
|
-
enumeration.
|
|
839
|
-
root: The absolute path to the directory where the target file is stored or should be created.
|
|
840
|
-
|
|
841
|
-
Returns:
|
|
842
|
-
The initialized ProcessingTracker instance that manages the data stored in the target file.
|
|
843
|
-
"""
|
|
844
|
-
|
|
845
|
-
# Prevents using the function for unsupported tracker file names.
|
|
846
|
-
supported_files = tuple(TrackerFileNames)
|
|
847
|
-
if file_name not in supported_files:
|
|
848
|
-
message = (
|
|
849
|
-
f"Unable to construct the path to the tracker file {file_name}. The input name is not one of the supported"
|
|
850
|
-
f"names. Use one of the supported options provided by the TrackerFileNames enumeration."
|
|
851
|
-
)
|
|
852
|
-
console.error(message=message, error=ValueError)
|
|
853
|
-
|
|
854
|
-
# Constructs and returns the absolute path to the requested tracker file.
|
|
855
|
-
tracker_path = root.joinpath(file_name)
|
|
856
|
-
return ProcessingTracker(file_path=tracker_path)
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
def generate_manager_id() -> int:
|
|
860
|
-
"""Generates and returns a unique integer identifier that can be used to identify the manager process that calls
|
|
861
|
-
this function.
|
|
862
|
-
|
|
863
|
-
The identifier is generated based on the current timestamp, accurate to microseconds, and a random number between 1
|
|
864
|
-
and 9999999999999. This ensures that the identifier is unique for each function call. The generated identifier
|
|
865
|
-
string is converted to a unique integer value using the xxHash-64 algorithm before it is returned to the caller.
|
|
866
|
-
|
|
867
|
-
Notes:
|
|
868
|
-
This function should be used to generate manager process identifiers for working with ProcessingTracker
|
|
869
|
-
instances from sl-shared-assets version 4.0.0 and above.
|
|
870
|
-
"""
|
|
871
|
-
timestamp = get_timestamp()
|
|
872
|
-
random_number = randint(1, 9999999999999)
|
|
873
|
-
manager_id = f"{timestamp}_{random_number}"
|
|
874
|
-
id_hash = xxh3_64()
|
|
875
|
-
id_hash.update(manager_id)
|
|
876
|
-
return id_hash.intdigest()
|
|
813
|
+
return self._manager_id if self._manager_id != -1 else None
|