sl-shared-assets 6.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sl_shared_assets/__init__.py +120 -0
- sl_shared_assets/command_line_interfaces/__init__.py +3 -0
- sl_shared_assets/command_line_interfaces/configure.py +318 -0
- sl_shared_assets/data_classes/__init__.py +121 -0
- sl_shared_assets/data_classes/configuration_data.py +939 -0
- sl_shared_assets/data_classes/dataset_data.py +385 -0
- sl_shared_assets/data_classes/processing_data.py +385 -0
- sl_shared_assets/data_classes/runtime_data.py +237 -0
- sl_shared_assets/data_classes/session_data.py +400 -0
- sl_shared_assets/data_classes/surgery_data.py +138 -0
- sl_shared_assets/data_transfer/__init__.py +12 -0
- sl_shared_assets/data_transfer/checksum_tools.py +125 -0
- sl_shared_assets/data_transfer/transfer_tools.py +181 -0
- sl_shared_assets/py.typed +0 -0
- sl_shared_assets-6.1.1.dist-info/METADATA +830 -0
- sl_shared_assets-6.1.1.dist-info/RECORD +19 -0
- sl_shared_assets-6.1.1.dist-info/WHEEL +4 -0
- sl_shared_assets-6.1.1.dist-info/entry_points.txt +2 -0
- sl_shared_assets-6.1.1.dist-info/licenses/LICENSE +674 -0
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""This module provides the assets for computing data integrity checksums."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from functools import partial
|
|
6
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
7
|
+
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
import xxhash
|
|
10
|
+
|
|
11
|
+
# Defines a 'blacklist' set of files. Primarily, this set contains the service files that may change after the session
# data has been acquired. Therefore, it does not make sense to include them in the checksum, as they do not reflect the
# data that should remain permanently unchanged.
# 'ax_checksum.txt' is the checksum file written by calculate_directory_checksum() itself, so it must never contribute
# to the checksum it stores. 'nk.bin' is presumably another mutable service file — TODO confirm its origin.
_excluded_files = {
    "ax_checksum.txt",
    "nk.bin",
}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _calculate_file_checksum(base_directory: Path, file_path: Path) -> tuple[str, bytes]:  # pragma: no cover
    """Calculates the xxHash3-128 checksum for the target file and its path relative to the base directory.

    Args:
        base_directory: The path to the directory being processed by the 'calculate_directory_checksum' function.
        file_path: The path to the target file relative to the base directory.

    Returns:
        A tuple with two elements. The first element is the path to the file relative to the base directory. The second
        element is the xxHash3-128 checksum that reflects the file's path and data.
    """
    hasher = xxhash.xxh3_128()

    # Seeds the hash with the file's relative path, so the final value reflects both file contents and the layout
    # of the encoded directory structure.
    rel_path = str(file_path.relative_to(base_directory))
    hasher.update(rel_path.encode())

    # Folds the file data into the hash in 8 MB chunks to cap memory usage at the cost of slightly reduced throughput.
    with file_path.open("rb") as handle:
        while block := handle.read(8 * 1024 * 1024):
            hasher.update(block)

    # The relative path is returned alongside the digest even though it is already mixed into the hash: the caller
    # re-encodes it at the directory level, which guards against future changes to this per-file logic.
    return rel_path, hasher.digest()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def calculate_directory_checksum(
    directory: Path, num_processes: int | None = None, *, progress: bool = False, save_checksum: bool = True
) -> str:
    """Calculates the xxHash3-128 checksum for the input directory.

    Note:
        The function can be configured to write the generated checksum as a hexadecimal string to the ax_checksum.txt
        file stored at the highest level of the input directory.

        The xxHash3 checksum is not suitable for security purposes and is only used to ensure data integrity.

        The returned checksum accounts for both the contents of each file and the layout of the input directory
        structure.

    Args:
        directory: The path to the directory for which to generate the checksum.
        num_processes: The number of processes to use for parallelizing checksum calculation. If set to None, the
            function uses all available CPU cores.
        progress: Determines whether to track the checksum calculation progress using a progress bar.
        save_checksum: Determines whether the checksum should be saved (written to) a .txt file.

    Returns:
        The xxHash3-128 checksum for the input directory as a hexadecimal string.
    """
    # Determines the number of parallel processes to use. os.cpu_count() is documented to return None when the core
    # count cannot be determined, so the 'or 1' fallback guarantees at least one worker instead of a TypeError.
    if num_processes is None:
        num_processes = max(1, os.cpu_count() or 1)

    # Determines the path to each file inside the input directory structure and sorts the paths for run-to-run
    # consistency. Service files listed in _excluded_files are skipped, as they may change after data acquisition.
    # path.name is the file name including its suffix, which is the form stored in the exclusion set.
    files = sorted(path for path in directory.rglob("*") if path.is_file() and path.name not in _excluded_files)

    # Pre-creates the directory-level checksum object.
    checksum = xxhash.xxh3_128()

    # Processes files in parallel.
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        # Creates the partial function with a fixed base_directory (the first argument of _calculate_file_checksum()).
        process_file = partial(_calculate_file_checksum, directory)

        # Submits all per-file tasks to be executed in parallel.
        # noinspection PyTypeChecker
        future_to_path = {executor.submit(process_file, file): file for file in files}

        # Collects (relative_path, digest) results as they complete.
        results = []
        if progress:
            with tqdm(
                total=len(files), desc=f"Calculating checksum for {Path(*directory.parts[-4:-1])}", unit="file"
            ) as pbar:
                for future in as_completed(future_to_path):
                    results.append(future.result())
                    pbar.update(1)
        else:
            # For batch mode, uses a direct list comprehension with as_completed. This avoids the overhead of progress
            # tracking while maintaining parallel processing, avoiding terminal clutter in batched contexts.
            results = [future.result() for future in as_completed(future_to_path)]

    # Sorts results for consistency and combines them into the final checksum. The relative path is re-encoded here
    # (in addition to being encoded per-file) to protect against future changes to the per-file hashing logic.
    for file_path, file_checksum in sorted(results):
        checksum.update(file_path.encode())
        checksum.update(file_checksum)

    checksum_hexstring = checksum.hexdigest()

    # Writes the checksum to ax_checksum.txt in the root directory.
    if save_checksum:
        checksum_path = directory.joinpath("ax_checksum.txt")
        with checksum_path.open("w") as f:
            f.write(checksum_hexstring)

    return checksum_hexstring
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""This module provides the assets for moving the data between destinations available on the host-machine's filesystem
|
|
2
|
+
and removing the data from the host-machine.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import shutil
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
9
|
+
|
|
10
|
+
from tqdm import tqdm
|
|
11
|
+
from ataraxis_time import PrecisionTimer
|
|
12
|
+
from ataraxis_base_utilities import console, ensure_directory_exists
|
|
13
|
+
|
|
14
|
+
from .checksum_tools import calculate_directory_checksum
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def delete_directory(directory_path: Path) -> None:
    """Deletes the target directory and all its subdirectories using parallel processing.

    Args:
        directory_path: The path to the directory to delete.
    """
    # Aborts early when there is nothing to remove.
    if not directory_path.exists():
        return

    # Takes a single snapshot of the directory's immediate children and separates files from subdirectories.
    children = tuple(directory_path.iterdir())
    files = [entry for entry in children if entry.is_file()]
    subdirectories = [entry for entry in children if entry.is_dir()]

    # Removes the files at this level in parallel. The map iterator is drained to force completion of all tasks.
    with ThreadPoolExecutor() as pool:
        for _ in pool.map(os.unlink, files):
            pass

    # Descends into each subdirectory and removes it the same way.
    for child_directory in subdirectories:
        delete_directory(child_directory)

    # Removes the now-empty root directory. Since Windows is sometimes slow to release file handles, each failed
    # attempt is followed by a 500 ms pause before retrying, up to five attempts in total.
    retry_timer = PrecisionTimer("ms")
    attempts_left = 5
    while attempts_left:
        attempts_left -= 1
        # noinspection PyBroadException
        try:
            directory_path.rmdir()
            return  # Ends early once the removal succeeds
        except Exception:  # pragma: no cover
            retry_timer.delay(block=False, delay=500, allow_sleep=True)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _transfer_file(source_file: Path, source_directory: Path, destination_directory: Path) -> None:
    """Copies the input file from the source directory to the destination directory while preserving the file metadata.

    This worker method is used by the transfer_directory() method to move multiple files in parallel.

    Notes:
        If the file is found under a hierarchy of subdirectories inside the input source_directory, that hierarchy will
        be preserved in the destination directory. The destination subdirectory is expected to already exist.

    Args:
        source_file: The file to be copied.
        source_directory: The root directory where the file is located.
        destination_directory: The destination directory where to move the file.
    """
    # Mirrors the file's position relative to the source root under the destination root, then copies with metadata.
    destination_file = destination_directory / source_file.relative_to(source_directory)
    shutil.copy2(source_file, destination_file)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def transfer_directory(
    source: Path,
    destination: Path,
    num_threads: int = 1,
    *,
    verify_integrity: bool = False,
    remove_source: bool = False,
    progress: bool = False,
) -> None:
    """Copies the contents of the input source directory to the destination directory while preserving the underlying
    directory hierarchy.

    Notes:
        This function recreates the moved directory hierarchy on the destination if the hierarchy does not exist. This
        is done before copying the files.

        The function executes a multithreaded copy operation and does not by default remove the source data after the
        copy is complete.

        If the function is configured to verify the transferred data's integrity, it generates an xxHash3-128 checksum
        of the data before and after the transfer and compares the two checksums to detect data corruption.

    Args:
        source: The path to the directory to be transferred.
        destination: The path to the destination directory where to move the contents of the source directory.
        num_threads: The number of threads to use for the parallel file transfer. Setting this value to a number below
            1 instructs the function to use all available CPU threads.
        verify_integrity: Determines whether to perform integrity verification for the transferred files.
        remove_source: Determines whether to remove the source directory after the transfer is complete and
            (optionally) verified.
        progress: Determines whether to track the transfer progress using a progress bar.

    Raises:
        RuntimeError: If the transferred files do not pass the xxHash3-128 checksum integrity verification.
    """
    # Aborts with an error if the source does not exist. NOTE(review): execution continuing past this call assumes
    # console.error raises the supplied error type — confirm against ataraxis_base_utilities.
    if not source.exists():
        message = f"Unable to transfer the source directory {source}, as it does not exist."
        console.error(message=message, error=FileNotFoundError)

    # If the number of threads is less than 1, interprets this as a directive to use all available CPU cores.
    # os.cpu_count() may return None, in which case the transfer falls back to a single thread.
    if num_threads < 1:
        cpu_count = os.cpu_count()
        num_threads = cpu_count if cpu_count is not None else 1

    # If transfer integrity verification is enabled, but the source directory does not contain the 'ax_checksum.txt'
    # file, checksums the directory (and saves the checksum file) before the transfer operation.
    if verify_integrity and not source.joinpath("ax_checksum.txt").exists():
        calculate_directory_checksum(directory=source, progress=False, save_checksum=True)

    # Ensures the destination root directory exists.
    ensure_directory_exists(destination)

    # Collects all items (files and directories) in the source directory.
    all_items = tuple(source.rglob("*"))

    # Loops over all items (files and directories). Adds files to the file_list variable. Uses directories to reinstate
    # the source subdirectory hierarchy in the destination directory. Sorting by relative path depth guarantees that
    # parent directories are created before their children.
    file_list = []
    for item in sorted(all_items, key=lambda x: len(x.relative_to(source).parts)):
        # Recreates directory structure on destination
        if item.is_dir():
            dest_dir = destination / item.relative_to(source)
            dest_dir.mkdir(parents=True, exist_ok=True)
        # Also builds the list of files to be moved
        else:  # is_file()
            file_list.append(item)

    # Copies the data to the destination. For parallel workflows, the method uses the ThreadPoolExecutor to move
    # multiple files at the same time. Since I/O operations do not hold GIL, we do not need to parallelize with
    # Processes here.
    if num_threads > 1:
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = {executor.submit(_transfer_file, file, source, destination): file for file in file_list}
            for future in tqdm(
                as_completed(futures),
                total=len(file_list),
                desc=f"Transferring files to {Path(*destination.parts[-6:])}",
                unit="file",
                disable=not progress,
            ):
                # Propagates any exceptions from the file transfer.
                future.result()
    else:
        # Single-threaded path: transfers the files sequentially with the same (optional) progress reporting.
        for file in tqdm(
            file_list,
            desc=f"Transferring files to {Path(*destination.parts[-6:])}",
            unit="file",
            disable=not progress,
        ):
            _transfer_file(file, source, destination)

    # Verifies the integrity of the transferred directory by rerunning the xxHash3-128 calculation on the destination
    # and comparing it against the checksum recorded in the source's ax_checksum.txt file.
    if verify_integrity:
        destination_checksum = calculate_directory_checksum(directory=destination, progress=False, save_checksum=False)
        with source.joinpath("ax_checksum.txt").open("r") as local_checksum:
            if destination_checksum != local_checksum.readline().strip():
                message = (
                    f"Checksum mismatch detected when transferring {Path(*source.parts[-6:])} to "
                    f"{Path(*destination.parts[-6:])}! The data was likely corrupted in transmission."
                )
                console.error(message=message, error=RuntimeError)

    # If necessary, removes the transferred directory from the original location. This only runs after (optional)
    # verification, so a checksum mismatch leaves the source data intact.
    if remove_source:
        message = (
            f"Removing the now-redundant source directory {source} and all of its contents following the successful "
            f"transfer..."
        )
        console.echo(message=message)
        delete_directory(source)
|
|
File without changes
|