esrf-data-compressor 0.1.1__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {esrf_data_compressor-0.1.1/src/esrf_data_compressor.egg-info → esrf_data_compressor-0.2.0}/PKG-INFO +4 -2
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/README.md +4 -2
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/pyproject.toml +2 -2
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/checker/run_check.py +6 -5
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/cli.py +51 -20
- esrf_data_compressor-0.2.0/src/esrf_data_compressor/compressors/base.py +271 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_cli.py +93 -7
- esrf_data_compressor-0.2.0/src/esrf_data_compressor/tests/test_paths.py +36 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_run_check.py +18 -0
- esrf_data_compressor-0.2.0/src/esrf_data_compressor/utils/paths.py +81 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0/src/esrf_data_compressor.egg-info}/PKG-INFO +4 -2
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/SOURCES.txt +2 -0
- esrf_data_compressor-0.1.1/src/esrf_data_compressor/compressors/base.py +0 -167
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/LICENSE +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/setup.cfg +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/__init__.py +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/checker/ssim.py +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/compressors/__init__.py +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/compressors/jp2k.py +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/finder/finder.py +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/__init__.py +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_finder.py +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_hdf5_helpers.py +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_jp2k.py +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_ssim.py +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_utils.py +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/utils/hdf5_helpers.py +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/utils/utils.py +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/dependency_links.txt +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/entry_points.txt +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/requires.txt +0 -0
- {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/top_level.txt +0 -0
{esrf_data_compressor-0.1.1/src/esrf_data_compressor.egg-info → esrf_data_compressor-0.2.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: esrf-data-compressor
-Version: 0.1.1
+Version: 0.2.0
 Summary: A library to compress ESRF data and reduce their footprint
 Author-email: ESRF <dau-pydev@esrf.fr>
 License: MIT License
@@ -84,7 +84,9 @@ Dynamic: license-file
 
 * **Non-destructive workflow**
 
-  1. `compress` writes
+  1. `compress` writes compressed files either:
+     - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
+     - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
   2. `check` computes SSIM (first and last frames) and writes a report
   3. `overwrite` (optional) swaps out the raw frame file (irreversible)
 
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/README.md
RENAMED
@@ -23,7 +23,9 @@
 
 * **Non-destructive workflow**
 
-  1. `compress` writes
+  1. `compress` writes compressed files either:
+     - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
+     - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
   2. `check` computes SSIM (first and last frames) and writes a report
   3. `overwrite` (optional) swaps out the raw frame file (irreversible)
 
@@ -119,4 +121,4 @@ All noteworthy changes are recorded in [CHANGELOG.md](CHANGELOG.md). Version 0.1
 * Four-command CLI (`compress-hdf5 list`, `compress-hdf5 compress`, `compress-hdf5 check`, `compress-hdf5 overwrite`).
 * Parallelism with worker×thread auto-factoring.
 
-For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).
\ No newline at end of file
+For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).
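As an illustration of the two layouts described in the README hunk above, here is a minimal sketch using the resolve_compressed_path helper added in this release (src/esrf_data_compressor/utils/paths.py, further down in this diff); the example paths are invented:

    from esrf_data_compressor.utils.paths import resolve_compressed_path

    raw = "/data/visitor/exp1/id00/session/RAW_DATA/sampleA/ds1/frames.h5"

    # sibling: compressed copy lands next to the source, with a method suffix.
    print(resolve_compressed_path(raw, "jp2k", layout="sibling"))
    # → /data/visitor/exp1/id00/session/RAW_DATA/sampleA/ds1/frames_jp2k.h5

    # mirror (the new default): same file name, under a RAW_DATA_COMPRESSED tree.
    print(resolve_compressed_path(raw, "jp2k", layout="mirror"))
    # → /data/visitor/exp1/id00/session/RAW_DATA_COMPRESSED/sampleA/ds1/frames.h5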
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/pyproject.toml
RENAMED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "esrf-data-compressor"
-version = "0.1.1"
+version = "0.2.0"
 authors = [{ name = "ESRF", email = "dau-pydev@esrf.fr" }]
 description = "A library to compress ESRF data and reduce their footprint"
 readme = { file = "README.md", content-type = "text/markdown" }
@@ -69,4 +69,4 @@ omit = ["*/tests/*"]
 
 [tool.isort]
 profile = "black"
-force_single_line = true
\ No newline at end of file
+force_single_line = true
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/checker/run_check.py
RENAMED
@@ -3,12 +3,15 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
 from tqdm import tqdm
 
 from esrf_data_compressor.checker.ssim import compute_ssim_for_file_pair
+from esrf_data_compressor.utils.paths import resolve_compressed_path
 
 
-def run_ssim_check(raw_files: list[str], method: str, report_path: str) -> None:
+def run_ssim_check(
+    raw_files: list[str], method: str, report_path: str, layout: str = "sibling"
+) -> None:
     """
     Given a list of raw HDF5 file paths, partitions into:
-      to_check → those with
+      to_check → those with an expected compressed counterpart according to `layout`
       missing  → those without one
 
     Writes a report to `report_path`:
@@ -21,9 +24,7 @@ def run_ssim_check(raw_files: list[str], method: str, report_path: str) -> None:
 
     # partition
     for orig in raw_files:
-        dirname, fname = os.path.split(orig)
-        stem, _ = os.path.splitext(fname)
-        comp_path = os.path.join(dirname, f"{stem}_{method}.h5")
+        comp_path = resolve_compressed_path(orig, method, layout=layout)
         if os.path.exists(comp_path):
            to_check.append((orig, comp_path))
         else:
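A minimal usage sketch of the new run_ssim_check signature (the file list is invented):

    from esrf_data_compressor.checker.run_check import run_ssim_check

    raw_files = ["/data/visitor/exp1/id00/session/RAW_DATA/sampleA/ds1/frames.h5"]

    # layout="mirror" looks for the counterpart under RAW_DATA_COMPRESSED with the
    # same file name; layout="sibling" looks for frames_jp2k.h5 next to the source.
    run_ssim_check(raw_files, "jp2k", report_path="ssim_report.txt", layout="mirror")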
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/cli.py
RENAMED
@@ -46,13 +46,13 @@ def do_compress(args):
         exit_with_error(f"Failed to read report '{report}': {e}")
 
     if not files:
-        print("Nothing to compress (TO
+        print("Nothing to compress (TO COMPRESS list is empty).")
         return
 
     print(
-        f"Compressing {len(files)} file(s) from '{report}' using '{args.method}' method
+        f"Compressing {len(files)} file(s) from '{report}' using '{args.method}' method, ratio {args.cratio}, layout '{args.layout}' …"
     )
-    mgr = CompressorManager(cratio=args.cratio, method=args.method)
+    mgr = CompressorManager(cratio=args.cratio, method=args.method, layout=args.layout)
     mgr.compress_files(files)
     print("Compression complete.\n")
 
@@ -65,15 +65,14 @@ def do_check(args):
         exit_with_error(f"Failed to read report '{report}': {e}")
 
     if not files:
-        print("Nothing to check (TO
+        print("Nothing to check (TO COMPRESS list is empty).")
         return
 
-    # We reuse run_ssim_check in its 3‑arg form (raw_files, method, report_path)
     report_fname = f"{os.path.splitext(report)[0]}_{args.method}_ssim_report.txt"
     report_path = os.path.abspath(report_fname)
 
     try:
-        run_ssim_check(files, args.method, report_path)
+        run_ssim_check(files, args.method, report_path, layout=args.layout)
     except SystemExit as e:
         exit_with_error(str(e))
 
@@ -81,9 +80,6 @@ def do_check(args):
 
 
 def do_overwrite(args):
-    """
-    Overwrite TO COMPRESS files with their original sources.
-    """
     report = args.input or "file_list.txt"
     try:
         files = parse_report(report)
@@ -91,13 +87,26 @@ def do_overwrite(args):
         exit_with_error(f"Failed to read report '{report}': {e}")
 
     if not files:
-        print("Nothing to
+        print("Nothing to process (TO COMPRESS list is empty).")
         return
 
-    print(f"Overwriting {len(files)} file(s) from '{report}' …")
     mgr = CompressorManager()
+
+    if args.final:
+        print(f"Finalizing overwrite for {len(files)} file(s) from '{report}' …")
+        mgr.remove_backups(files)
+        print("Finalize step complete.\n")
+        return
+
+    if args.undo:
+        print(f"Undoing overwrite for {len(files)} file(s) from '{report}' …")
+        mgr.restore_backups(files)
+        print("Undo step complete.\n")
+        return
+
+    print(f"Overwriting {len(files)} file(s) from '{report}' …")
     mgr.overwrite_files(files)
-    print("Overwrite complete.\n")
+    print("Overwrite complete (backups kept).\n")
 
 
 def main():
@@ -106,7 +115,6 @@ def main():
     )
     sub = parser.add_subparsers(dest="command", required=True)
 
-    # list
     p = sub.add_parser("list", help="Report VDS sources → TO COMPRESS vs REMAINING")
     p.add_argument("experiment", help="Experiment ID")
     p.add_argument("beamline", nargs="?", help="Optional beamline")
@@ -115,13 +123,12 @@ def main():
     p.add_argument(
         "--filter",
         metavar="KEY:VAL[,KEY2:VAL2...]",
-        help="Dataset
+        help="Dataset-level attribute substring filters",
     )
     p.add_argument("--output", help="Report file (default = file_list.txt)")
     p.set_defaults(func=do_list)
 
-
-    p = sub.add_parser("compress", help="Compress only the TO COMPRESS files")
+    p = sub.add_parser("compress", help="Compress only the TO COMPRESS files")
     p.add_argument(
         "--input",
         "-i",
@@ -135,23 +142,47 @@ def main():
         default="jp2k",
         help="Compression method",
     )
+    p.add_argument(
+        "--layout",
+        choices=["sibling", "mirror"],
+        default="mirror",
+        help="Output layout: sibling (next to each source) or mirror (under RAW_DATA_COMPRESSED, preserving source names).",
+    )
     p.set_defaults(func=do_compress)
 
-
-    p = sub.add_parser("check", help="Generate SSIM report for TO COMPRESS files")
+    p = sub.add_parser("check", help="Generate SSIM report for TO COMPRESS files")
     p.add_argument(
         "--input", "-i", help="Report file to read (default = file_list.txt)"
     )
     p.add_argument(
         "--method", choices=["jp2k"], default="jp2k", help="Compression method"
     )
+    p.add_argument(
+        "--layout",
+        choices=["sibling", "mirror"],
+        default="mirror",
+        help="Location of compressed files to check.",
+    )
     p.set_defaults(func=do_check)
 
-
-
+    p = sub.add_parser(
+        "overwrite",
+        help="Swap in compressed files and keep backups; with --final or --undo, perform cleanup/restore only.",
+    )
     p.add_argument(
         "--input", "-i", help="Report file to read (default = file_list.txt)"
     )
+    group = p.add_mutually_exclusive_group()
+    group.add_argument(
+        "--final",
+        action="store_true",
+        help="Cleanup only: delete existing *.h5.bak backups after confirmation (no overwrite).",
+    )
+    group.add_argument(
+        "--undo",
+        action="store_true",
+        help="Restore only: move <file>.h5.bak back to <file>.h5 and preserve the current file as <file>_<method>.h5 when needed.",
+    )
     p.set_defaults(func=do_overwrite)
 
     args = parser.parse_args()
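The overwrite subcommand is now a staged, reversible workflow. In terms of the CompressorManager API (added in compressors/base.py, next in this diff), the three CLI forms map onto three manager calls; a sketch with an invented file list, where each call corresponds to one CLI invocation rather than being run in sequence:

    from esrf_data_compressor.compressors.base import CompressorManager

    files = ["/data/visitor/exp1/id00/session/RAW_DATA/sampleA/ds1/frames.h5"]
    mgr = CompressorManager(method="jp2k", layout="mirror")

    mgr.overwrite_files(files)   # overwrite: swap compressed files in, keep <file>.h5.bak
    mgr.restore_backups(files)   # overwrite --undo: move each .bak back into place
    mgr.remove_backups(files)    # overwrite --final: delete .bak files after confirmation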
esrf_data_compressor-0.2.0/src/esrf_data_compressor/compressors/base.py
ADDED
@@ -0,0 +1,271 @@
+import os
+import shutil
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from tqdm import tqdm
+
+from esrf_data_compressor.compressors.jp2k import JP2KCompressorWrapper
+from esrf_data_compressor.utils.paths import (
+    find_dataset_base_h5,
+    resolve_compressed_path,
+    resolve_mirror_path,
+)
+
+
+class Compressor:
+    """
+    Abstract base class. Subclasses must implement compress_file().
+    """
+
+    def compress_file(self, input_path: str, output_path: str, **kwargs):
+        raise NotImplementedError
+
+
+class CompressorManager:
+    """
+    Manages parallel compression and overwrite.
+
+    Each worker process is given up to 4 Blosc2 threads (or fewer if the machine
+    has fewer than 4 cores). The number of worker processes is then
+    total_cores // threads_per_worker (at least 1). If the user explicitly
+    passes `workers`, we cap it to `total_cores`, then recompute threads_per_worker
+    = min(4, total_cores // workers).
+
+    Usage:
+        mgr = CompressorManager(cratio=10, method='jp2k')
+        mgr.compress_files([...])
+        mgr.overwrite_files([...])
+    """
+
+    def __init__(
+        self,
+        workers: int | None = None,
+        cratio: int = 10,
+        method: str = "jp2k",
+        layout: str = "sibling",
+    ):
+        total_cores = os.cpu_count() or 1
+        default_nthreads = 4 if total_cores >= 4 else 1
+        default_workers = max(1, total_cores // default_nthreads)
+
+        if workers is None:
+            w = default_workers
+            nthreads = default_nthreads
+        else:
+            w = min(workers, total_cores)
+            possible = total_cores // w
+            nthreads = min(possible, 4) if possible >= 1 else 1
+
+        self.workers = max(1, w)
+        self.nthreads = max(1, nthreads)
+        self.cratio = cratio
+        self.method = method
+        self.layout = layout
+
+        if self.method == "jp2k":
+            self.compressor = JP2KCompressorWrapper(
+                cratio=cratio, nthreads=self.nthreads
+            )
+        else:
+            raise ValueError(f"Unsupported compression method: {self.method}")
+
+        print(f"Compression method: {self.method}")
+        print(f"Output layout: {self.layout}")
+        print(f"Total CPU cores: {total_cores}")
+        print(f"Worker processes: {self.workers}")
+        print(f"Threads per worker: {self.nthreads}")
+        print(f"Total threads: {self.workers * self.nthreads}")
+
+    def _compress_worker(self, ipath: str) -> tuple[str, str]:
+        """
+        Worker function for ProcessPoolExecutor: compress a single HDF5:
+          - sibling layout: <same_dir>/<basename>_<method>.h5
+          - mirror layout: mirror RAW_DATA tree under RAW_DATA_COMPRESSED
+        """
+        outp = resolve_compressed_path(ipath, self.method, layout=self.layout)
+        os.makedirs(os.path.dirname(outp), exist_ok=True)
+        self.compressor.compress_file(
+            ipath, outp, cratio=self.cratio, nthreads=self.nthreads
+        )
+        return ipath, "success"
+
+    def _mirror_non_compressed_dataset_content(self, file_list: list[str]) -> None:
+        source_targets = {os.path.realpath(p) for p in file_list}
+        mirror_roots: set[str] = set()
+        for ipath in file_list:
+            base_h5 = find_dataset_base_h5(ipath)
+            dataset_dir = (
+                os.path.dirname(base_h5) if base_h5 else os.path.dirname(ipath)
+            )
+            # Mirror the parent sample folder too, so sidecar files next to
+            # dataset folders are preserved (e.g. RAW_DATA/<sample>/*.h5).
+            mirror_roots.add(os.path.dirname(dataset_dir))
+
+        for src_dir in sorted(mirror_roots):
+            try:
+                dst_dir = resolve_mirror_path(src_dir)
+            except ValueError:
+                print(f"WARNING: Cannot mirror folder outside RAW_DATA: '{src_dir}'")
+                continue
+
+            for cur, dirs, files in os.walk(src_dir):
+                rel_cur = os.path.relpath(cur, src_dir)
+                target_cur = (
+                    dst_dir if rel_cur == "." else os.path.join(dst_dir, rel_cur)
+                )
+                os.makedirs(target_cur, exist_ok=True)
+
+                for dname in dirs:
+                    os.makedirs(os.path.join(target_cur, dname), exist_ok=True)
+
+                for fname in files:
+                    src_file = os.path.join(cur, fname)
+                    if os.path.realpath(src_file) in source_targets:
+                        # Do not copy raw files that will be produced by compression.
+                        continue
+                    dst_file = os.path.join(target_cur, fname)
+                    shutil.copy2(src_file, dst_file)
+
+    def compress_files(self, file_list: list[str]) -> None:
+        """
+        Compress each .h5 in file_list in parallel.
+          - sibling layout: produce <basename>_<method>.h5 next to each source.
+          - mirror layout: write compressed files to RAW_DATA_COMPRESSED with same file names.
+        Does not overwrite originals. At the end, prints total elapsed time and data rate in MB/s.
+        """
+        valid = [p for p in file_list if p.lower().endswith(".h5")]
+        if not valid:
+            print("No valid .h5 files to compress.")
+            return
+        if self.layout == "mirror":
+            print(
+                "Preparing RAW_DATA_COMPRESSED with non-compressed dataset content..."
+            )
+            self._mirror_non_compressed_dataset_content(valid)
+
+        total_bytes = 0
+        for f in valid:
+            try:
+                total_bytes += os.path.getsize(f)
+            except OSError:
+                pass
+
+        import time
+
+        t0 = time.time()
+
+        with ProcessPoolExecutor(max_workers=self.workers) as executor:
+            futures = {executor.submit(self._compress_worker, p): p for p in valid}
+            for fut in tqdm(
+                as_completed(futures),
+                total=len(futures),
+                desc=f"Compressing HDF5 files ({self.method})",
+                unit="file",
+            ):
+                pth = futures[fut]
+                try:
+                    fut.result()
+                except Exception as e:
+                    print(f"Failed to compress '{pth}': {e}")
+
+        elapsed = time.time() - t0
+        total_mb = total_bytes / (1024 * 1024)
+        rate_mb_s = total_mb / elapsed if elapsed > 0 else float("inf")
+        print(f"\nTotal elapsed time: {elapsed:.3f}s")
+        print(f"Data processed: {total_mb:.2f} MB ({rate_mb_s:.2f} MB/s)\n")
+
+    def overwrite_files(self, file_list: list[str]) -> None:
+        """
+        Overwrites files only if they have a compressed sibling:
+
+          1) Rename <file>.h5 → <file>.h5.bak
+          2) Rename <file>_<method>.h5 → <file>.h5
+
+        After processing all files, removes the backup .h5.bak files.
+        """
+        for ipath in file_list:
+            if not ipath.lower().endswith(".h5"):
+                continue
+
+            compressed_path = resolve_compressed_path(
+                ipath, self.method, layout=self.layout
+            )
+
+            if os.path.exists(compressed_path):
+                backup = ipath + ".bak"
+                try:
+                    os.replace(ipath, backup)
+                    os.replace(compressed_path, ipath)
+                    print(f"Overwritten '{ipath}' (backup at '{backup}').")
+                except Exception as e:
+                    print(f"ERROR overwriting '{ipath}': {e}")
+            else:
+                print(f"SKIP (no compressed file): {ipath}")
+
+    def remove_backups(self, file_list: list[str]) -> None:
+        candidates = {p + ".bak" for p in file_list if p.lower().endswith(".h5")}
+        backups = [b for b in candidates if os.path.exists(b)]
+        if not backups:
+            print("No backup files to remove.")
+            return
+
+        total_bytes = 0
+        for b in backups:
+            try:
+                total_bytes += os.path.getsize(b)
+            except OSError:
+                pass
+        total_mb = total_bytes / (1024 * 1024)
+
+        print(
+            f"About to remove {len(backups)} backup file(s), ~{total_mb:.2f} MB total."
+        )
+        ans = input("Proceed? [y/N]: ").strip().lower()
+        if ans not in ("y", "yes"):
+            print("Backups kept.")
+            return
+
+        removed = 0
+        for b in backups:
+            try:
+                os.remove(b)
+                removed += 1
+            except Exception as e:
+                print(f"ERROR deleting backup '{b}': {e}")
+
+        print(f"Deleted {removed} backup file(s).")
+
+    def restore_backups(self, file_list: list[str]) -> None:
+        restored = 0
+        preserved = 0
+        for ipath in file_list:
+            if not ipath.lower().endswith(".h5"):
+                continue
+
+            backup = ipath + ".bak"
+            method_path = resolve_compressed_path(
+                ipath, self.method, layout=self.layout
+            )
+
+            if not os.path.exists(backup):
+                print(f"SKIP (no backup): {ipath}")
+                continue
+
+            if os.path.exists(ipath) and not os.path.exists(method_path):
+                try:
+                    os.replace(ipath, method_path)
+                    preserved += 1
+                    print(f"Preserved current file to '{method_path}'.")
+                except Exception as e:
+                    print(f"ERROR preserving current '{ipath}' to '{method_path}': {e}")
+                    continue
+
+            try:
+                os.replace(backup, ipath)
+                restored += 1
+                print(f"Restored '{ipath}' from backup.")
+            except Exception as e:
+                print(f"ERROR restoring '{ipath}' from '{backup}': {e}")
+
+        print(
+            f"Restore complete. Restored: {restored}, preserved compressed copies: {preserved}."
+        )
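To make the worker×thread auto-factoring above concrete, a small worked example (the host core counts are hypothetical): on a 16-core host the defaults are threads_per_worker = 4 and workers = 16 // 4 = 4; on a 2-core host, 1 thread and 2 workers. Passing workers explicitly caps it at the core count and recomputes threads so the total never exceeds the cores:

    from esrf_data_compressor.compressors.base import CompressorManager

    # workers=8 on a 16-core host: workers kept at min(8, 16) = 8,
    # threads_per_worker recomputed as min(4, 16 // 8) = 2, so 8 × 2 = 16 threads total.
    mgr = CompressorManager(workers=8, cratio=10, method="jp2k", layout="mirror")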
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_cli.py
RENAMED
@@ -109,7 +109,7 @@ def test_commands_with_non_empty_list(
     # Run command
     argv = [cmd, "-i", "report.txt"]
     if cmd == "compress":
-        argv += ["--cratio", "5", "--method", "jp2k"]
+        argv += ["--cratio", "5", "--method", "jp2k", "--layout", "sibling"]
     argv_runner(argv)
     out = capsys.readouterr().out
     assert msg_start in out
@@ -119,12 +119,14 @@ def test_commands_with_non_empty_list(
     for f in files:
         comp = tmp_path / f.replace(".h5", "_jp2k.h5")
         assert comp.exists()
-    # For overwrite, verify original replaced and backup
+    # For overwrite, verify original replaced and backup KEPT
     if cmd == "overwrite":
         # f1 was overwritten, f2 was skipped
         assert (tmp_path / "f1.h5").exists()
-        #
-        assert
+        # backup remains by default
+        assert (tmp_path / "f1.h5.bak").exists()
+        # f2 had no compressed sibling → no backup
+        assert not (tmp_path / "f2.h5.bak").exists()
 
 
 def test_list_success_and_output_file(argv_runner, monkeypatch, capsys, tmp_path):
@@ -150,7 +152,7 @@ def test_list_success_and_output_file(argv_runner, monkeypatch, capsys, tmp_path
     [
         ("compress", "Nothing to compress"),
         ("check", "Nothing to check"),
-        ("overwrite", "Nothing to
+        ("overwrite", "Nothing to process"),
     ],
 )
 def test_empty_reports(argv_runner, monkeypatch, capsys, cmd, empty_msg, tmp_path):
@@ -165,12 +167,96 @@ def test_empty_reports(argv_runner, monkeypatch, capsys, cmd, empty_msg, tmp_pat
 def test_check_success_writes_report(argv_runner, monkeypatch, capsys, tmp_path):
     monkeypatch.setattr(cli, "parse_report", lambda rpt: ["f"])
 
-    def run(files, method, out):
+    def run(files, method, out, layout):
+        assert layout == "sibling"
         with open(out, "w") as f:
             f.write("ok")
 
     monkeypatch.setattr(cli, "run_ssim_check", run)
     report = tmp_path / "rpt.txt"
-    argv_runner(["check", "-i", str(report), "--method", "jp2k"])
+    argv_runner(["check", "-i", str(report), "--method", "jp2k", "--layout", "sibling"])
     out = capsys.readouterr().out
     assert "SSIM report written to" in out
+
+
+def test_compress_mirror_layout_creates_under_raw_data_compressed(
+    argv_runner, monkeypatch, tmp_path
+):
+    ds = tmp_path / "RAW_DATA" / "sampleA" / "ds1"
+    src = ds / "scan0001" / "f1.h5"
+    src.parent.mkdir(parents=True)
+    src.write_text("data")
+    base = ds / "dataset.h5"
+    base.write_text("base")
+    sample_sidecar = tmp_path / "RAW_DATA" / "sampleA" / "sample_sidecar.h5"
+    sample_sidecar.write_text("sidecar")
+    side = ds / "scan0002" / "meta.txt"
+    side.parent.mkdir(parents=True)
+    side.write_text("meta")
+    monkeypatch.setattr(cli, "parse_report", lambda rpt: [str(src)])
+    monkeypatch.setattr(
+        JP2KCompressorWrapper,
+        "compress_file",
+        lambda self, inp, out, **kw: open(out, "w").close(),
+    )
+
+    argv_runner(
+        [
+            "compress",
+            "-i",
+            "report.txt",
+            "--cratio",
+            "5",
+            "--method",
+            "jp2k",
+            "--layout",
+            "mirror",
+        ]
+    )
+
+    # The dataset base/filter file is mirrored under RAW_DATA_COMPRESSED.
+    assert (
+        tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "dataset.h5"
+    ).exists()
+    # Compressed file keeps the same source name under mirrored scan path.
+    assert (
+        tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "scan0001" / "f1.h5"
+    ).exists()
+    assert (
+        tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "scan0002" / "meta.txt"
+    ).exists()
+    assert (tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "sample_sidecar.h5").exists()
+
+
+def test_overwrite_final_deletes_backups(argv_runner, monkeypatch, capsys, tmp_path):
+    # Prepare a file and its backup
+    (tmp_path / "f1.h5").write_text("current")
+    (tmp_path / "f1.h5.bak").write_text("backup")
+    # parse_report returns the original .h5 path(s)
+    monkeypatch.setattr(cli, "parse_report", lambda rpt: [str(tmp_path / "f1.h5")])
+    # auto-confirm deletion
+    monkeypatch.setattr("builtins.input", lambda *a, **k: "y")
+
+    argv_runner(["overwrite", "-i", "report.txt", "--final"])
+    out = capsys.readouterr().out
+    assert "About to remove" in out
+    assert not (tmp_path / "f1.h5.bak").exists()
+
+
+def test_overwrite_undo_restores_and_preserves(
+    argv_runner, monkeypatch, capsys, tmp_path
+):
+    # Start with current file and a backup; no <method> file yet
+    (tmp_path / "f1.h5").write_text("CUR")
+    (tmp_path / "f1.h5.bak").write_text("BAK")
+    monkeypatch.setattr(cli, "parse_report", lambda rpt: [str(tmp_path / "f1.h5")])
+
+    argv_runner(["overwrite", "-i", "report.txt", "--undo"])
+    out = capsys.readouterr().out
+    assert "Undoing overwrite" in out
+    # Backup should have been restored to f1.h5
+    assert (tmp_path / "f1.h5").read_text() == "BAK"
+    # Previous current should have been preserved as f1_jp2k.h5
+    assert (tmp_path / "f1_jp2k.h5").read_text() == "CUR"
+    # .bak should be gone after restore (moved)
+    assert not (tmp_path / "f1.h5.bak").exists()
esrf_data_compressor-0.2.0/src/esrf_data_compressor/tests/test_paths.py
ADDED
@@ -0,0 +1,36 @@
+import pytest
+
+from esrf_data_compressor.utils.paths import (
+    find_dataset_base_h5,
+    resolve_compressed_path,
+    resolve_mirror_path,
+)
+
+
+def test_resolve_compressed_path_sibling():
+    p = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"
+    out = resolve_compressed_path(p, "jp2k", layout="sibling")
+    assert out == "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1_jp2k.h5"
+
+
+def test_resolve_compressed_path_mirror():
+    p = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"
+    out = resolve_compressed_path(p, "jp2k", layout="mirror")
+    assert out == "/data/visitor/e/bl/s/RAW_DATA_COMPRESSED/sample/ds/f1.h5"
+
+
+def test_resolve_mirror_path_requires_raw_data():
+    with pytest.raises(ValueError):
+        resolve_mirror_path("/tmp/no_raw_data_here/f1.h5")
+
+
+def test_find_dataset_base_h5(tmp_path):
+    ds = tmp_path / "RAW_DATA" / "sample" / "ds1"
+    scan = ds / "scan0001"
+    scan.mkdir(parents=True)
+    base = ds / "dataset.h5"
+    base.write_text("base")
+    src = scan / "frames.h5"
+    src.write_text("source")
+
+    assert find_dataset_base_h5(str(src)) == str(base)
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_run_check.py
RENAMED
@@ -105,3 +105,21 @@ def test_ssim_error_handling(tmp_path, monkeypatch):
     # should include an ERROR line mentioning the exception message
     assert any("ERROR processing file pair" in line for line in lines)
     assert any("Error" in line for line in lines)
+
+
+def test_mirror_layout_finds_compressed_file(tmp_path, monkeypatch):
+    raw = tmp_path / "RAW_DATA" / "sample" / "ds" / "d3.h5"
+    comp = tmp_path / "RAW_DATA_COMPRESSED" / "sample" / "ds" / "d3.h5"
+    raw.parent.mkdir(parents=True)
+    comp.parent.mkdir(parents=True)
+    raw.write_text("r3")
+    comp.write_text("c3")
+    report = tmp_path / "report.txt"
+
+    monkeypatch.setattr(rs, "compute_ssim_for_file_pair", lambda o, c: ("d3", ["ok"]))
+
+    rs.run_ssim_check(
+        [str(raw)], method="method", report_path=str(report), layout="mirror"
+    )
+    lines = _read_report(report)
+    assert lines[2] == f"Compressed file: {comp}"
esrf_data_compressor-0.2.0/src/esrf_data_compressor/utils/paths.py
ADDED
@@ -0,0 +1,81 @@
+import os
+from pathlib import Path
+import re
+
+
+def resolve_mirror_path(
+    input_path: str,
+    *,
+    source_root: str = "RAW_DATA",
+    target_root: str = "RAW_DATA_COMPRESSED",
+) -> str:
+    """
+    Build a mirrored path under `target_root` by replacing the `source_root`
+    segment in `input_path`.
+    """
+    parts = Path(input_path).parts
+    if source_root not in parts:
+        raise ValueError(
+            f"Cannot mirror path '{input_path}': missing '{source_root}' segment."
+        )
+    idx = parts.index(source_root)
+    return str(Path(*parts[:idx], target_root, *parts[idx + 1 :]))
+
+
+def resolve_compressed_path(
+    input_path: str,
+    method: str,
+    *,
+    layout: str = "sibling",
+    source_root: str = "RAW_DATA",
+    target_root: str = "RAW_DATA_COMPRESSED",
+) -> str:
+    if layout == "sibling":
+        base_name = os.path.splitext(os.path.basename(input_path))[0]
+        compressed_name = f"{base_name}_{method}.h5"
+        return os.path.join(os.path.dirname(input_path), compressed_name)
+    if layout == "mirror":
+        # In mirror mode, compressed files keep the same file name as source.
+        return resolve_mirror_path(
+            input_path, source_root=source_root, target_root=target_root
+        )
+    raise ValueError(f"Unsupported layout: {layout}")
+
+
+def find_dataset_base_h5(
+    input_path: str,
+    *,
+    source_root: str = "RAW_DATA",
+) -> str | None:
+    """
+    Walk up from `input_path` to find the dataset directory that contains:
+      - exactly one .h5 file (the base/filter file)
+      - at least one scanXXXX subdirectory
+    Returns the absolute path to that .h5, or None when not found.
+    """
+    scan_re = re.compile(r"^scan\d{4}$", re.IGNORECASE)
+    p = Path(input_path).resolve()
+    parts = p.parts
+    if source_root not in parts:
+        return None
+
+    root_idx = parts.index(source_root)
+    cur = p.parent
+    while True:
+        if len(cur.parts) < root_idx + 1:
+            return None
+
+        try:
+            entries = list(cur.iterdir())
+        except OSError:
+            entries = []
+
+        h5_files = [e for e in entries if e.is_file() and e.suffix.lower() == ".h5"]
+        has_scan = any(e.is_dir() and scan_re.match(e.name) for e in entries)
+
+        if has_scan and len(h5_files) == 1:
+            return str(h5_files[0])
+
+        if len(cur.parts) == root_idx + 1:
+            return None
+        cur = cur.parent
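The path helpers above encode the expected ESRF directory convention; a brief sketch with invented paths:

    from esrf_data_compressor.utils.paths import (
        find_dataset_base_h5,
        resolve_compressed_path,
        resolve_mirror_path,
    )

    raw = "/data/visitor/exp1/id00/session/RAW_DATA/sampleA/ds1/scan0001/frames.h5"

    # mirror: swap the RAW_DATA segment for RAW_DATA_COMPRESSED, keep everything else.
    assert resolve_mirror_path(raw).endswith(
        "RAW_DATA_COMPRESSED/sampleA/ds1/scan0001/frames.h5"
    )

    # sibling: add the method suffix next to the source.
    assert resolve_compressed_path(raw, "jp2k", layout="sibling").endswith("frames_jp2k.h5")

    # find_dataset_base_h5 walks up from frames.h5 to the ds1 folder, which must hold
    # exactly one .h5 (the base/filter file) plus scanXXXX subfolders, and returns that
    # .h5; it returns None when the path has no RAW_DATA segment or no such folder exists.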
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0/src/esrf_data_compressor.egg-info}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: esrf-data-compressor
-Version: 0.1.1
+Version: 0.2.0
 Summary: A library to compress ESRF data and reduce their footprint
 Author-email: ESRF <dau-pydev@esrf.fr>
 License: MIT License
@@ -84,7 +84,9 @@ Dynamic: license-file
 
 * **Non-destructive workflow**
 
-  1. `compress` writes
+  1. `compress` writes compressed files either:
+     - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
+     - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
   2. `check` computes SSIM (first and last frames) and writes a report
   3. `overwrite` (optional) swaps out the raw frame file (irreversible)
 
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/SOURCES.txt
RENAMED
@@ -20,8 +20,10 @@ src/esrf_data_compressor/tests/test_cli.py
 src/esrf_data_compressor/tests/test_finder.py
 src/esrf_data_compressor/tests/test_hdf5_helpers.py
 src/esrf_data_compressor/tests/test_jp2k.py
+src/esrf_data_compressor/tests/test_paths.py
 src/esrf_data_compressor/tests/test_run_check.py
 src/esrf_data_compressor/tests/test_ssim.py
 src/esrf_data_compressor/tests/test_utils.py
 src/esrf_data_compressor/utils/hdf5_helpers.py
+src/esrf_data_compressor/utils/paths.py
 src/esrf_data_compressor/utils/utils.py
esrf_data_compressor-0.1.1/src/esrf_data_compressor/compressors/base.py
DELETED
@@ -1,167 +0,0 @@
-import os
-from concurrent.futures import ProcessPoolExecutor, as_completed
-from tqdm import tqdm
-
-from esrf_data_compressor.compressors.jp2k import JP2KCompressorWrapper
-
-
-class Compressor:
-    """
-    Abstract base class. Subclasses must implement compress_file().
-    """
-
-    def compress_file(self, input_path: str, output_path: str, **kwargs):
-        raise NotImplementedError
-
-
-class CompressorManager:
-    """
-    Manages parallel compression and overwrite.
-
-    Each worker process is given up to 4 Blosc2 threads (or fewer if the machine
-    has fewer than 4 cores). The number of worker processes is then
-    total_cores // threads_per_worker (at least 1). If the user explicitly
-    passes `workers`, we cap it to `total_cores`, then recompute threads_per_worker
-    = min(4, total_cores // workers).
-
-    Usage:
-        mgr = CompressorManager(cratio=10, method='jp2k')
-        mgr.compress_files([...])
-        mgr.overwrite_files([...])
-    """
-
-    def __init__(
-        self, workers: int | None = None, cratio: int = 10, method: str = "jp2k"
-    ):
-        total_cores = os.cpu_count() or 1
-
-        # Determine default threads per worker (4, or fewer if total_cores < 4)
-        if total_cores >= 4:
-            default_nthreads = 4
-        else:
-            default_nthreads = 1
-
-        # Default worker count
-        default_workers = max(1, total_cores // default_nthreads)
-
-        if workers is None:
-            # Use default workers and default_nthreads
-            w = default_workers
-            nthreads = default_nthreads
-        else:
-            # Cap workers to total_cores
-            w = min(workers, total_cores)
-            # Recompute threads per worker so that (w * nthreads) ≤ total_cores, up to 4
-            possible = total_cores // w
-            nthreads = min(possible, 4) if possible >= 1 else 1
-
-        self.workers = max(1, w)
-        self.nthreads = max(1, nthreads)
-        self.cratio = cratio
-        self.method = method
-
-        # Instantiate compressor based on method
-        if self.method == "jp2k":
-            self.compressor = JP2KCompressorWrapper(
-                cratio=cratio, nthreads=self.nthreads
-            )
-        else:
-            raise ValueError(f"Unsupported compression method: {self.method}")
-
-        print(f"Compression method: {self.method}")
-        print(f"Total CPU cores: {total_cores}")
-        print(f"Worker processes: {self.workers}")
-        print(f"Threads per worker: {self.nthreads}")
-        print(f"Total threads: {self.workers * self.nthreads}")
-
-    def _compress_worker(self, ipath: str) -> tuple[str, str]:
-        """
-        Worker function for ProcessPoolExecutor: compress a single HDF5:
-          <ipath>.h5 → <same_dir>/<basename>_<method>.h5
-        """
-        base, _ = os.path.splitext(ipath)
-        outp = f"{base}_{self.method}.h5"
-        self.compressor.compress_file(
-            ipath, outp, cratio=self.cratio, nthreads=self.nthreads
-        )
-        return ipath, "success"
-
-    def compress_files(self, file_list: list[str]) -> None:
-        """
-        Compress each .h5 in file_list in parallel, producing <basename>_<method>.h5
-        next to each source file. Does not overwrite originals. At the end, prints
-        total elapsed time and data rate in MB/s.
-        """
-        valid = [p for p in file_list if p.lower().endswith(".h5")]
-        if not valid:
-            print("No valid .h5 files to compress.")
-            return
-
-        total_bytes = 0
-        for f in valid:
-            try:
-                total_bytes += os.path.getsize(f)
-            except OSError:
-                pass
-
-        import time
-
-        t0 = time.time()
-
-        with ProcessPoolExecutor(max_workers=self.workers) as executor:
-            futures = {executor.submit(self._compress_worker, p): p for p in valid}
-            for fut in tqdm(
-                as_completed(futures),
-                total=len(futures),
-                desc=f"Compressing HDF5 files ({self.method})",
-                unit="file",
-            ):
-                pth = futures[fut]
-                try:
-                    fut.result()
-                except Exception as e:
-                    print(f"Failed to compress '{pth}': {e}")
-
-        t1 = time.time()
-        elapsed = t1 - t0
-        total_mb = total_bytes / (1024 * 1024)
-        rate_mb_s = total_mb / elapsed if elapsed > 0 else float("inf")
-        print(f"\nTotal elapsed time: {elapsed:.3f}s")
-        print(f"Data processed: {total_mb:.2f} MB ({rate_mb_s:.2f} MB/s)\n")
-
-    def overwrite_files(self, file_list: list[str]) -> None:
-        """
-        Overwrites files only if they have a compressed sibling:
-
-          1) Rename <file>.h5 → <file>.h5.bak
-          2) Rename <file>_<method>.h5 → <file>.h5
-
-        After processing all files, removes the backup .h5.bak files.
-        """
-        backups = []
-        for ipath in file_list:
-            if not ipath.lower().endswith(".h5"):
-                continue
-
-            base, _ = os.path.splitext(ipath)
-            compressed_path = f"{base}_{self.method}.h5"
-
-            if os.path.exists(compressed_path):
-                backup = ipath + ".bak"
-                try:
-                    os.replace(ipath, backup)
-                    os.replace(compressed_path, ipath)
-                    backups.append(backup)
-                    print(f"Overwritten '{ipath}' (backup at '{backup}').")
-                except Exception as e:
-                    print(f"ERROR overwriting '{ipath}': {e}")
-            else:
-                print(f"SKIP (no compressed file): {ipath}")
-
-        # Remove all backup files
-        for backup in backups:
-            try:
-                os.remove(backup)
-                print(f"Deleted backup '{backup}'.")
-            except Exception as e:
-                print(f"ERROR deleting backup '{backup}': {e}")
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/LICENSE
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/setup.cfg
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/__init__.py
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/checker/ssim.py
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/compressors/__init__.py
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/compressors/jp2k.py
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/finder/finder.py
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/__init__.py
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_finder.py
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_hdf5_helpers.py
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_jp2k.py
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_ssim.py
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_utils.py
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/utils/hdf5_helpers.py
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/utils/utils.py
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/dependency_links.txt
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/entry_points.txt
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/requires.txt
File without changes
{esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/top_level.txt
File without changes