esrf-data-compressor 0.1.2__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {esrf_data_compressor-0.1.2/src/esrf_data_compressor.egg-info → esrf_data_compressor-0.2.0}/PKG-INFO +4 -2
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/README.md +4 -2
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/pyproject.toml +2 -2
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/checker/run_check.py +6 -5
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/cli.py +15 -3
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/compressors/base.py +69 -11
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_cli.py +53 -3
- esrf_data_compressor-0.2.0/src/esrf_data_compressor/tests/test_paths.py +36 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_run_check.py +18 -0
- esrf_data_compressor-0.2.0/src/esrf_data_compressor/utils/paths.py +81 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0/src/esrf_data_compressor.egg-info}/PKG-INFO +4 -2
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/SOURCES.txt +2 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/LICENSE +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/setup.cfg +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/__init__.py +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/checker/ssim.py +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/compressors/__init__.py +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/compressors/jp2k.py +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/finder/finder.py +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/__init__.py +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_finder.py +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_hdf5_helpers.py +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_jp2k.py +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_ssim.py +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_utils.py +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/utils/hdf5_helpers.py +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/utils/utils.py +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/dependency_links.txt +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/entry_points.txt +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/requires.txt +0 -0
- {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/top_level.txt +0 -0
{esrf_data_compressor-0.1.2/src/esrf_data_compressor.egg-info → esrf_data_compressor-0.2.0}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: esrf-data-compressor
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: A library to compress ESRF data and reduce their footprint
|
|
5
5
|
Author-email: ESRF <dau-pydev@esrf.fr>
|
|
6
6
|
License: MIT License
|
|
@@ -84,7 +84,9 @@ Dynamic: license-file
|
|
|
84
84
|
|
|
85
85
|
* **Non-destructive workflow**
|
|
86
86
|
|
|
87
|
-
1. `compress` writes
|
|
87
|
+
1. `compress` writes compressed files either:
|
|
88
|
+
- next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
|
|
89
|
+
- under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
|
|
88
90
|
2. `check` computes SSIM (first and last frames) and writes a report
|
|
89
91
|
3. `overwrite` (optional) swaps out the raw frame file (irreversible)
|
|
90
92
|
|
|
@@ -23,7 +23,9 @@
|
|
|
23
23
|
|
|
24
24
|
* **Non-destructive workflow**
|
|
25
25
|
|
|
26
|
-
1. `compress` writes
|
|
26
|
+
1. `compress` writes compressed files either:
|
|
27
|
+
- next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
|
|
28
|
+
- under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
|
|
27
29
|
2. `check` computes SSIM (first and last frames) and writes a report
|
|
28
30
|
3. `overwrite` (optional) swaps out the raw frame file (irreversible)
|
|
29
31
|
|
|
@@ -119,4 +121,4 @@ All noteworthy changes are recorded in [CHANGELOG.md](CHANGELOG.md). Version 0.1
|
|
|
119
121
|
* Four-command CLI (`compress-hdf5 list`, `compress-hdf5 compress`, `compress-hdf5 check`, `compress-hdf5 overwrite`).
|
|
120
122
|
* Parallelism with worker×thread auto-factoring.
|
|
121
123
|
|
|
122
|
-
For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).
|
|
124
|
+
For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "esrf-data-compressor"
|
|
7
|
-
version = "0.1.2"
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
authors = [{ name = "ESRF", email = "dau-pydev@esrf.fr" }]
|
|
9
9
|
description = "A library to compress ESRF data and reduce their footprint"
|
|
10
10
|
readme = { file = "README.md", content-type = "text/markdown" }
|
|
@@ -69,4 +69,4 @@ omit = ["*/tests/*"]
|
|
|
69
69
|
|
|
70
70
|
[tool.isort]
|
|
71
71
|
profile = "black"
|
|
72
|
-
force_single_line = true
|
|
72
|
+
force_single_line = true
|
|
@@ -3,12 +3,15 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
|
3
3
|
from tqdm import tqdm
|
|
4
4
|
|
|
5
5
|
from esrf_data_compressor.checker.ssim import compute_ssim_for_file_pair
|
|
6
|
+
from esrf_data_compressor.utils.paths import resolve_compressed_path
|
|
6
7
|
|
|
7
8
|
|
|
8
|
-
def run_ssim_check(raw_files: list[str], method: str, report_path: str) -> None:
|
|
9
|
+
def run_ssim_check(
|
|
10
|
+
raw_files: list[str], method: str, report_path: str, layout: str = "sibling"
|
|
11
|
+
) -> None:
|
|
9
12
|
"""
|
|
10
13
|
Given a list of raw HDF5 file paths, partitions into:
|
|
11
|
-
to_check → those with
|
|
14
|
+
to_check → those with an expected compressed counterpart according to `layout`
|
|
12
15
|
missing → those without one
|
|
13
16
|
|
|
14
17
|
Writes a report to `report_path`:
|
|
@@ -21,9 +24,7 @@ def run_ssim_check(raw_files: list[str], method: str, report_path: str) -> None:
|
|
|
21
24
|
|
|
22
25
|
# partition
|
|
23
26
|
for orig in raw_files:
|
|
24
|
-
|
|
25
|
-
stem, _ = os.path.splitext(fname)
|
|
26
|
-
comp_path = os.path.join(dirname, f"{stem}_{method}.h5")
|
|
27
|
+
comp_path = resolve_compressed_path(orig, method, layout=layout)
|
|
27
28
|
if os.path.exists(comp_path):
|
|
28
29
|
to_check.append((orig, comp_path))
|
|
29
30
|
else:
|
|
@@ -50,9 +50,9 @@ def do_compress(args):
|
|
|
50
50
|
return
|
|
51
51
|
|
|
52
52
|
print(
|
|
53
|
-
f"Compressing {len(files)} file(s) from '{report}' using '{args.method}' method
|
|
53
|
+
f"Compressing {len(files)} file(s) from '{report}' using '{args.method}' method, ratio {args.cratio}, layout '{args.layout}' …"
|
|
54
54
|
)
|
|
55
|
-
mgr = CompressorManager(cratio=args.cratio, method=args.method)
|
|
55
|
+
mgr = CompressorManager(cratio=args.cratio, method=args.method, layout=args.layout)
|
|
56
56
|
mgr.compress_files(files)
|
|
57
57
|
print("Compression complete.\n")
|
|
58
58
|
|
|
@@ -72,7 +72,7 @@ def do_check(args):
|
|
|
72
72
|
report_path = os.path.abspath(report_fname)
|
|
73
73
|
|
|
74
74
|
try:
|
|
75
|
-
run_ssim_check(files, args.method, report_path)
|
|
75
|
+
run_ssim_check(files, args.method, report_path, layout=args.layout)
|
|
76
76
|
except SystemExit as e:
|
|
77
77
|
exit_with_error(str(e))
|
|
78
78
|
|
|
@@ -142,6 +142,12 @@ def main():
|
|
|
142
142
|
default="jp2k",
|
|
143
143
|
help="Compression method",
|
|
144
144
|
)
|
|
145
|
+
p.add_argument(
|
|
146
|
+
"--layout",
|
|
147
|
+
choices=["sibling", "mirror"],
|
|
148
|
+
default="mirror",
|
|
149
|
+
help="Output layout: sibling (next to each source) or mirror (under RAW_DATA_COMPRESSED, preserving source names).",
|
|
150
|
+
)
|
|
145
151
|
p.set_defaults(func=do_compress)
|
|
146
152
|
|
|
147
153
|
p = sub.add_parser("check", help="Generate SSIM report for TO COMPRESS files")
|
|
@@ -151,6 +157,12 @@ def main():
|
|
|
151
157
|
p.add_argument(
|
|
152
158
|
"--method", choices=["jp2k"], default="jp2k", help="Compression method"
|
|
153
159
|
)
|
|
160
|
+
p.add_argument(
|
|
161
|
+
"--layout",
|
|
162
|
+
choices=["sibling", "mirror"],
|
|
163
|
+
default="mirror",
|
|
164
|
+
help="Location of compressed files to check.",
|
|
165
|
+
)
|
|
154
166
|
p.set_defaults(func=do_check)
|
|
155
167
|
|
|
156
168
|
p = sub.add_parser(
|
|
@@ -1,8 +1,14 @@
|
|
|
1
1
|
import os
|
|
2
|
+
import shutil
|
|
2
3
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
3
4
|
from tqdm import tqdm
|
|
4
5
|
|
|
5
6
|
from esrf_data_compressor.compressors.jp2k import JP2KCompressorWrapper
|
|
7
|
+
from esrf_data_compressor.utils.paths import (
|
|
8
|
+
find_dataset_base_h5,
|
|
9
|
+
resolve_compressed_path,
|
|
10
|
+
resolve_mirror_path,
|
|
11
|
+
)
|
|
6
12
|
|
|
7
13
|
|
|
8
14
|
class Compressor:
|
|
@@ -31,7 +37,11 @@ class CompressorManager:
|
|
|
31
37
|
"""
|
|
32
38
|
|
|
33
39
|
def __init__(
|
|
34
|
-
self,
|
|
40
|
+
self,
|
|
41
|
+
workers: int | None = None,
|
|
42
|
+
cratio: int = 10,
|
|
43
|
+
method: str = "jp2k",
|
|
44
|
+
layout: str = "sibling",
|
|
35
45
|
):
|
|
36
46
|
total_cores = os.cpu_count() or 1
|
|
37
47
|
default_nthreads = 4 if total_cores >= 4 else 1
|
|
@@ -49,6 +59,7 @@ class CompressorManager:
|
|
|
49
59
|
self.nthreads = max(1, nthreads)
|
|
50
60
|
self.cratio = cratio
|
|
51
61
|
self.method = method
|
|
62
|
+
self.layout = layout
|
|
52
63
|
|
|
53
64
|
if self.method == "jp2k":
|
|
54
65
|
self.compressor = JP2KCompressorWrapper(
|
|
@@ -58,6 +69,7 @@ class CompressorManager:
|
|
|
58
69
|
raise ValueError(f"Unsupported compression method: {self.method}")
|
|
59
70
|
|
|
60
71
|
print(f"Compression method: {self.method}")
|
|
72
|
+
print(f"Output layout: {self.layout}")
|
|
61
73
|
print(f"Total CPU cores: {total_cores}")
|
|
62
74
|
print(f"Worker processes: {self.workers}")
|
|
63
75
|
print(f"Threads per worker: {self.nthreads}")
|
|
@@ -66,25 +78,69 @@ class CompressorManager:
|
|
|
66
78
|
def _compress_worker(self, ipath: str) -> tuple[str, str]:
|
|
67
79
|
"""
|
|
68
80
|
Worker function for ProcessPoolExecutor: compress a single HDF5:
|
|
69
|
-
|
|
81
|
+
- sibling layout: <same_dir>/<basename>_<method>.h5
|
|
82
|
+
- mirror layout: mirror RAW_DATA tree under RAW_DATA_COMPRESSED
|
|
70
83
|
"""
|
|
71
|
-
|
|
72
|
-
outp =
|
|
84
|
+
outp = resolve_compressed_path(ipath, self.method, layout=self.layout)
|
|
85
|
+
os.makedirs(os.path.dirname(outp), exist_ok=True)
|
|
73
86
|
self.compressor.compress_file(
|
|
74
87
|
ipath, outp, cratio=self.cratio, nthreads=self.nthreads
|
|
75
88
|
)
|
|
76
89
|
return ipath, "success"
|
|
77
90
|
|
|
91
|
+
def _mirror_non_compressed_dataset_content(self, file_list: list[str]) -> None:
|
|
92
|
+
source_targets = {os.path.realpath(p) for p in file_list}
|
|
93
|
+
mirror_roots: set[str] = set()
|
|
94
|
+
for ipath in file_list:
|
|
95
|
+
base_h5 = find_dataset_base_h5(ipath)
|
|
96
|
+
dataset_dir = (
|
|
97
|
+
os.path.dirname(base_h5) if base_h5 else os.path.dirname(ipath)
|
|
98
|
+
)
|
|
99
|
+
# Mirror the parent sample folder too, so sidecar files next to
|
|
100
|
+
# dataset folders are preserved (e.g. RAW_DATA/<sample>/*.h5).
|
|
101
|
+
mirror_roots.add(os.path.dirname(dataset_dir))
|
|
102
|
+
|
|
103
|
+
for src_dir in sorted(mirror_roots):
|
|
104
|
+
try:
|
|
105
|
+
dst_dir = resolve_mirror_path(src_dir)
|
|
106
|
+
except ValueError:
|
|
107
|
+
print(f"WARNING: Cannot mirror folder outside RAW_DATA: '{src_dir}'")
|
|
108
|
+
continue
|
|
109
|
+
|
|
110
|
+
for cur, dirs, files in os.walk(src_dir):
|
|
111
|
+
rel_cur = os.path.relpath(cur, src_dir)
|
|
112
|
+
target_cur = (
|
|
113
|
+
dst_dir if rel_cur == "." else os.path.join(dst_dir, rel_cur)
|
|
114
|
+
)
|
|
115
|
+
os.makedirs(target_cur, exist_ok=True)
|
|
116
|
+
|
|
117
|
+
for dname in dirs:
|
|
118
|
+
os.makedirs(os.path.join(target_cur, dname), exist_ok=True)
|
|
119
|
+
|
|
120
|
+
for fname in files:
|
|
121
|
+
src_file = os.path.join(cur, fname)
|
|
122
|
+
if os.path.realpath(src_file) in source_targets:
|
|
123
|
+
# Do not copy raw files that will be produced by compression.
|
|
124
|
+
continue
|
|
125
|
+
dst_file = os.path.join(target_cur, fname)
|
|
126
|
+
shutil.copy2(src_file, dst_file)
|
|
127
|
+
|
|
78
128
|
def compress_files(self, file_list: list[str]) -> None:
|
|
79
129
|
"""
|
|
80
|
-
Compress each .h5 in file_list in parallel
|
|
81
|
-
|
|
82
|
-
|
|
130
|
+
Compress each .h5 in file_list in parallel.
|
|
131
|
+
- sibling layout: produce <basename>_<method>.h5 next to each source.
|
|
132
|
+
- mirror layout: write compressed files to RAW_DATA_COMPRESSED with same file names.
|
|
133
|
+
Does not overwrite originals. At the end, prints total elapsed time and data rate in MB/s.
|
|
83
134
|
"""
|
|
84
135
|
valid = [p for p in file_list if p.lower().endswith(".h5")]
|
|
85
136
|
if not valid:
|
|
86
137
|
print("No valid .h5 files to compress.")
|
|
87
138
|
return
|
|
139
|
+
if self.layout == "mirror":
|
|
140
|
+
print(
|
|
141
|
+
"Preparing RAW_DATA_COMPRESSED with non-compressed dataset content..."
|
|
142
|
+
)
|
|
143
|
+
self._mirror_non_compressed_dataset_content(valid)
|
|
88
144
|
|
|
89
145
|
total_bytes = 0
|
|
90
146
|
for f in valid:
|
|
@@ -130,8 +186,9 @@ class CompressorManager:
|
|
|
130
186
|
if not ipath.lower().endswith(".h5"):
|
|
131
187
|
continue
|
|
132
188
|
|
|
133
|
-
|
|
134
|
-
|
|
189
|
+
compressed_path = resolve_compressed_path(
|
|
190
|
+
ipath, self.method, layout=self.layout
|
|
191
|
+
)
|
|
135
192
|
|
|
136
193
|
if os.path.exists(compressed_path):
|
|
137
194
|
backup = ipath + ".bak"
|
|
@@ -184,9 +241,10 @@ class CompressorManager:
|
|
|
184
241
|
if not ipath.lower().endswith(".h5"):
|
|
185
242
|
continue
|
|
186
243
|
|
|
187
|
-
base, _ = os.path.splitext(ipath)
|
|
188
244
|
backup = ipath + ".bak"
|
|
189
|
-
method_path =
|
|
245
|
+
method_path = resolve_compressed_path(
|
|
246
|
+
ipath, self.method, layout=self.layout
|
|
247
|
+
)
|
|
190
248
|
|
|
191
249
|
if not os.path.exists(backup):
|
|
192
250
|
print(f"SKIP (no backup): {ipath}")
|
{esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_cli.py
RENAMED
|
@@ -109,7 +109,7 @@ def test_commands_with_non_empty_list(
|
|
|
109
109
|
# Run command
|
|
110
110
|
argv = [cmd, "-i", "report.txt"]
|
|
111
111
|
if cmd == "compress":
|
|
112
|
-
argv += ["--cratio", "5", "--method", "jp2k"]
|
|
112
|
+
argv += ["--cratio", "5", "--method", "jp2k", "--layout", "sibling"]
|
|
113
113
|
argv_runner(argv)
|
|
114
114
|
out = capsys.readouterr().out
|
|
115
115
|
assert msg_start in out
|
|
@@ -167,17 +167,67 @@ def test_empty_reports(argv_runner, monkeypatch, capsys, cmd, empty_msg, tmp_pat
|
|
|
167
167
|
def test_check_success_writes_report(argv_runner, monkeypatch, capsys, tmp_path):
|
|
168
168
|
monkeypatch.setattr(cli, "parse_report", lambda rpt: ["f"])
|
|
169
169
|
|
|
170
|
-
def run(files, method, out):
|
|
170
|
+
def run(files, method, out, layout):
|
|
171
|
+
assert layout == "sibling"
|
|
171
172
|
with open(out, "w") as f:
|
|
172
173
|
f.write("ok")
|
|
173
174
|
|
|
174
175
|
monkeypatch.setattr(cli, "run_ssim_check", run)
|
|
175
176
|
report = tmp_path / "rpt.txt"
|
|
176
|
-
argv_runner(["check", "-i", str(report), "--method", "jp2k"])
|
|
177
|
+
argv_runner(["check", "-i", str(report), "--method", "jp2k", "--layout", "sibling"])
|
|
177
178
|
out = capsys.readouterr().out
|
|
178
179
|
assert "SSIM report written to" in out
|
|
179
180
|
|
|
180
181
|
|
|
182
|
+
def test_compress_mirror_layout_creates_under_raw_data_compressed(
|
|
183
|
+
argv_runner, monkeypatch, tmp_path
|
|
184
|
+
):
|
|
185
|
+
ds = tmp_path / "RAW_DATA" / "sampleA" / "ds1"
|
|
186
|
+
src = ds / "scan0001" / "f1.h5"
|
|
187
|
+
src.parent.mkdir(parents=True)
|
|
188
|
+
src.write_text("data")
|
|
189
|
+
base = ds / "dataset.h5"
|
|
190
|
+
base.write_text("base")
|
|
191
|
+
sample_sidecar = tmp_path / "RAW_DATA" / "sampleA" / "sample_sidecar.h5"
|
|
192
|
+
sample_sidecar.write_text("sidecar")
|
|
193
|
+
side = ds / "scan0002" / "meta.txt"
|
|
194
|
+
side.parent.mkdir(parents=True)
|
|
195
|
+
side.write_text("meta")
|
|
196
|
+
monkeypatch.setattr(cli, "parse_report", lambda rpt: [str(src)])
|
|
197
|
+
monkeypatch.setattr(
|
|
198
|
+
JP2KCompressorWrapper,
|
|
199
|
+
"compress_file",
|
|
200
|
+
lambda self, inp, out, **kw: open(out, "w").close(),
|
|
201
|
+
)
|
|
202
|
+
|
|
203
|
+
argv_runner(
|
|
204
|
+
[
|
|
205
|
+
"compress",
|
|
206
|
+
"-i",
|
|
207
|
+
"report.txt",
|
|
208
|
+
"--cratio",
|
|
209
|
+
"5",
|
|
210
|
+
"--method",
|
|
211
|
+
"jp2k",
|
|
212
|
+
"--layout",
|
|
213
|
+
"mirror",
|
|
214
|
+
]
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
# The dataset base/filter file is mirrored under RAW_DATA_COMPRESSED.
|
|
218
|
+
assert (
|
|
219
|
+
tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "dataset.h5"
|
|
220
|
+
).exists()
|
|
221
|
+
# Compressed file keeps the same source name under mirrored scan path.
|
|
222
|
+
assert (
|
|
223
|
+
tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "scan0001" / "f1.h5"
|
|
224
|
+
).exists()
|
|
225
|
+
assert (
|
|
226
|
+
tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "scan0002" / "meta.txt"
|
|
227
|
+
).exists()
|
|
228
|
+
assert (tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "sample_sidecar.h5").exists()
|
|
229
|
+
|
|
230
|
+
|
|
181
231
|
def test_overwrite_final_deletes_backups(argv_runner, monkeypatch, capsys, tmp_path):
|
|
182
232
|
# Prepare a file and its backup
|
|
183
233
|
(tmp_path / "f1.h5").write_text("current")
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from esrf_data_compressor.utils.paths import (
|
|
4
|
+
find_dataset_base_h5,
|
|
5
|
+
resolve_compressed_path,
|
|
6
|
+
resolve_mirror_path,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_resolve_compressed_path_sibling():
|
|
11
|
+
p = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"
|
|
12
|
+
out = resolve_compressed_path(p, "jp2k", layout="sibling")
|
|
13
|
+
assert out == "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1_jp2k.h5"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_resolve_compressed_path_mirror():
|
|
17
|
+
p = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"
|
|
18
|
+
out = resolve_compressed_path(p, "jp2k", layout="mirror")
|
|
19
|
+
assert out == "/data/visitor/e/bl/s/RAW_DATA_COMPRESSED/sample/ds/f1.h5"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_resolve_mirror_path_requires_raw_data():
|
|
23
|
+
with pytest.raises(ValueError):
|
|
24
|
+
resolve_mirror_path("/tmp/no_raw_data_here/f1.h5")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_find_dataset_base_h5(tmp_path):
|
|
28
|
+
ds = tmp_path / "RAW_DATA" / "sample" / "ds1"
|
|
29
|
+
scan = ds / "scan0001"
|
|
30
|
+
scan.mkdir(parents=True)
|
|
31
|
+
base = ds / "dataset.h5"
|
|
32
|
+
base.write_text("base")
|
|
33
|
+
src = scan / "frames.h5"
|
|
34
|
+
src.write_text("source")
|
|
35
|
+
|
|
36
|
+
assert find_dataset_base_h5(str(src)) == str(base)
|
|
@@ -105,3 +105,21 @@ def test_ssim_error_handling(tmp_path, monkeypatch):
|
|
|
105
105
|
# should include an ERROR line mentioning the exception message
|
|
106
106
|
assert any("ERROR processing file pair" in line for line in lines)
|
|
107
107
|
assert any("Error" in line for line in lines)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def test_mirror_layout_finds_compressed_file(tmp_path, monkeypatch):
|
|
111
|
+
raw = tmp_path / "RAW_DATA" / "sample" / "ds" / "d3.h5"
|
|
112
|
+
comp = tmp_path / "RAW_DATA_COMPRESSED" / "sample" / "ds" / "d3.h5"
|
|
113
|
+
raw.parent.mkdir(parents=True)
|
|
114
|
+
comp.parent.mkdir(parents=True)
|
|
115
|
+
raw.write_text("r3")
|
|
116
|
+
comp.write_text("c3")
|
|
117
|
+
report = tmp_path / "report.txt"
|
|
118
|
+
|
|
119
|
+
monkeypatch.setattr(rs, "compute_ssim_for_file_pair", lambda o, c: ("d3", ["ok"]))
|
|
120
|
+
|
|
121
|
+
rs.run_ssim_check(
|
|
122
|
+
[str(raw)], method="method", report_path=str(report), layout="mirror"
|
|
123
|
+
)
|
|
124
|
+
lines = _read_report(report)
|
|
125
|
+
assert lines[2] == f"Compressed file: {comp}"
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def resolve_mirror_path(
|
|
7
|
+
input_path: str,
|
|
8
|
+
*,
|
|
9
|
+
source_root: str = "RAW_DATA",
|
|
10
|
+
target_root: str = "RAW_DATA_COMPRESSED",
|
|
11
|
+
) -> str:
|
|
12
|
+
"""
|
|
13
|
+
Build a mirrored path under `target_root` by replacing the `source_root`
|
|
14
|
+
segment in `input_path`.
|
|
15
|
+
"""
|
|
16
|
+
parts = Path(input_path).parts
|
|
17
|
+
if source_root not in parts:
|
|
18
|
+
raise ValueError(
|
|
19
|
+
f"Cannot mirror path '{input_path}': missing '{source_root}' segment."
|
|
20
|
+
)
|
|
21
|
+
idx = parts.index(source_root)
|
|
22
|
+
return str(Path(*parts[:idx], target_root, *parts[idx + 1 :]))
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def resolve_compressed_path(
|
|
26
|
+
input_path: str,
|
|
27
|
+
method: str,
|
|
28
|
+
*,
|
|
29
|
+
layout: str = "sibling",
|
|
30
|
+
source_root: str = "RAW_DATA",
|
|
31
|
+
target_root: str = "RAW_DATA_COMPRESSED",
|
|
32
|
+
) -> str:
|
|
33
|
+
if layout == "sibling":
|
|
34
|
+
base_name = os.path.splitext(os.path.basename(input_path))[0]
|
|
35
|
+
compressed_name = f"{base_name}_{method}.h5"
|
|
36
|
+
return os.path.join(os.path.dirname(input_path), compressed_name)
|
|
37
|
+
if layout == "mirror":
|
|
38
|
+
# In mirror mode, compressed files keep the same file name as source.
|
|
39
|
+
return resolve_mirror_path(
|
|
40
|
+
input_path, source_root=source_root, target_root=target_root
|
|
41
|
+
)
|
|
42
|
+
raise ValueError(f"Unsupported layout: {layout}")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def find_dataset_base_h5(
|
|
46
|
+
input_path: str,
|
|
47
|
+
*,
|
|
48
|
+
source_root: str = "RAW_DATA",
|
|
49
|
+
) -> str | None:
|
|
50
|
+
"""
|
|
51
|
+
Walk up from `input_path` to find the dataset directory that contains:
|
|
52
|
+
- exactly one .h5 file (the base/filter file)
|
|
53
|
+
- at least one scanXXXX subdirectory
|
|
54
|
+
Returns the absolute path to that .h5, or None when not found.
|
|
55
|
+
"""
|
|
56
|
+
scan_re = re.compile(r"^scan\d{4}$", re.IGNORECASE)
|
|
57
|
+
p = Path(input_path).resolve()
|
|
58
|
+
parts = p.parts
|
|
59
|
+
if source_root not in parts:
|
|
60
|
+
return None
|
|
61
|
+
|
|
62
|
+
root_idx = parts.index(source_root)
|
|
63
|
+
cur = p.parent
|
|
64
|
+
while True:
|
|
65
|
+
if len(cur.parts) < root_idx + 1:
|
|
66
|
+
return None
|
|
67
|
+
|
|
68
|
+
try:
|
|
69
|
+
entries = list(cur.iterdir())
|
|
70
|
+
except OSError:
|
|
71
|
+
entries = []
|
|
72
|
+
|
|
73
|
+
h5_files = [e for e in entries if e.is_file() and e.suffix.lower() == ".h5"]
|
|
74
|
+
has_scan = any(e.is_dir() and scan_re.match(e.name) for e in entries)
|
|
75
|
+
|
|
76
|
+
if has_scan and len(h5_files) == 1:
|
|
77
|
+
return str(h5_files[0])
|
|
78
|
+
|
|
79
|
+
if len(cur.parts) == root_idx + 1:
|
|
80
|
+
return None
|
|
81
|
+
cur = cur.parent
|
{esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0/src/esrf_data_compressor.egg-info}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: esrf-data-compressor
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: A library to compress ESRF data and reduce their footprint
|
|
5
5
|
Author-email: ESRF <dau-pydev@esrf.fr>
|
|
6
6
|
License: MIT License
|
|
@@ -84,7 +84,9 @@ Dynamic: license-file
|
|
|
84
84
|
|
|
85
85
|
* **Non-destructive workflow**
|
|
86
86
|
|
|
87
|
-
1. `compress` writes
|
|
87
|
+
1. `compress` writes compressed files either:
|
|
88
|
+
- next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
|
|
89
|
+
- under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
|
|
88
90
|
2. `check` computes SSIM (first and last frames) and writes a report
|
|
89
91
|
3. `overwrite` (optional) swaps out the raw frame file (irreversible)
|
|
90
92
|
|
|
@@ -20,8 +20,10 @@ src/esrf_data_compressor/tests/test_cli.py
|
|
|
20
20
|
src/esrf_data_compressor/tests/test_finder.py
|
|
21
21
|
src/esrf_data_compressor/tests/test_hdf5_helpers.py
|
|
22
22
|
src/esrf_data_compressor/tests/test_jp2k.py
|
|
23
|
+
src/esrf_data_compressor/tests/test_paths.py
|
|
23
24
|
src/esrf_data_compressor/tests/test_run_check.py
|
|
24
25
|
src/esrf_data_compressor/tests/test_ssim.py
|
|
25
26
|
src/esrf_data_compressor/tests/test_utils.py
|
|
26
27
|
src/esrf_data_compressor/utils/hdf5_helpers.py
|
|
28
|
+
src/esrf_data_compressor/utils/paths.py
|
|
27
29
|
src/esrf_data_compressor/utils/utils.py
|
|
File without changes
|
|
File without changes
|
{esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/__init__.py
RENAMED
|
File without changes
|
{esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/checker/ssim.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/finder/finder.py
RENAMED
|
File without changes
|
{esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/utils/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|