esrf-data-compressor 0.1.1__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {esrf_data_compressor-0.1.1/src/esrf_data_compressor.egg-info → esrf_data_compressor-0.2.0}/PKG-INFO +4 -2
  2. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/README.md +4 -2
  3. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/pyproject.toml +2 -2
  4. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/checker/run_check.py +6 -5
  5. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/cli.py +51 -20
  6. esrf_data_compressor-0.2.0/src/esrf_data_compressor/compressors/base.py +271 -0
  7. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_cli.py +93 -7
  8. esrf_data_compressor-0.2.0/src/esrf_data_compressor/tests/test_paths.py +36 -0
  9. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_run_check.py +18 -0
  10. esrf_data_compressor-0.2.0/src/esrf_data_compressor/utils/paths.py +81 -0
  11. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0/src/esrf_data_compressor.egg-info}/PKG-INFO +4 -2
  12. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/SOURCES.txt +2 -0
  13. esrf_data_compressor-0.1.1/src/esrf_data_compressor/compressors/base.py +0 -167
  14. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/LICENSE +0 -0
  15. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/setup.cfg +0 -0
  16. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/__init__.py +0 -0
  17. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/checker/ssim.py +0 -0
  18. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/compressors/__init__.py +0 -0
  19. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/compressors/jp2k.py +0 -0
  20. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/finder/finder.py +0 -0
  21. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/__init__.py +0 -0
  22. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_finder.py +0 -0
  23. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_hdf5_helpers.py +0 -0
  24. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_jp2k.py +0 -0
  25. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_ssim.py +0 -0
  26. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_utils.py +0 -0
  27. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/utils/hdf5_helpers.py +0 -0
  28. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/utils/utils.py +0 -0
  29. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/dependency_links.txt +0 -0
  30. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/entry_points.txt +0 -0
  31. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/requires.txt +0 -0
  32. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: esrf-data-compressor
- Version: 0.1.1
+ Version: 0.2.0
  Summary: A library to compress ESRF data and reduce their footprint
  Author-email: ESRF <dau-pydev@esrf.fr>
  License: MIT License
@@ -84,7 +84,9 @@ Dynamic: license-file
  
  * **Non-destructive workflow**
  
- 1. `compress` writes a sibling file `<basename>_<compression_method>.h5` next to each original
+ 1. `compress` writes compressed files either:
+    - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
+    - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
  2. `check` computes SSIM (first and last frames) and writes a report
  3. `overwrite` (optional) swaps out the raw frame file (irreversible)
  
@@ -23,7 +23,9 @@
  
  * **Non-destructive workflow**
  
- 1. `compress` writes a sibling file `<basename>_<compression_method>.h5` next to each original
+ 1. `compress` writes compressed files either:
+    - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
+    - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
  2. `check` computes SSIM (first and last frames) and writes a report
  3. `overwrite` (optional) swaps out the raw frame file (irreversible)
  
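Editor's note: for a quick sense of the new naming scheme, here is a minimal sketch of where each layout places its output. It is illustrative only (the sample path is hypothetical); the package's actual helper is `resolve_compressed_path()` in the new `src/esrf_data_compressor/utils/paths.py` shown later in this diff.

```python
# Sketch of the 0.2.0 output-layout naming; not the package's own code.
from pathlib import Path

def sketch_output_path(src: str, method: str, layout: str) -> str:
    p = Path(src)
    if layout == "sibling":
        # <basename>_<method>.h5 next to the original
        return str(p.with_name(f"{p.stem}_{method}.h5"))
    # mirror: swap the RAW_DATA segment for RAW_DATA_COMPRESSED, keep the name
    parts = list(p.parts)
    parts[parts.index("RAW_DATA")] = "RAW_DATA_COMPRESSED"
    return str(Path(*parts))

src = "/data/visitor/exp/bl/sess/RAW_DATA/sample/ds/f1.h5"  # hypothetical path
print(sketch_output_path(src, "jp2k", "sibling"))
# /data/visitor/exp/bl/sess/RAW_DATA/sample/ds/f1_jp2k.h5
print(sketch_output_path(src, "jp2k", "mirror"))
# /data/visitor/exp/bl/sess/RAW_DATA_COMPRESSED/sample/ds/f1.h5
```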
@@ -119,4 +121,4 @@ All noteworthy changes are recorded in [CHANGELOG.md](CHANGELOG.md). Version 0.1
  * Four-command CLI (`compress-hdf5 list`, `compress-hdf5 compress`, `compress-hdf5 check`, `compress-hdf5 overwrite`).
  * Parallelism with worker×thread auto-factoring.
  
- For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).
+ For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
  
  [project]
  name = "esrf-data-compressor"
- version = "0.1.1"
+ version = "0.2.0"
  authors = [{ name = "ESRF", email = "dau-pydev@esrf.fr" }]
  description = "A library to compress ESRF data and reduce their footprint"
  readme = { file = "README.md", content-type = "text/markdown" }
@@ -69,4 +69,4 @@ omit = ["*/tests/*"]
  
  [tool.isort]
  profile = "black"
- force_single_line = true
+ force_single_line = true
@@ -3,12 +3,15 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
  from tqdm import tqdm
  
  from esrf_data_compressor.checker.ssim import compute_ssim_for_file_pair
+ from esrf_data_compressor.utils.paths import resolve_compressed_path
  
  
- def run_ssim_check(raw_files: list[str], method: str, report_path: str) -> None:
+ def run_ssim_check(
+     raw_files: list[str], method: str, report_path: str, layout: str = "sibling"
+ ) -> None:
      """
      Given a list of raw HDF5 file paths, partitions into:
-       to_check → those with a sibling <stem>_<method>.h5
+       to_check → those with an expected compressed counterpart according to `layout`
        missing → those without one
  
      Writes a report to `report_path`:
@@ -21,9 +24,7 @@ def run_ssim_check(raw_files: list[str], method: str, report_path: str) -> None:
  
      # partition
      for orig in raw_files:
-         dirname, fname = os.path.dirname(orig), os.path.basename(orig)
-         stem, _ = os.path.splitext(fname)
-         comp_path = os.path.join(dirname, f"{stem}_{method}.h5")
+         comp_path = resolve_compressed_path(orig, method, layout=layout)
          if os.path.exists(comp_path):
              to_check.append((orig, comp_path))
          else:
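Editor's note: the partition step this hunk rewrites boils down to the following sketch (a simplification of `run_ssim_check`'s loop, assuming `resolve_compressed_path` behaves as defined in `utils/paths.py` later in this diff):

```python
import os
from esrf_data_compressor.utils.paths import resolve_compressed_path

def partition(raw_files: list[str], method: str, layout: str):
    """Split raw files into (orig, comp) pairs to check and originals missing a counterpart."""
    to_check, missing = [], []
    for orig in raw_files:
        comp_path = resolve_compressed_path(orig, method, layout=layout)
        if os.path.exists(comp_path):
            to_check.append((orig, comp_path))
        else:
            missing.append(orig)
    return to_check, missing
```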
@@ -46,13 +46,13 @@ def do_compress(args):
          exit_with_error(f"Failed to read report '{report}': {e}")
  
      if not files:
-         print("Nothing to compress (TO COMPRESS list is empty).")
+         print("Nothing to compress (TO COMPRESS list is empty).")
          return
  
      print(
-         f"Compressing {len(files)} file(s) from '{report}' using '{args.method}' method and ratio {args.cratio} …"
+         f"Compressing {len(files)} file(s) from '{report}' using '{args.method}' method, ratio {args.cratio}, layout '{args.layout}' …"
      )
-     mgr = CompressorManager(cratio=args.cratio, method=args.method)
+     mgr = CompressorManager(cratio=args.cratio, method=args.method, layout=args.layout)
      mgr.compress_files(files)
      print("Compression complete.\n")
  
@@ -65,15 +65,14 @@ def do_check(args):
          exit_with_error(f"Failed to read report '{report}': {e}")
  
      if not files:
-         print("Nothing to check (TO COMPRESS list is empty).")
+         print("Nothing to check (TO COMPRESS list is empty).")
          return
  
-     # We reuse run_ssim_check in its 3-arg form (raw_files, method, report_path)
      report_fname = f"{os.path.splitext(report)[0]}_{args.method}_ssim_report.txt"
      report_path = os.path.abspath(report_fname)
  
      try:
-         run_ssim_check(files, args.method, report_path)
+         run_ssim_check(files, args.method, report_path, layout=args.layout)
      except SystemExit as e:
          exit_with_error(str(e))
  
@@ -81,9 +80,6 @@ def do_check(args):
  
  
  def do_overwrite(args):
-     """
-     Overwrite TO COMPRESS files with their original sources.
-     """
      report = args.input or "file_list.txt"
      try:
          files = parse_report(report)
@@ -91,13 +87,26 @@
          exit_with_error(f"Failed to read report '{report}': {e}")
  
      if not files:
-         print("Nothing to overwrite (TO COMPRESS list is empty).")
+         print("Nothing to process (TO COMPRESS list is empty).")
          return
  
-     print(f"Overwriting {len(files)} file(s) from '{report}' …")
      mgr = CompressorManager()
+ 
+     if args.final:
+         print(f"Finalizing overwrite for {len(files)} file(s) from '{report}' …")
+         mgr.remove_backups(files)
+         print("Finalize step complete.\n")
+         return
+ 
+     if args.undo:
+         print(f"Undoing overwrite for {len(files)} file(s) from '{report}' …")
+         mgr.restore_backups(files)
+         print("Undo step complete.\n")
+         return
+ 
+     print(f"Overwriting {len(files)} file(s) from '{report}' …")
      mgr.overwrite_files(files)
-     print("Overwrite complete.\n")
+     print("Overwrite complete (backups kept).\n")
  
  
  def main():
@@ -106,7 +115,6 @@ def main():
      )
      sub = parser.add_subparsers(dest="command", required=True)
  
-     # list
      p = sub.add_parser("list", help="Report VDS sources → TO COMPRESS vs REMAINING")
      p.add_argument("experiment", help="Experiment ID")
      p.add_argument("beamline", nargs="?", help="Optional beamline")
@@ -115,13 +123,12 @@ def main():
      p.add_argument(
          "--filter",
          metavar="KEY:VAL[,KEY2:VAL2...]",
-         help="Datasetlevel attribute substring filters",
+         help="Dataset-level attribute substring filters",
      )
      p.add_argument("--output", help="Report file (default = file_list.txt)")
      p.set_defaults(func=do_list)
  
-     # compress
-     p = sub.add_parser("compress", help="Compress only the TO COMPRESS files")
+     p = sub.add_parser("compress", help="Compress only the TO COMPRESS files")
      p.add_argument(
          "--input",
          "-i",
@@ -135,23 +142,47 @@
          default="jp2k",
          help="Compression method",
      )
+     p.add_argument(
+         "--layout",
+         choices=["sibling", "mirror"],
+         default="mirror",
+         help="Output layout: sibling (next to each source) or mirror (under RAW_DATA_COMPRESSED, preserving source names).",
+     )
      p.set_defaults(func=do_compress)
  
-     # check
-     p = sub.add_parser("check", help="Generate SSIM report for TO COMPRESS files")
+     p = sub.add_parser("check", help="Generate SSIM report for TO COMPRESS files")
      p.add_argument(
          "--input", "-i", help="Report file to read (default = file_list.txt)"
      )
      p.add_argument(
          "--method", choices=["jp2k"], default="jp2k", help="Compression method"
      )
+     p.add_argument(
+         "--layout",
+         choices=["sibling", "mirror"],
+         default="mirror",
+         help="Location of compressed files to check.",
+     )
      p.set_defaults(func=do_check)
  
-     # overwrite
-     p = sub.add_parser("overwrite", help="Overwrite only TO COMPRESS files")
+     p = sub.add_parser(
+         "overwrite",
+         help="Swap in compressed files and keep backups; with --final or --undo, perform cleanup/restore only.",
+     )
      p.add_argument(
          "--input", "-i", help="Report file to read (default = file_list.txt)"
      )
+     group = p.add_mutually_exclusive_group()
+     group.add_argument(
+         "--final",
+         action="store_true",
+         help="Cleanup only: delete existing *.h5.bak backups after confirmation (no overwrite).",
+     )
+     group.add_argument(
+         "--undo",
+         action="store_true",
+         help="Restore only: move <file>.h5.bak back to <file>.h5 and preserve the current file as <file>_<method>.h5 when needed.",
+     )
      p.set_defaults(func=do_overwrite)
  
      args = parser.parse_args()
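Editor's note: taken together, the `overwrite` subcommand now has a three-phase lifecycle instead of a single irreversible swap. A minimal sketch of the intended call sequence, using the `CompressorManager` methods added in this release (the report content is hypothetical, and phases 2a/2b are alternatives, not a sequence):

```python
from esrf_data_compressor.compressors.base import CompressorManager

files = ["/path/to/RAW_DATA/sample/ds/scan0001/f1.h5"]  # hypothetical report content
mgr = CompressorManager()

# Phase 1 (`overwrite`): swap compressed files in, keeping <file>.h5.bak backups.
mgr.overwrite_files(files)

# Phase 2a (`overwrite --undo`): put the backups back, preserving the current
# file as <file>_<method>.h5 when it would otherwise be lost.
mgr.restore_backups(files)

# Phase 2b (`overwrite --final`): once satisfied, delete the .h5.bak backups
# (asks for interactive confirmation before removing anything).
mgr.remove_backups(files)
```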
@@ -0,0 +1,271 @@
+ import os
+ import shutil
+ from concurrent.futures import ProcessPoolExecutor, as_completed
+ from tqdm import tqdm
+
+ from esrf_data_compressor.compressors.jp2k import JP2KCompressorWrapper
+ from esrf_data_compressor.utils.paths import (
+     find_dataset_base_h5,
+     resolve_compressed_path,
+     resolve_mirror_path,
+ )
+
+
+ class Compressor:
+     """
+     Abstract base class. Subclasses must implement compress_file().
+     """
+
+     def compress_file(self, input_path: str, output_path: str, **kwargs):
+         raise NotImplementedError
+
+
+ class CompressorManager:
+     """
+     Manages parallel compression and overwrite.
+
+     Each worker process is given up to 4 Blosc2 threads (or fewer if the machine
+     has fewer than 4 cores). The number of worker processes is then
+     total_cores // threads_per_worker (at least 1). If the user explicitly
+     passes `workers`, we cap it to `total_cores`, then recompute threads_per_worker
+     = min(4, total_cores // workers).
+
+     Usage:
+         mgr = CompressorManager(cratio=10, method='jp2k')
+         mgr.compress_files([...])
+         mgr.overwrite_files([...])
+     """
+
+     def __init__(
+         self,
+         workers: int | None = None,
+         cratio: int = 10,
+         method: str = "jp2k",
+         layout: str = "sibling",
+     ):
+         total_cores = os.cpu_count() or 1
+         default_nthreads = 4 if total_cores >= 4 else 1
+         default_workers = max(1, total_cores // default_nthreads)
+
+         if workers is None:
+             w = default_workers
+             nthreads = default_nthreads
+         else:
+             w = min(workers, total_cores)
+             possible = total_cores // w
+             nthreads = min(possible, 4) if possible >= 1 else 1
+
+         self.workers = max(1, w)
+         self.nthreads = max(1, nthreads)
+         self.cratio = cratio
+         self.method = method
+         self.layout = layout
+
+         if self.method == "jp2k":
+             self.compressor = JP2KCompressorWrapper(
+                 cratio=cratio, nthreads=self.nthreads
+             )
+         else:
+             raise ValueError(f"Unsupported compression method: {self.method}")
+
+         print(f"Compression method: {self.method}")
+         print(f"Output layout: {self.layout}")
+         print(f"Total CPU cores: {total_cores}")
+         print(f"Worker processes: {self.workers}")
+         print(f"Threads per worker: {self.nthreads}")
+         print(f"Total threads: {self.workers * self.nthreads}")
+
+     def _compress_worker(self, ipath: str) -> tuple[str, str]:
+         """
+         Worker function for ProcessPoolExecutor: compress a single HDF5:
+           - sibling layout: <same_dir>/<basename>_<method>.h5
+           - mirror layout: mirror RAW_DATA tree under RAW_DATA_COMPRESSED
+         """
+         outp = resolve_compressed_path(ipath, self.method, layout=self.layout)
+         os.makedirs(os.path.dirname(outp), exist_ok=True)
+         self.compressor.compress_file(
+             ipath, outp, cratio=self.cratio, nthreads=self.nthreads
+         )
+         return ipath, "success"
+
+     def _mirror_non_compressed_dataset_content(self, file_list: list[str]) -> None:
+         source_targets = {os.path.realpath(p) for p in file_list}
+         mirror_roots: set[str] = set()
+         for ipath in file_list:
+             base_h5 = find_dataset_base_h5(ipath)
+             dataset_dir = (
+                 os.path.dirname(base_h5) if base_h5 else os.path.dirname(ipath)
+             )
+             # Mirror the parent sample folder too, so sidecar files next to
+             # dataset folders are preserved (e.g. RAW_DATA/<sample>/*.h5).
+             mirror_roots.add(os.path.dirname(dataset_dir))
+
+         for src_dir in sorted(mirror_roots):
+             try:
+                 dst_dir = resolve_mirror_path(src_dir)
+             except ValueError:
+                 print(f"WARNING: Cannot mirror folder outside RAW_DATA: '{src_dir}'")
+                 continue
+
+             for cur, dirs, files in os.walk(src_dir):
+                 rel_cur = os.path.relpath(cur, src_dir)
+                 target_cur = (
+                     dst_dir if rel_cur == "." else os.path.join(dst_dir, rel_cur)
+                 )
+                 os.makedirs(target_cur, exist_ok=True)
+
+                 for dname in dirs:
+                     os.makedirs(os.path.join(target_cur, dname), exist_ok=True)
+
+                 for fname in files:
+                     src_file = os.path.join(cur, fname)
+                     if os.path.realpath(src_file) in source_targets:
+                         # Do not copy raw files that will be produced by compression.
+                         continue
+                     dst_file = os.path.join(target_cur, fname)
+                     shutil.copy2(src_file, dst_file)
+
+     def compress_files(self, file_list: list[str]) -> None:
+         """
+         Compress each .h5 in file_list in parallel.
+           - sibling layout: produce <basename>_<method>.h5 next to each source.
+           - mirror layout: write compressed files to RAW_DATA_COMPRESSED with same file names.
+         Does not overwrite originals. At the end, prints total elapsed time and data rate in MB/s.
+         """
+         valid = [p for p in file_list if p.lower().endswith(".h5")]
+         if not valid:
+             print("No valid .h5 files to compress.")
+             return
+         if self.layout == "mirror":
+             print(
+                 "Preparing RAW_DATA_COMPRESSED with non-compressed dataset content..."
+             )
+             self._mirror_non_compressed_dataset_content(valid)
+
+         total_bytes = 0
+         for f in valid:
+             try:
+                 total_bytes += os.path.getsize(f)
+             except OSError:
+                 pass
+
+         import time
+
+         t0 = time.time()
+
+         with ProcessPoolExecutor(max_workers=self.workers) as executor:
+             futures = {executor.submit(self._compress_worker, p): p for p in valid}
+             for fut in tqdm(
+                 as_completed(futures),
+                 total=len(futures),
+                 desc=f"Compressing HDF5 files ({self.method})",
+                 unit="file",
+             ):
+                 pth = futures[fut]
+                 try:
+                     fut.result()
+                 except Exception as e:
+                     print(f"Failed to compress '{pth}': {e}")
+
+         elapsed = time.time() - t0
+         total_mb = total_bytes / (1024 * 1024)
+         rate_mb_s = total_mb / elapsed if elapsed > 0 else float("inf")
+         print(f"\nTotal elapsed time: {elapsed:.3f}s")
+         print(f"Data processed: {total_mb:.2f} MB ({rate_mb_s:.2f} MB/s)\n")
+
+     def overwrite_files(self, file_list: list[str]) -> None:
+         """
+         Overwrites files only if they have a compressed sibling:
+
+         1) Rename <file>.h5 → <file>.h5.bak
+         2) Rename <file>_<method>.h5 → <file>.h5
+
+         Backups (.h5.bak) are kept; use remove_backups() or restore_backups() afterwards.
+         """
+         for ipath in file_list:
+             if not ipath.lower().endswith(".h5"):
+                 continue
+
+             compressed_path = resolve_compressed_path(
+                 ipath, self.method, layout=self.layout
+             )
+
+             if os.path.exists(compressed_path):
+                 backup = ipath + ".bak"
+                 try:
+                     os.replace(ipath, backup)
+                     os.replace(compressed_path, ipath)
+                     print(f"Overwritten '{ipath}' (backup at '{backup}').")
+                 except Exception as e:
+                     print(f"ERROR overwriting '{ipath}': {e}")
+             else:
+                 print(f"SKIP (no compressed file): {ipath}")
+
+     def remove_backups(self, file_list: list[str]) -> None:
+         candidates = {p + ".bak" for p in file_list if p.lower().endswith(".h5")}
+         backups = [b for b in candidates if os.path.exists(b)]
+         if not backups:
+             print("No backup files to remove.")
+             return
+
+         total_bytes = 0
+         for b in backups:
+             try:
+                 total_bytes += os.path.getsize(b)
+             except OSError:
+                 pass
+         total_mb = total_bytes / (1024 * 1024)
+
+         print(
+             f"About to remove {len(backups)} backup file(s), ~{total_mb:.2f} MB total."
+         )
+         ans = input("Proceed? [y/N]: ").strip().lower()
+         if ans not in ("y", "yes"):
+             print("Backups kept.")
+             return
+
+         removed = 0
+         for b in backups:
+             try:
+                 os.remove(b)
+                 removed += 1
+             except Exception as e:
+                 print(f"ERROR deleting backup '{b}': {e}")
+
+         print(f"Deleted {removed} backup file(s).")
+
+     def restore_backups(self, file_list: list[str]) -> None:
+         restored = 0
+         preserved = 0
+         for ipath in file_list:
+             if not ipath.lower().endswith(".h5"):
+                 continue
+
+             backup = ipath + ".bak"
+             method_path = resolve_compressed_path(
+                 ipath, self.method, layout=self.layout
+             )
+
+             if not os.path.exists(backup):
+                 print(f"SKIP (no backup): {ipath}")
+                 continue
+
+             if os.path.exists(ipath) and not os.path.exists(method_path):
+                 try:
+                     os.replace(ipath, method_path)
+                     preserved += 1
+                     print(f"Preserved current file to '{method_path}'.")
+                 except Exception as e:
+                     print(f"ERROR preserving current '{ipath}' to '{method_path}': {e}")
+                     continue
+
+             try:
+                 os.replace(backup, ipath)
+                 restored += 1
+                 print(f"Restored '{ipath}' from backup.")
+             except Exception as e:
+                 print(f"ERROR restoring '{ipath}' from '{backup}': {e}")
+
+         print(
+             f"Restore complete. Restored: {restored}, preserved compressed copies: {preserved}."
+         )
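Editor's note: the worker×thread auto-factoring described in the docstring above is plain integer arithmetic. A standalone sketch that mirrors (but does not import) the constructor logic, with a few worked values:

```python
def factor_workers(total_cores: int, workers: int | None = None) -> tuple[int, int]:
    """Return (workers, threads_per_worker) so that workers * threads <= total_cores."""
    default_nthreads = 4 if total_cores >= 4 else 1
    if workers is None:
        w = max(1, total_cores // default_nthreads)
        nthreads = default_nthreads
    else:
        w = min(workers, total_cores)  # cap to available cores
        possible = total_cores // w
        nthreads = min(possible, 4) if possible >= 1 else 1
    return max(1, w), max(1, nthreads)

assert factor_workers(16) == (4, 4)           # 16 cores -> 4 workers x 4 threads
assert factor_workers(16, workers=8) == (8, 2)  # explicit workers -> threads recomputed
assert factor_workers(2) == (2, 1)            # fewer than 4 cores -> 1 thread each
```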
@@ -109,7 +109,7 @@ def test_commands_with_non_empty_list(
      # Run command
      argv = [cmd, "-i", "report.txt"]
      if cmd == "compress":
-         argv += ["--cratio", "5", "--method", "jp2k"]
+         argv += ["--cratio", "5", "--method", "jp2k", "--layout", "sibling"]
      argv_runner(argv)
      out = capsys.readouterr().out
      assert msg_start in out
@@ -119,12 +119,14 @@ def test_commands_with_non_empty_list(
      for f in files:
          comp = tmp_path / f.replace(".h5", "_jp2k.h5")
          assert comp.exists()
-     # For overwrite, verify original replaced and backup removed
+     # For overwrite, verify original replaced and backup KEPT
      if cmd == "overwrite":
          # f1 was overwritten, f2 was skipped
          assert (tmp_path / "f1.h5").exists()
-         # no backup remains
-         assert not (tmp_path / "f1.h5.bak").exists()
+         # backup remains by default
+         assert (tmp_path / "f1.h5.bak").exists()
+         # f2 had no compressed sibling → no backup
+         assert not (tmp_path / "f2.h5.bak").exists()
  
  
  def test_list_success_and_output_file(argv_runner, monkeypatch, capsys, tmp_path):
  def test_list_success_and_output_file(argv_runner, monkeypatch, capsys, tmp_path):
@@ -150,7 +152,7 @@ def test_list_success_and_output_file(argv_runner, monkeypatch, capsys, tmp_path
150
152
  [
151
153
  ("compress", "Nothing to compress"),
152
154
  ("check", "Nothing to check"),
153
- ("overwrite", "Nothing to overwrite"),
155
+ ("overwrite", "Nothing to process"),
154
156
  ],
155
157
  )
156
158
  def test_empty_reports(argv_runner, monkeypatch, capsys, cmd, empty_msg, tmp_path):
@@ -165,12 +167,96 @@ def test_empty_reports(argv_runner, monkeypatch, capsys, cmd, empty_msg, tmp_pat
  
  def test_check_success_writes_report(argv_runner, monkeypatch, capsys, tmp_path):
      monkeypatch.setattr(cli, "parse_report", lambda rpt: ["f"])
  
-     def run(files, method, out):
+     def run(files, method, out, layout):
+         assert layout == "sibling"
          with open(out, "w") as f:
              f.write("ok")
  
      monkeypatch.setattr(cli, "run_ssim_check", run)
      report = tmp_path / "rpt.txt"
-     argv_runner(["check", "-i", str(report), "--method", "jp2k"])
+     argv_runner(["check", "-i", str(report), "--method", "jp2k", "--layout", "sibling"])
      out = capsys.readouterr().out
      assert "SSIM report written to" in out
+
+
+ def test_compress_mirror_layout_creates_under_raw_data_compressed(
+     argv_runner, monkeypatch, tmp_path
+ ):
+     ds = tmp_path / "RAW_DATA" / "sampleA" / "ds1"
+     src = ds / "scan0001" / "f1.h5"
+     src.parent.mkdir(parents=True)
+     src.write_text("data")
+     base = ds / "dataset.h5"
+     base.write_text("base")
+     sample_sidecar = tmp_path / "RAW_DATA" / "sampleA" / "sample_sidecar.h5"
+     sample_sidecar.write_text("sidecar")
+     side = ds / "scan0002" / "meta.txt"
+     side.parent.mkdir(parents=True)
+     side.write_text("meta")
+     monkeypatch.setattr(cli, "parse_report", lambda rpt: [str(src)])
+     monkeypatch.setattr(
+         JP2KCompressorWrapper,
+         "compress_file",
+         lambda self, inp, out, **kw: open(out, "w").close(),
+     )
+
+     argv_runner(
+         [
+             "compress",
+             "-i",
+             "report.txt",
+             "--cratio",
+             "5",
+             "--method",
+             "jp2k",
+             "--layout",
+             "mirror",
+         ]
+     )
+
+     # The dataset base/filter file is mirrored under RAW_DATA_COMPRESSED.
+     assert (
+         tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "dataset.h5"
+     ).exists()
+     # Compressed file keeps the same source name under mirrored scan path.
+     assert (
+         tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "scan0001" / "f1.h5"
+     ).exists()
+     assert (
+         tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "scan0002" / "meta.txt"
+     ).exists()
+     assert (tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "sample_sidecar.h5").exists()
+
+
+ def test_overwrite_final_deletes_backups(argv_runner, monkeypatch, capsys, tmp_path):
+     # Prepare a file and its backup
+     (tmp_path / "f1.h5").write_text("current")
+     (tmp_path / "f1.h5.bak").write_text("backup")
+     # parse_report returns the original .h5 path(s)
+     monkeypatch.setattr(cli, "parse_report", lambda rpt: [str(tmp_path / "f1.h5")])
+     # auto-confirm deletion
+     monkeypatch.setattr("builtins.input", lambda *a, **k: "y")
+
+     argv_runner(["overwrite", "-i", "report.txt", "--final"])
+     out = capsys.readouterr().out
+     assert "About to remove" in out
+     assert not (tmp_path / "f1.h5.bak").exists()
+
+
+ def test_overwrite_undo_restores_and_preserves(
+     argv_runner, monkeypatch, capsys, tmp_path
+ ):
+     # Start with current file and a backup; no <method> file yet
+     (tmp_path / "f1.h5").write_text("CUR")
+     (tmp_path / "f1.h5.bak").write_text("BAK")
+     monkeypatch.setattr(cli, "parse_report", lambda rpt: [str(tmp_path / "f1.h5")])
+
+     argv_runner(["overwrite", "-i", "report.txt", "--undo"])
+     out = capsys.readouterr().out
+     assert "Undoing overwrite" in out
+     # Backup should have been restored to f1.h5
+     assert (tmp_path / "f1.h5").read_text() == "BAK"
+     # Previous current should have been preserved as f1_jp2k.h5
+     assert (tmp_path / "f1_jp2k.h5").read_text() == "CUR"
+     # .bak should be gone after restore (moved)
+     assert not (tmp_path / "f1.h5.bak").exists()
@@ -0,0 +1,36 @@
+ import pytest
+
+ from esrf_data_compressor.utils.paths import (
+     find_dataset_base_h5,
+     resolve_compressed_path,
+     resolve_mirror_path,
+ )
+
+
+ def test_resolve_compressed_path_sibling():
+     p = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"
+     out = resolve_compressed_path(p, "jp2k", layout="sibling")
+     assert out == "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1_jp2k.h5"
+
+
+ def test_resolve_compressed_path_mirror():
+     p = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"
+     out = resolve_compressed_path(p, "jp2k", layout="mirror")
+     assert out == "/data/visitor/e/bl/s/RAW_DATA_COMPRESSED/sample/ds/f1.h5"
+
+
+ def test_resolve_mirror_path_requires_raw_data():
+     with pytest.raises(ValueError):
+         resolve_mirror_path("/tmp/no_raw_data_here/f1.h5")
+
+
+ def test_find_dataset_base_h5(tmp_path):
+     ds = tmp_path / "RAW_DATA" / "sample" / "ds1"
+     scan = ds / "scan0001"
+     scan.mkdir(parents=True)
+     base = ds / "dataset.h5"
+     base.write_text("base")
+     src = scan / "frames.h5"
+     src.write_text("source")
+
+     assert find_dataset_base_h5(str(src)) == str(base)
@@ -105,3 +105,21 @@ def test_ssim_error_handling(tmp_path, monkeypatch):
      # should include an ERROR line mentioning the exception message
      assert any("ERROR processing file pair" in line for line in lines)
      assert any("Error" in line for line in lines)
+
+
+ def test_mirror_layout_finds_compressed_file(tmp_path, monkeypatch):
+     raw = tmp_path / "RAW_DATA" / "sample" / "ds" / "d3.h5"
+     comp = tmp_path / "RAW_DATA_COMPRESSED" / "sample" / "ds" / "d3.h5"
+     raw.parent.mkdir(parents=True)
+     comp.parent.mkdir(parents=True)
+     raw.write_text("r3")
+     comp.write_text("c3")
+     report = tmp_path / "report.txt"
+
+     monkeypatch.setattr(rs, "compute_ssim_for_file_pair", lambda o, c: ("d3", ["ok"]))
+
+     rs.run_ssim_check(
+         [str(raw)], method="method", report_path=str(report), layout="mirror"
+     )
+     lines = _read_report(report)
+     assert lines[2] == f"Compressed file: {comp}"
@@ -0,0 +1,81 @@
+ import os
+ from pathlib import Path
+ import re
+
+
+ def resolve_mirror_path(
+     input_path: str,
+     *,
+     source_root: str = "RAW_DATA",
+     target_root: str = "RAW_DATA_COMPRESSED",
+ ) -> str:
+     """
+     Build a mirrored path under `target_root` by replacing the `source_root`
+     segment in `input_path`.
+     """
+     parts = Path(input_path).parts
+     if source_root not in parts:
+         raise ValueError(
+             f"Cannot mirror path '{input_path}': missing '{source_root}' segment."
+         )
+     idx = parts.index(source_root)
+     return str(Path(*parts[:idx], target_root, *parts[idx + 1 :]))
+
+
+ def resolve_compressed_path(
+     input_path: str,
+     method: str,
+     *,
+     layout: str = "sibling",
+     source_root: str = "RAW_DATA",
+     target_root: str = "RAW_DATA_COMPRESSED",
+ ) -> str:
+     if layout == "sibling":
+         base_name = os.path.splitext(os.path.basename(input_path))[0]
+         compressed_name = f"{base_name}_{method}.h5"
+         return os.path.join(os.path.dirname(input_path), compressed_name)
+     if layout == "mirror":
+         # In mirror mode, compressed files keep the same file name as source.
+         return resolve_mirror_path(
+             input_path, source_root=source_root, target_root=target_root
+         )
+     raise ValueError(f"Unsupported layout: {layout}")
+
+
+ def find_dataset_base_h5(
+     input_path: str,
+     *,
+     source_root: str = "RAW_DATA",
+ ) -> str | None:
+     """
+     Walk up from `input_path` to find the dataset directory that contains:
+       - exactly one .h5 file (the base/filter file)
+       - at least one scanXXXX subdirectory
+     Returns the absolute path to that .h5, or None when not found.
+     """
+     scan_re = re.compile(r"^scan\d{4}$", re.IGNORECASE)
+     p = Path(input_path).resolve()
+     parts = p.parts
+     if source_root not in parts:
+         return None
+
+     root_idx = parts.index(source_root)
+     cur = p.parent
+     while True:
+         if len(cur.parts) < root_idx + 1:
+             return None
+
+         try:
+             entries = list(cur.iterdir())
+         except OSError:
+             entries = []
+
+         h5_files = [e for e in entries if e.is_file() and e.suffix.lower() == ".h5"]
+         has_scan = any(e.is_dir() and scan_re.match(e.name) for e in entries)
+
+         if has_scan and len(h5_files) == 1:
+             return str(h5_files[0])
+
+         if len(cur.parts) == root_idx + 1:
+             return None
+         cur = cur.parent
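Editor's note: a short usage sketch of these helpers (the paths are hypothetical; expected results follow the assertions in test_paths.py above):

```python
from esrf_data_compressor.utils.paths import (
    resolve_compressed_path,
    resolve_mirror_path,
)

src = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"  # hypothetical path

# sibling: rename in place, tagging the method
print(resolve_compressed_path(src, "jp2k", layout="sibling"))
# -> /data/visitor/e/bl/s/RAW_DATA/sample/ds/f1_jp2k.h5

# mirror: same file name, RAW_DATA segment swapped for RAW_DATA_COMPRESSED
print(resolve_compressed_path(src, "jp2k", layout="mirror"))
# -> /data/visitor/e/bl/s/RAW_DATA_COMPRESSED/sample/ds/f1.h5

# paths without a RAW_DATA segment cannot be mirrored
try:
    resolve_mirror_path("/tmp/elsewhere/f1.h5")
except ValueError as e:
    print(e)
```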
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: esrf-data-compressor
- Version: 0.1.1
+ Version: 0.2.0
  Summary: A library to compress ESRF data and reduce their footprint
  Author-email: ESRF <dau-pydev@esrf.fr>
  License: MIT License
@@ -84,7 +84,9 @@
  
  * **Non-destructive workflow**
  
- 1. `compress` writes a sibling file `<basename>_<compression_method>.h5` next to each original
+ 1. `compress` writes compressed files either:
+    - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
+    - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
  2. `check` computes SSIM (first and last frames) and writes a report
  3. `overwrite` (optional) swaps out the raw frame file (irreversible)
  
@@ -20,8 +20,10 @@ src/esrf_data_compressor/tests/test_cli.py
  src/esrf_data_compressor/tests/test_finder.py
  src/esrf_data_compressor/tests/test_hdf5_helpers.py
  src/esrf_data_compressor/tests/test_jp2k.py
+ src/esrf_data_compressor/tests/test_paths.py
  src/esrf_data_compressor/tests/test_run_check.py
  src/esrf_data_compressor/tests/test_ssim.py
  src/esrf_data_compressor/tests/test_utils.py
  src/esrf_data_compressor/utils/hdf5_helpers.py
+ src/esrf_data_compressor/utils/paths.py
  src/esrf_data_compressor/utils/utils.py
@@ -1,167 +0,0 @@
- import os
- from concurrent.futures import ProcessPoolExecutor, as_completed
- from tqdm import tqdm
-
- from esrf_data_compressor.compressors.jp2k import JP2KCompressorWrapper
-
-
- class Compressor:
-     """
-     Abstract base class. Subclasses must implement compress_file().
-     """
-
-     def compress_file(self, input_path: str, output_path: str, **kwargs):
-         raise NotImplementedError
-
-
- class CompressorManager:
-     """
-     Manages parallel compression and overwrite.
-
-     Each worker process is given up to 4 Blosc2 threads (or fewer if the machine
-     has fewer than 4 cores). The number of worker processes is then
-     total_cores // threads_per_worker (at least 1). If the user explicitly
-     passes `workers`, we cap it to `total_cores`, then recompute threads_per_worker
-     = min(4, total_cores // workers).
-
-     Usage:
-         mgr = CompressorManager(cratio=10, method='jp2k')
-         mgr.compress_files([...])
-         mgr.overwrite_files([...])
-     """
-
-     def __init__(
-         self, workers: int | None = None, cratio: int = 10, method: str = "jp2k"
-     ):
-         total_cores = os.cpu_count() or 1
-
-         # Determine default threads per worker (4, or fewer if total_cores < 4)
-         if total_cores >= 4:
-             default_nthreads = 4
-         else:
-             default_nthreads = 1
-
-         # Default worker count
-         default_workers = max(1, total_cores // default_nthreads)
-
-         if workers is None:
-             # Use default workers and default_nthreads
-             w = default_workers
-             nthreads = default_nthreads
-         else:
-             # Cap workers to total_cores
-             w = min(workers, total_cores)
-             # Recompute threads per worker so that (w * nthreads) ≤ total_cores, up to 4
-             possible = total_cores // w
-             nthreads = min(possible, 4) if possible >= 1 else 1
-
-         self.workers = max(1, w)
-         self.nthreads = max(1, nthreads)
-         self.cratio = cratio
-         self.method = method
-
-         # Instantiate compressor based on method
-         if self.method == "jp2k":
-             self.compressor = JP2KCompressorWrapper(
-                 cratio=cratio, nthreads=self.nthreads
-             )
-         else:
-             raise ValueError(f"Unsupported compression method: {self.method}")
-
-         print(f"Compression method: {self.method}")
-         print(f"Total CPU cores: {total_cores}")
-         print(f"Worker processes: {self.workers}")
-         print(f"Threads per worker: {self.nthreads}")
-         print(f"Total threads: {self.workers * self.nthreads}")
-
-     def _compress_worker(self, ipath: str) -> tuple[str, str]:
-         """
-         Worker function for ProcessPoolExecutor: compress a single HDF5:
-           <ipath>.h5 → <same_dir>/<basename>_<method>.h5
-         """
-         base, _ = os.path.splitext(ipath)
-         outp = f"{base}_{self.method}.h5"
-         self.compressor.compress_file(
-             ipath, outp, cratio=self.cratio, nthreads=self.nthreads
-         )
-         return ipath, "success"
-
-     def compress_files(self, file_list: list[str]) -> None:
-         """
-         Compress each .h5 in file_list in parallel, producing <basename>_<method>.h5
-         next to each source file. Does not overwrite originals. At the end, prints
-         total elapsed time and data rate in MB/s.
-         """
-         valid = [p for p in file_list if p.lower().endswith(".h5")]
-         if not valid:
-             print("No valid .h5 files to compress.")
-             return
-
-         total_bytes = 0
-         for f in valid:
-             try:
-                 total_bytes += os.path.getsize(f)
-             except OSError:
-                 pass
-
-         import time
-
-         t0 = time.time()
-
-         with ProcessPoolExecutor(max_workers=self.workers) as executor:
-             futures = {executor.submit(self._compress_worker, p): p for p in valid}
-             for fut in tqdm(
-                 as_completed(futures),
-                 total=len(futures),
-                 desc=f"Compressing HDF5 files ({self.method})",
-                 unit="file",
-             ):
-                 pth = futures[fut]
-                 try:
-                     fut.result()
-                 except Exception as e:
-                     print(f"Failed to compress '{pth}': {e}")
-
-         t1 = time.time()
-         elapsed = t1 - t0
-         total_mb = total_bytes / (1024 * 1024)
-         rate_mb_s = total_mb / elapsed if elapsed > 0 else float("inf")
-         print(f"\nTotal elapsed time: {elapsed:.3f}s")
-         print(f"Data processed: {total_mb:.2f} MB ({rate_mb_s:.2f} MB/s)\n")
-
-     def overwrite_files(self, file_list: list[str]) -> None:
-         """
-         Overwrites files only if they have a compressed sibling:
-
-         1) Rename <file>.h5 → <file>.h5.bak
-         2) Rename <file>_<method>.h5 → <file>.h5
-
-         After processing all files, removes the backup .h5.bak files.
-         """
-         backups = []
-         for ipath in file_list:
-             if not ipath.lower().endswith(".h5"):
-                 continue
-
-             base, _ = os.path.splitext(ipath)
-             compressed_path = f"{base}_{self.method}.h5"
-
-             if os.path.exists(compressed_path):
-                 backup = ipath + ".bak"
-                 try:
-                     os.replace(ipath, backup)
-                     os.replace(compressed_path, ipath)
-                     backups.append(backup)
-                     print(f"Overwritten '{ipath}' (backup at '{backup}').")
-                 except Exception as e:
-                     print(f"ERROR overwriting '{ipath}': {e}")
-             else:
-                 print(f"SKIP (no compressed file): {ipath}")
-
-         # Remove all backup files
-         for backup in backups:
-             try:
-                 os.remove(backup)
-                 print(f"Deleted backup '{backup}'.")
-             except Exception as e:
-                 print(f"ERROR deleting backup '{backup}': {e}")