esrf-data-compressor 0.1.2__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {esrf_data_compressor-0.1.2/src/esrf_data_compressor.egg-info → esrf_data_compressor-0.2.0}/PKG-INFO +4 -2
  2. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/README.md +4 -2
  3. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/pyproject.toml +2 -2
  4. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/checker/run_check.py +6 -5
  5. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/cli.py +15 -3
  6. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/compressors/base.py +69 -11
  7. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_cli.py +53 -3
  8. esrf_data_compressor-0.2.0/src/esrf_data_compressor/tests/test_paths.py +36 -0
  9. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_run_check.py +18 -0
  10. esrf_data_compressor-0.2.0/src/esrf_data_compressor/utils/paths.py +81 -0
  11. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0/src/esrf_data_compressor.egg-info}/PKG-INFO +4 -2
  12. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/SOURCES.txt +2 -0
  13. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/LICENSE +0 -0
  14. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/setup.cfg +0 -0
  15. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/__init__.py +0 -0
  16. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/checker/ssim.py +0 -0
  17. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/compressors/__init__.py +0 -0
  18. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/compressors/jp2k.py +0 -0
  19. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/finder/finder.py +0 -0
  20. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/__init__.py +0 -0
  21. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_finder.py +0 -0
  22. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_hdf5_helpers.py +0 -0
  23. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_jp2k.py +0 -0
  24. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_ssim.py +0 -0
  25. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_utils.py +0 -0
  26. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/utils/hdf5_helpers.py +0 -0
  27. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/utils/utils.py +0 -0
  28. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/dependency_links.txt +0 -0
  29. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/entry_points.txt +0 -0
  30. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/requires.txt +0 -0
  31. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: esrf-data-compressor
3
- Version: 0.1.2
3
+ Version: 0.2.0
4
4
  Summary: A library to compress ESRF data and reduce their footprint
5
5
  Author-email: ESRF <dau-pydev@esrf.fr>
6
6
  License: MIT License
@@ -84,7 +84,9 @@ Dynamic: license-file
84
84
 
85
85
  * **Non-destructive workflow**
86
86
 
87
- 1. `compress` writes a sibling file `<basename>_<compression_method>.h5` next to each original
87
+ 1. `compress` writes compressed files either:
88
+ - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
89
+ - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
88
90
  2. `check` computes SSIM (first and last frames) and writes a report
89
91
  3. `overwrite` (optional) swaps out the raw frame file (irreversible)
90
92
 
@@ -23,7 +23,9 @@
23
23
 
24
24
  * **Non-destructive workflow**
25
25
 
26
- 1. `compress` writes a sibling file `<basename>_<compression_method>.h5` next to each original
26
+ 1. `compress` writes compressed files either:
27
+ - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
28
+ - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
27
29
  2. `check` computes SSIM (first and last frames) and writes a report
28
30
  3. `overwrite` (optional) swaps out the raw frame file (irreversible)
29
31
 
@@ -119,4 +121,4 @@ All noteworthy changes are recorded in [CHANGELOG.md](CHANGELOG.md). Version 0.1
119
121
  * Four-command CLI (`compress-hdf5 list`, `compress-hdf5 compress`, `compress-hdf5 check`, `compress-hdf5 overwrite`).
120
122
  * Parallelism with worker×thread auto-factoring.
121
123
 
122
- For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).
124
+ For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "esrf-data-compressor"
7
- version = "0.1.2"
7
+ version = "0.2.0"
8
8
  authors = [{ name = "ESRF", email = "dau-pydev@esrf.fr" }]
9
9
  description = "A library to compress ESRF data and reduce their footprint"
10
10
  readme = { file = "README.md", content-type = "text/markdown" }
@@ -69,4 +69,4 @@ omit = ["*/tests/*"]
69
69
 
70
70
  [tool.isort]
71
71
  profile = "black"
72
- force_single_line = true
72
+ force_single_line = true
@@ -3,12 +3,15 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
3
3
  from tqdm import tqdm
4
4
 
5
5
  from esrf_data_compressor.checker.ssim import compute_ssim_for_file_pair
6
+ from esrf_data_compressor.utils.paths import resolve_compressed_path
6
7
 
7
8
 
8
- def run_ssim_check(raw_files: list[str], method: str, report_path: str) -> None:
9
+ def run_ssim_check(
10
+ raw_files: list[str], method: str, report_path: str, layout: str = "sibling"
11
+ ) -> None:
9
12
  """
10
13
  Given a list of raw HDF5 file paths, partitions into:
11
- to_check → those with a sibling <stem>_<method>.h5
14
+ to_check → those with an expected compressed counterpart according to `layout`
12
15
  missing → those without one
13
16
 
14
17
  Writes a report to `report_path`:
@@ -21,9 +24,7 @@ def run_ssim_check(raw_files: list[str], method: str, report_path: str) -> None:
21
24
 
22
25
  # partition
23
26
  for orig in raw_files:
24
- dirname, fname = os.path.dirname(orig), os.path.basename(orig)
25
- stem, _ = os.path.splitext(fname)
26
- comp_path = os.path.join(dirname, f"{stem}_{method}.h5")
27
+ comp_path = resolve_compressed_path(orig, method, layout=layout)
27
28
  if os.path.exists(comp_path):
28
29
  to_check.append((orig, comp_path))
29
30
  else:
@@ -50,9 +50,9 @@ def do_compress(args):
50
50
  return
51
51
 
52
52
  print(
53
- f"Compressing {len(files)} file(s) from '{report}' using '{args.method}' method and ratio {args.cratio} …"
53
+ f"Compressing {len(files)} file(s) from '{report}' using '{args.method}' method, ratio {args.cratio}, layout '{args.layout}' …"
54
54
  )
55
- mgr = CompressorManager(cratio=args.cratio, method=args.method)
55
+ mgr = CompressorManager(cratio=args.cratio, method=args.method, layout=args.layout)
56
56
  mgr.compress_files(files)
57
57
  print("Compression complete.\n")
58
58
 
@@ -72,7 +72,7 @@ def do_check(args):
72
72
  report_path = os.path.abspath(report_fname)
73
73
 
74
74
  try:
75
- run_ssim_check(files, args.method, report_path)
75
+ run_ssim_check(files, args.method, report_path, layout=args.layout)
76
76
  except SystemExit as e:
77
77
  exit_with_error(str(e))
78
78
 
@@ -142,6 +142,12 @@ def main():
142
142
  default="jp2k",
143
143
  help="Compression method",
144
144
  )
145
+ p.add_argument(
146
+ "--layout",
147
+ choices=["sibling", "mirror"],
148
+ default="mirror",
149
+ help="Output layout: sibling (next to each source) or mirror (under RAW_DATA_COMPRESSED, preserving source names).",
150
+ )
145
151
  p.set_defaults(func=do_compress)
146
152
 
147
153
  p = sub.add_parser("check", help="Generate SSIM report for TO COMPRESS files")
@@ -151,6 +157,12 @@ def main():
151
157
  p.add_argument(
152
158
  "--method", choices=["jp2k"], default="jp2k", help="Compression method"
153
159
  )
160
+ p.add_argument(
161
+ "--layout",
162
+ choices=["sibling", "mirror"],
163
+ default="mirror",
164
+ help="Location of compressed files to check.",
165
+ )
154
166
  p.set_defaults(func=do_check)
155
167
 
156
168
  p = sub.add_parser(
@@ -1,8 +1,14 @@
1
1
  import os
2
+ import shutil
2
3
  from concurrent.futures import ProcessPoolExecutor, as_completed
3
4
  from tqdm import tqdm
4
5
 
5
6
  from esrf_data_compressor.compressors.jp2k import JP2KCompressorWrapper
7
+ from esrf_data_compressor.utils.paths import (
8
+ find_dataset_base_h5,
9
+ resolve_compressed_path,
10
+ resolve_mirror_path,
11
+ )
6
12
 
7
13
 
8
14
  class Compressor:
@@ -31,7 +37,11 @@ class CompressorManager:
31
37
  """
32
38
 
33
39
  def __init__(
34
- self, workers: int | None = None, cratio: int = 10, method: str = "jp2k"
40
+ self,
41
+ workers: int | None = None,
42
+ cratio: int = 10,
43
+ method: str = "jp2k",
44
+ layout: str = "sibling",
35
45
  ):
36
46
  total_cores = os.cpu_count() or 1
37
47
  default_nthreads = 4 if total_cores >= 4 else 1
@@ -49,6 +59,7 @@ class CompressorManager:
49
59
  self.nthreads = max(1, nthreads)
50
60
  self.cratio = cratio
51
61
  self.method = method
62
+ self.layout = layout
52
63
 
53
64
  if self.method == "jp2k":
54
65
  self.compressor = JP2KCompressorWrapper(
@@ -58,6 +69,7 @@ class CompressorManager:
58
69
  raise ValueError(f"Unsupported compression method: {self.method}")
59
70
 
60
71
  print(f"Compression method: {self.method}")
72
+ print(f"Output layout: {self.layout}")
61
73
  print(f"Total CPU cores: {total_cores}")
62
74
  print(f"Worker processes: {self.workers}")
63
75
  print(f"Threads per worker: {self.nthreads}")
@@ -66,25 +78,69 @@ class CompressorManager:
66
78
  def _compress_worker(self, ipath: str) -> tuple[str, str]:
67
79
  """
68
80
  Worker function for ProcessPoolExecutor: compress a single HDF5:
69
- <ipath>.h5 <same_dir>/<basename>_<method>.h5
81
+ - sibling layout: <same_dir>/<basename>_<method>.h5
82
+ - mirror layout: mirror RAW_DATA tree under RAW_DATA_COMPRESSED
70
83
  """
71
- base, _ = os.path.splitext(ipath)
72
- outp = f"{base}_{self.method}.h5"
84
+ outp = resolve_compressed_path(ipath, self.method, layout=self.layout)
85
+ os.makedirs(os.path.dirname(outp), exist_ok=True)
73
86
  self.compressor.compress_file(
74
87
  ipath, outp, cratio=self.cratio, nthreads=self.nthreads
75
88
  )
76
89
  return ipath, "success"
77
90
 
91
def _mirror_non_compressed_dataset_content(self, file_list: list[str]) -> None:
    """
    Replicate, under the mirrored tree, the content surrounding the listed
    files while leaving out the listed files themselves.

    For every path in `file_list`, the parent of its dataset directory is
    selected. The dataset directory is the folder of the file returned by
    `find_dataset_base_h5`, or the file's own folder when that returns
    nothing. Each selected folder is walked: its sub-directories are created
    under the location given by `resolve_mirror_path`, and its files are
    copied there with `shutil.copy2`, except files whose real path belongs
    to `file_list`. A folder for which `resolve_mirror_path` raises
    ValueError is reported with a warning and skipped.
    """
    # Real paths of the files that the compression step writes itself.
    skip_real_paths = {os.path.realpath(path) for path in file_list}

    def _parent_of_dataset_dir(path: str) -> str:
        # Going up two levels from the base file (or from the source file
        # when no base file is found) also picks up sidecar files stored
        # next to the dataset folders (e.g. RAW_DATA/<sample>/*.h5).
        anchor = find_dataset_base_h5(path) or path
        return os.path.dirname(os.path.dirname(anchor))

    folders_to_mirror = {_parent_of_dataset_dir(path) for path in file_list}

    for src_dir in sorted(folders_to_mirror):
        try:
            mirrored_root = resolve_mirror_path(src_dir)
        except ValueError:
            print(f"WARNING: Cannot mirror folder outside RAW_DATA: '{src_dir}'")
            continue

        for walk_dir, subdir_names, file_names in os.walk(src_dir):
            relative = os.path.relpath(walk_dir, src_dir)
            if relative == ".":
                out_dir = mirrored_root
            else:
                out_dir = os.path.join(mirrored_root, relative)
            os.makedirs(out_dir, exist_ok=True)

            for subdir_name in subdir_names:
                os.makedirs(os.path.join(out_dir, subdir_name), exist_ok=True)

            for file_name in file_names:
                origin = os.path.join(walk_dir, file_name)
                if os.path.realpath(origin) not in skip_real_paths:
                    # NOTE(review): copy2 replaces a destination file that
                    # already exists - confirm that re-running should
                    # overwrite earlier content of the mirrored tree.
                    shutil.copy2(origin, os.path.join(out_dir, file_name))
127
+
78
128
  def compress_files(self, file_list: list[str]) -> None:
79
129
  """
80
- Compress each .h5 in file_list in parallel, producing <basename>_<method>.h5
81
- next to each source file. Does not overwrite originals. At the end, prints
82
- total elapsed time and data rate in MB/s.
130
+ Compress each .h5 in file_list in parallel.
131
+ - sibling layout: produce <basename>_<method>.h5 next to each source.
132
+ - mirror layout: write compressed files to RAW_DATA_COMPRESSED with same file names.
133
+ Does not overwrite originals. At the end, prints total elapsed time and data rate in MB/s.
83
134
  """
84
135
  valid = [p for p in file_list if p.lower().endswith(".h5")]
85
136
  if not valid:
86
137
  print("No valid .h5 files to compress.")
87
138
  return
139
+ if self.layout == "mirror":
140
+ print(
141
+ "Preparing RAW_DATA_COMPRESSED with non-compressed dataset content..."
142
+ )
143
+ self._mirror_non_compressed_dataset_content(valid)
88
144
 
89
145
  total_bytes = 0
90
146
  for f in valid:
@@ -130,8 +186,9 @@ class CompressorManager:
130
186
  if not ipath.lower().endswith(".h5"):
131
187
  continue
132
188
 
133
- base, _ = os.path.splitext(ipath)
134
- compressed_path = f"{base}_{self.method}.h5"
189
+ compressed_path = resolve_compressed_path(
190
+ ipath, self.method, layout=self.layout
191
+ )
135
192
 
136
193
  if os.path.exists(compressed_path):
137
194
  backup = ipath + ".bak"
@@ -184,9 +241,10 @@ class CompressorManager:
184
241
  if not ipath.lower().endswith(".h5"):
185
242
  continue
186
243
 
187
- base, _ = os.path.splitext(ipath)
188
244
  backup = ipath + ".bak"
189
- method_path = f"{base}_{self.method}.h5"
245
+ method_path = resolve_compressed_path(
246
+ ipath, self.method, layout=self.layout
247
+ )
190
248
 
191
249
  if not os.path.exists(backup):
192
250
  print(f"SKIP (no backup): {ipath}")
@@ -109,7 +109,7 @@ def test_commands_with_non_empty_list(
109
109
  # Run command
110
110
  argv = [cmd, "-i", "report.txt"]
111
111
  if cmd == "compress":
112
- argv += ["--cratio", "5", "--method", "jp2k"]
112
+ argv += ["--cratio", "5", "--method", "jp2k", "--layout", "sibling"]
113
113
  argv_runner(argv)
114
114
  out = capsys.readouterr().out
115
115
  assert msg_start in out
@@ -167,17 +167,67 @@ def test_empty_reports(argv_runner, monkeypatch, capsys, cmd, empty_msg, tmp_pat
167
167
  def test_check_success_writes_report(argv_runner, monkeypatch, capsys, tmp_path):
168
168
  monkeypatch.setattr(cli, "parse_report", lambda rpt: ["f"])
169
169
 
170
- def run(files, method, out):
170
+ def run(files, method, out, layout):
171
+ assert layout == "sibling"
171
172
  with open(out, "w") as f:
172
173
  f.write("ok")
173
174
 
174
175
  monkeypatch.setattr(cli, "run_ssim_check", run)
175
176
  report = tmp_path / "rpt.txt"
176
- argv_runner(["check", "-i", str(report), "--method", "jp2k"])
177
+ argv_runner(["check", "-i", str(report), "--method", "jp2k", "--layout", "sibling"])
177
178
  out = capsys.readouterr().out
178
179
  assert "SSIM report written to" in out
179
180
 
180
181
 
182
def test_compress_mirror_layout_creates_under_raw_data_compressed(
    argv_runner, monkeypatch, tmp_path
):
    """
    `compress --layout mirror` writes the compressed file under
    RAW_DATA_COMPRESSED with the source name and copies the surrounding
    non-compressed files (dataset base file, scan sidecar, sample sidecar).
    """
    sample_dir = tmp_path / "RAW_DATA" / "sampleA"
    dataset_dir = sample_dir / "ds1"
    raw_file = dataset_dir / "scan0001" / "f1.h5"

    # Source tree: one file to compress plus three files that are only copied.
    (dataset_dir / "scan0001").mkdir(parents=True)
    (dataset_dir / "scan0002").mkdir(parents=True)
    raw_file.write_text("data")
    (dataset_dir / "dataset.h5").write_text("base")
    (sample_dir / "sample_sidecar.h5").write_text("sidecar")
    (dataset_dir / "scan0002" / "meta.txt").write_text("meta")

    def fake_compress_file(self, inp, out, **kw):
        # Stand-in for the real compressor: just create an empty output file.
        with open(out, "w"):
            pass

    monkeypatch.setattr(cli, "parse_report", lambda rpt: [str(raw_file)])
    monkeypatch.setattr(JP2KCompressorWrapper, "compress_file", fake_compress_file)

    command = ["compress", "-i", "report.txt"]
    command += ["--cratio", "5"]
    command += ["--method", "jp2k"]
    command += ["--layout", "mirror"]
    argv_runner(command)

    mirrored_sample = tmp_path / "RAW_DATA_COMPRESSED" / "sampleA"
    expected_paths = [
        # The dataset base/filter file is mirrored under RAW_DATA_COMPRESSED.
        mirrored_sample / "ds1" / "dataset.h5",
        # Compressed file keeps the same source name under mirrored scan path.
        mirrored_sample / "ds1" / "scan0001" / "f1.h5",
        mirrored_sample / "ds1" / "scan0002" / "meta.txt",
        mirrored_sample / "sample_sidecar.h5",
    ]
    for expected in expected_paths:
        assert expected.exists()
229
+
230
+
181
231
  def test_overwrite_final_deletes_backups(argv_runner, monkeypatch, capsys, tmp_path):
182
232
  # Prepare a file and its backup
183
233
  (tmp_path / "f1.h5").write_text("current")
@@ -0,0 +1,36 @@
1
+ import pytest
2
+
3
+ from esrf_data_compressor.utils.paths import (
4
+ find_dataset_base_h5,
5
+ resolve_compressed_path,
6
+ resolve_mirror_path,
7
+ )
8
+
9
+
10
def test_resolve_compressed_path_sibling():
    """Sibling layout keeps the folder and appends `_<method>` to the stem."""
    source = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"
    expected = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1_jp2k.h5"
    assert resolve_compressed_path(source, "jp2k", layout="sibling") == expected
14
+
15
+
16
def test_resolve_compressed_path_mirror():
    """Mirror layout swaps RAW_DATA for RAW_DATA_COMPRESSED and keeps the name."""
    source = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"
    expected = "/data/visitor/e/bl/s/RAW_DATA_COMPRESSED/sample/ds/f1.h5"
    assert resolve_compressed_path(source, "jp2k", layout="mirror") == expected
20
+
21
+
22
def test_resolve_mirror_path_requires_raw_data():
    """A path without a RAW_DATA component cannot be mirrored."""
    path_without_root = "/tmp/no_raw_data_here/f1.h5"
    with pytest.raises(ValueError):
        resolve_mirror_path(path_without_root)
25
+
26
+
27
def test_find_dataset_base_h5(tmp_path):
    """The lone .h5 beside a scanNNNN folder is reported as the dataset base."""
    dataset_dir = tmp_path / "RAW_DATA" / "sample" / "ds1"
    scan_dir = dataset_dir / "scan0001"
    scan_dir.mkdir(parents=True)

    expected_base = dataset_dir / "dataset.h5"
    frames = scan_dir / "frames.h5"
    expected_base.write_text("base")
    frames.write_text("source")

    assert find_dataset_base_h5(str(frames)) == str(expected_base)
@@ -105,3 +105,21 @@ def test_ssim_error_handling(tmp_path, monkeypatch):
105
105
  # should include an ERROR line mentioning the exception message
106
106
  assert any("ERROR processing file pair" in line for line in lines)
107
107
  assert any("Error" in line for line in lines)
108
+
109
+
110
def test_mirror_layout_finds_compressed_file(tmp_path, monkeypatch):
    """With layout="mirror" the report names the file under RAW_DATA_COMPRESSED."""
    relative = ("sample", "ds", "d3.h5")
    raw = tmp_path.joinpath("RAW_DATA", *relative)
    comp = tmp_path.joinpath("RAW_DATA_COMPRESSED", *relative)
    for path, content in ((raw, "r3"), (comp, "c3")):
        path.parent.mkdir(parents=True)
        path.write_text(content)
    report = tmp_path / "report.txt"

    def fake_ssim(o, c):
        # Canned result so that no real HDF5 content is needed.
        return ("d3", ["ok"])

    monkeypatch.setattr(rs, "compute_ssim_for_file_pair", fake_ssim)

    rs.run_ssim_check(
        [str(raw)], method="method", report_path=str(report), layout="mirror"
    )

    lines = _read_report(report)
    assert lines[2] == f"Compressed file: {comp}"
@@ -0,0 +1,81 @@
1
+ import os
2
+ from pathlib import Path
3
+ import re
4
+
5
+
6
def resolve_mirror_path(
    input_path: str,
    *,
    source_root: str = "RAW_DATA",
    target_root: str = "RAW_DATA_COMPRESSED",
) -> str:
    """
    Return `input_path` with its first `source_root` component replaced by
    `target_root`; all other path components are kept unchanged.

    Raises ValueError when no component of `input_path` equals `source_root`.
    """
    segments = list(Path(input_path).parts)
    # Position of the first component that is exactly `source_root`, if any.
    root_pos = next(
        (pos for pos, segment in enumerate(segments) if segment == source_root),
        None,
    )
    if root_pos is None:
        raise ValueError(
            f"Cannot mirror path '{input_path}': missing '{source_root}' segment."
        )
    segments[root_pos] = target_root
    return str(Path(*segments))
23
+
24
+
25
def resolve_compressed_path(
    input_path: str,
    method: str,
    *,
    layout: str = "sibling",
    source_root: str = "RAW_DATA",
    target_root: str = "RAW_DATA_COMPRESSED",
) -> str:
    """
    Return the path of the compressed counterpart of `input_path`.

    - layout "sibling": `<stem>_<method>.h5` in the same folder as the source.
    - layout "mirror": the result of `resolve_mirror_path`, i.e. the same file
      name under `target_root` instead of `source_root` (`method` is not used).

    Raises ValueError for any other `layout`.
    """
    if layout == "mirror":
        # In mirror mode, compressed files keep the same file name as source.
        return resolve_mirror_path(
            input_path, source_root=source_root, target_root=target_root
        )
    if layout != "sibling":
        raise ValueError(f"Unsupported layout: {layout}")

    folder, file_name = os.path.split(input_path)
    stem, _extension = os.path.splitext(file_name)
    return os.path.join(folder, f"{stem}_{method}.h5")
43
+
44
+
45
+ def find_dataset_base_h5(
46
+ input_path: str,
47
+ *,
48
+ source_root: str = "RAW_DATA",
49
+ ) -> str | None:
50
+ """
51
+ Walk up from `input_path` to find the dataset directory that contains:
52
+ - exactly one .h5 file (the base/filter file)
53
+ - at least one scanXXXX subdirectory
54
+ Returns the absolute path to that .h5, or None when not found.
55
+ """
56
+ scan_re = re.compile(r"^scan\d{4}$", re.IGNORECASE)
57
+ p = Path(input_path).resolve()
58
+ parts = p.parts
59
+ if source_root not in parts:
60
+ return None
61
+
62
+ root_idx = parts.index(source_root)
63
+ cur = p.parent
64
+ while True:
65
+ if len(cur.parts) < root_idx + 1:
66
+ return None
67
+
68
+ try:
69
+ entries = list(cur.iterdir())
70
+ except OSError:
71
+ entries = []
72
+
73
+ h5_files = [e for e in entries if e.is_file() and e.suffix.lower() == ".h5"]
74
+ has_scan = any(e.is_dir() and scan_re.match(e.name) for e in entries)
75
+
76
+ if has_scan and len(h5_files) == 1:
77
+ return str(h5_files[0])
78
+
79
+ if len(cur.parts) == root_idx + 1:
80
+ return None
81
+ cur = cur.parent
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: esrf-data-compressor
3
- Version: 0.1.2
3
+ Version: 0.2.0
4
4
  Summary: A library to compress ESRF data and reduce their footprint
5
5
  Author-email: ESRF <dau-pydev@esrf.fr>
6
6
  License: MIT License
@@ -84,7 +84,9 @@ Dynamic: license-file
84
84
 
85
85
  * **Non-destructive workflow**
86
86
 
87
- 1. `compress` writes a sibling file `<basename>_<compression_method>.h5` next to each original
87
+ 1. `compress` writes compressed files either:
88
+ - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
89
+ - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
88
90
  2. `check` computes SSIM (first and last frames) and writes a report
89
91
  3. `overwrite` (optional) swaps out the raw frame file (irreversible)
90
92
 
@@ -20,8 +20,10 @@ src/esrf_data_compressor/tests/test_cli.py
20
20
  src/esrf_data_compressor/tests/test_finder.py
21
21
  src/esrf_data_compressor/tests/test_hdf5_helpers.py
22
22
  src/esrf_data_compressor/tests/test_jp2k.py
23
+ src/esrf_data_compressor/tests/test_paths.py
23
24
  src/esrf_data_compressor/tests/test_run_check.py
24
25
  src/esrf_data_compressor/tests/test_ssim.py
25
26
  src/esrf_data_compressor/tests/test_utils.py
26
27
  src/esrf_data_compressor/utils/hdf5_helpers.py
28
+ src/esrf_data_compressor/utils/paths.py
27
29
  src/esrf_data_compressor/utils/utils.py