esrf-data-compressor 0.1.2__tar.gz → 0.2.1__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (31)
  1. {esrf_data_compressor-0.1.2/src/esrf_data_compressor.egg-info → esrf_data_compressor-0.2.1}/PKG-INFO +6 -4
  2. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/README.md +6 -4
  3. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/pyproject.toml +2 -2
  4. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/checker/run_check.py +7 -6
  5. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/cli.py +15 -3
  6. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/base.py +96 -17
  7. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/jp2k.py +3 -3
  8. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_cli.py +57 -3
  9. esrf_data_compressor-0.2.1/src/esrf_data_compressor/tests/test_paths.py +36 -0
  10. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_run_check.py +18 -0
  11. esrf_data_compressor-0.2.1/src/esrf_data_compressor/utils/paths.py +129 -0
  12. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1/src/esrf_data_compressor.egg-info}/PKG-INFO +6 -4
  13. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/SOURCES.txt +2 -0
  14. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/LICENSE +0 -0
  15. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/setup.cfg +0 -0
  16. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/__init__.py +0 -0
  17. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/checker/ssim.py +0 -0
  18. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/__init__.py +0 -0
  19. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/finder/finder.py +0 -0
  20. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/__init__.py +0 -0
  21. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_finder.py +0 -0
  22. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_hdf5_helpers.py +0 -0
  23. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_jp2k.py +0 -0
  24. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_ssim.py +0 -0
  25. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_utils.py +0 -0
  26. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/utils/hdf5_helpers.py +0 -0
  27. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/utils/utils.py +0 -0
  28. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/dependency_links.txt +0 -0
  29. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/entry_points.txt +0 -0
  30. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/requires.txt +0 -0
  31. {esrf_data_compressor-0.1.2 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/top_level.txt +0 -0
--- esrf_data_compressor-0.1.2/src/esrf_data_compressor.egg-info/PKG-INFO
+++ esrf_data_compressor-0.2.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: esrf-data-compressor
-Version: 0.1.2
+Version: 0.2.1
 Summary: A library to compress ESRF data and reduce their footprint
 Author-email: ESRF <dau-pydev@esrf.fr>
 License: MIT License
@@ -79,12 +79,14 @@ Dynamic: license-file
 
 * **Parallel execution**
 
-  * Automatically factors CPU cores into worker processes × per-process threads
-  * By default, each worker runs up to 4 Blosc2 threads (or falls back to 1 thread if < 4 cores)
+  * Automatically factors CPU cores into worker processes × per-process threads
+  * By default, each worker runs up to 2 Blosc2 threads (or falls back to 1 thread if < 2 cores)
 
 * **Non-destructive workflow**
 
-  1. `compress` writes a sibling file `<basename>_<compression_method>.h5` next to each original
+  1. `compress` writes compressed files either:
+     - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
+     - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
  2. `check` computes SSIM (first and last frames) and writes a report
  3. `overwrite` (optional) swaps out the raw frame file (irreversible)
 
--- esrf_data_compressor-0.1.2/README.md
+++ esrf_data_compressor-0.2.1/README.md
@@ -18,12 +18,14 @@
 
 * **Parallel execution**
 
-  * Automatically factors CPU cores into worker processes × per-process threads
-  * By default, each worker runs up to 4 Blosc2 threads (or falls back to 1 thread if < 4 cores)
+  * Automatically factors CPU cores into worker processes × per-process threads
+  * By default, each worker runs up to 2 Blosc2 threads (or falls back to 1 thread if < 2 cores)
 
 * **Non-destructive workflow**
 
-  1. `compress` writes a sibling file `<basename>_<compression_method>.h5` next to each original
+  1. `compress` writes compressed files either:
+     - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
+     - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
  2. `check` computes SSIM (first and last frames) and writes a report
  3. `overwrite` (optional) swaps out the raw frame file (irreversible)
 
@@ -119,4 +121,4 @@ All noteworthy changes are recorded in [CHANGELOG.md](CHANGELOG.md). Version 0.1
 * Four-command CLI (`compress-hdf5 list`, `compress-hdf5 compress`, `compress-hdf5 check`, `compress-hdf5 overwrite`).
 * Parallelism with worker×thread auto-factoring.
 
-For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).
+For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).
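
For reference, the mirror layout swaps the `RAW_DATA` path segment for `RAW_DATA_COMPRESSED` while keeping the file name, e.g. (illustrative ESRF-style path, matching the new tests further below):

    RAW_DATA/sampleA/ds1/scan0001/f1.h5  →  RAW_DATA_COMPRESSED/sampleA/ds1/scan0001/f1.h5

whereas the sibling layout would instead write `RAW_DATA/sampleA/ds1/scan0001/f1_jp2k.h5`.
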
--- esrf_data_compressor-0.1.2/pyproject.toml
+++ esrf_data_compressor-0.2.1/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "esrf-data-compressor"
-version = "0.1.2"
+version = "0.2.1"
 authors = [{ name = "ESRF", email = "dau-pydev@esrf.fr" }]
 description = "A library to compress ESRF data and reduce their footprint"
 readme = { file = "README.md", content-type = "text/markdown" }
@@ -69,4 +69,4 @@ omit = ["*/tests/*"]
 
 [tool.isort]
 profile = "black"
-force_single_line = true
+force_single_line = true
--- esrf_data_compressor-0.1.2/src/esrf_data_compressor/checker/run_check.py
+++ esrf_data_compressor-0.2.1/src/esrf_data_compressor/checker/run_check.py
@@ -3,12 +3,15 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
 from tqdm import tqdm
 
 from esrf_data_compressor.checker.ssim import compute_ssim_for_file_pair
+from esrf_data_compressor.utils.paths import get_available_cpus, resolve_compressed_path
 
 
-def run_ssim_check(raw_files: list[str], method: str, report_path: str) -> None:
+def run_ssim_check(
+    raw_files: list[str], method: str, report_path: str, layout: str = "sibling"
+) -> None:
     """
     Given a list of raw HDF5 file paths, partitions into:
-      to_check → those with a sibling <stem>_<method>.h5
+      to_check → those with an expected compressed counterpart according to `layout`
      missing → those without one
 
     Writes a report to `report_path`:
@@ -21,9 +24,7 @@ def run_ssim_check(raw_files: list[str], method: str, report_path: str) -> None:
 
     # partition
     for orig in raw_files:
-        dirname, fname = os.path.dirname(orig), os.path.basename(orig)
-        stem, _ = os.path.splitext(fname)
-        comp_path = os.path.join(dirname, f"{stem}_{method}.h5")
+        comp_path = resolve_compressed_path(orig, method, layout=layout)
         if os.path.exists(comp_path):
             to_check.append((orig, comp_path))
         else:
@@ -45,7 +46,7 @@ def run_ssim_check(raw_files: list[str], method: str, report_path: str) -> None:
         return
 
     # run SSIM in parallel
-    n_workers = min(len(to_check), os.cpu_count() or 1)
+    n_workers = min(len(to_check), get_available_cpus())
     with ProcessPoolExecutor(max_workers=n_workers) as exe:
         futures = {
             exe.submit(compute_ssim_for_file_pair, orig, comp): (orig, comp)
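
For orientation, a minimal sketch of the new signature in use (paths are illustrative; the signature and the `layout="sibling"` default come from the hunk above):

    from esrf_data_compressor.checker.run_check import run_ssim_check

    run_ssim_check(
        ["/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"],  # raw HDF5 inputs
        method="jp2k",
        report_path="/tmp/ssim_report.txt",
        layout="mirror",  # look for counterparts under RAW_DATA_COMPRESSED
    )
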
--- esrf_data_compressor-0.1.2/src/esrf_data_compressor/cli.py
+++ esrf_data_compressor-0.2.1/src/esrf_data_compressor/cli.py
@@ -50,9 +50,9 @@ def do_compress(args):
         return
 
     print(
-        f"Compressing {len(files)} file(s) from '{report}' using '{args.method}' method and ratio {args.cratio} …"
+        f"Compressing {len(files)} file(s) from '{report}' using '{args.method}' method, ratio {args.cratio}, layout '{args.layout}' …"
     )
-    mgr = CompressorManager(cratio=args.cratio, method=args.method)
+    mgr = CompressorManager(cratio=args.cratio, method=args.method, layout=args.layout)
     mgr.compress_files(files)
     print("Compression complete.\n")
 
@@ -72,7 +72,7 @@ def do_check(args):
     report_path = os.path.abspath(report_fname)
 
     try:
-        run_ssim_check(files, args.method, report_path)
+        run_ssim_check(files, args.method, report_path, layout=args.layout)
     except SystemExit as e:
         exit_with_error(str(e))
 
@@ -142,6 +142,12 @@ def main():
         default="jp2k",
         help="Compression method",
     )
+    p.add_argument(
+        "--layout",
+        choices=["sibling", "mirror"],
+        default="mirror",
+        help="Output layout: sibling (next to each source) or mirror (under RAW_DATA_COMPRESSED, preserving source names).",
+    )
     p.set_defaults(func=do_compress)
 
     p = sub.add_parser("check", help="Generate SSIM report for TO COMPRESS files")
@@ -151,6 +157,12 @@ def main():
     p.add_argument(
         "--method", choices=["jp2k"], default="jp2k", help="Compression method"
     )
+    p.add_argument(
+        "--layout",
+        choices=["sibling", "mirror"],
+        default="mirror",
+        help="Location of compressed files to check.",
+    )
     p.set_defaults(func=do_check)
 
     p = sub.add_parser(
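
Both subcommands gain the same flag. With the entry point documented in the README, typical invocations would look like this (flags as defined in the hunks above; the report file is the one produced earlier in the four-command workflow):

    compress-hdf5 compress -i report.txt --cratio 10 --method jp2k --layout mirror
    compress-hdf5 check -i report.txt --method jp2k --layout mirror

Note that `--layout` defaults to `mirror`, so workflows that relied on the 0.1.2 sibling output now need an explicit `--layout sibling`.
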
--- esrf_data_compressor-0.1.2/src/esrf_data_compressor/compressors/base.py
+++ esrf_data_compressor-0.2.1/src/esrf_data_compressor/compressors/base.py
@@ -1,8 +1,15 @@
 import os
-from concurrent.futures import ProcessPoolExecutor, as_completed
+import shutil
+from pathlib import Path
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
 from tqdm import tqdm
 
 from esrf_data_compressor.compressors.jp2k import JP2KCompressorWrapper
+from esrf_data_compressor.utils.paths import (
+    get_available_cpus,
+    resolve_compressed_path,
+    resolve_mirror_path,
+)
 
 
 class Compressor:
@@ -18,11 +25,11 @@ class CompressorManager:
     """
     Manages parallel compression and overwrite.
 
-    Each worker process is given up to 4 Blosc2 threads (or fewer if the machine
+    Each worker process is given up to 2 Blosc2 threads (or fewer if the machine
     has fewer than 4 cores). The number of worker processes is then
     total_cores // threads_per_worker (at least 1). If the user explicitly
     passes `workers`, we cap it to `total_cores`, then recompute threads_per_worker
-    = min(4, total_cores // workers).
+    = min(2, total_cores // workers).
 
     Usage:
        mgr = CompressorManager(cratio=10, method='jp2k')
@@ -31,10 +38,14 @@ class CompressorManager:
     """
 
     def __init__(
-        self, workers: int | None = None, cratio: int = 10, method: str = "jp2k"
+        self,
+        workers: int | None = None,
+        cratio: int = 10,
+        method: str = "jp2k",
+        layout: str = "sibling",
     ):
-        total_cores = os.cpu_count() or 1
-        default_nthreads = 4 if total_cores >= 4 else 1
+        total_cores = get_available_cpus()
+        default_nthreads = 2 if total_cores >= 2 else 1
         default_workers = max(1, total_cores // default_nthreads)
 
         if workers is None:
@@ -43,12 +54,13 @@ class CompressorManager:
         else:
             w = min(workers, total_cores)
             possible = total_cores // w
-            nthreads = min(possible, 4) if possible >= 1 else 1
+            nthreads = min(possible, 2) if possible >= 1 else 1
 
         self.workers = max(1, w)
         self.nthreads = max(1, nthreads)
         self.cratio = cratio
         self.method = method
+        self.layout = layout
 
         if self.method == "jp2k":
             self.compressor = JP2KCompressorWrapper(
@@ -58,33 +70,98 @@ class CompressorManager:
             raise ValueError(f"Unsupported compression method: {self.method}")
 
         print(f"Compression method: {self.method}")
+        print(f"Output layout: {self.layout}")
         print(f"Total CPU cores: {total_cores}")
         print(f"Worker processes: {self.workers}")
         print(f"Threads per worker: {self.nthreads}")
         print(f"Total threads: {self.workers * self.nthreads}")
 
+    @staticmethod
+    def _find_raw_root(path: str) -> str | None:
+        p = Path(os.path.abspath(path))
+        parts = p.parts
+        if "RAW_DATA" not in parts:
+            return None
+        return str(Path(*parts[: parts.index("RAW_DATA") + 1]))
+
     def _compress_worker(self, ipath: str) -> tuple[str, str]:
         """
         Worker function for ProcessPoolExecutor: compress a single HDF5:
-          <ipath>.h5 → <same_dir>/<basename>_<method>.h5
+          - sibling layout: <same_dir>/<basename>_<method>.h5
+          - mirror layout: mirror RAW_DATA tree under RAW_DATA_COMPRESSED
         """
-        base, _ = os.path.splitext(ipath)
-        outp = f"{base}_{self.method}.h5"
+        outp = resolve_compressed_path(ipath, self.method, layout=self.layout)
+        os.makedirs(os.path.dirname(outp), exist_ok=True)
         self.compressor.compress_file(
             ipath, outp, cratio=self.cratio, nthreads=self.nthreads
         )
         return ipath, "success"
 
+    def _mirror_non_compressed_dataset_content(self, file_list: list[str]) -> None:
+        source_targets = {os.path.realpath(p) for p in file_list}
+        raw_roots: set[str] = set()
+        for ipath in file_list:
+            raw_root = self._find_raw_root(ipath)
+            if raw_root:
+                raw_roots.add(raw_root)
+
+        copy_tasks: list[tuple[str, str]] = []
+        for src_dir in sorted(raw_roots):
+            try:
+                dst_dir = resolve_mirror_path(src_dir)
+            except ValueError:
+                print(f"WARNING: Cannot mirror folder outside RAW_DATA: '{src_dir}'")
+                continue
+
+            for cur, dirs, files in os.walk(src_dir):
+                rel_cur = os.path.relpath(cur, src_dir)
+                target_cur = (
+                    dst_dir if rel_cur == "." else os.path.join(dst_dir, rel_cur)
+                )
+                os.makedirs(target_cur, exist_ok=True)
+
+                for dname in dirs:
+                    os.makedirs(os.path.join(target_cur, dname), exist_ok=True)
+
+                for fname in files:
+                    src_file = os.path.join(cur, fname)
+                    if os.path.realpath(src_file) in source_targets:
+                        # Do not copy raw files that will be produced by compression.
+                        continue
+                    dst_file = os.path.join(target_cur, fname)
+                    copy_tasks.append((src_file, dst_file))
+
+        if not copy_tasks:
+            return
+
+        max_workers = min(len(copy_tasks), max(1, get_available_cpus()), 8)
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = {
+                executor.submit(shutil.copy2, s, d): (s, d) for s, d in copy_tasks
+            }
+            for fut in as_completed(futures):
+                src_file, dst_file = futures[fut]
+                try:
+                    fut.result()
+                except Exception as e:
+                    print(f"WARNING: Failed to copy '{src_file}' → '{dst_file}': {e}")
+
     def compress_files(self, file_list: list[str]) -> None:
         """
-        Compress each .h5 in file_list in parallel, producing <basename>_<method>.h5
-        next to each source file. Does not overwrite originals. At the end, prints
-        total elapsed time and data rate in MB/s.
+        Compress each .h5 in file_list in parallel.
+          - sibling layout: produce <basename>_<method>.h5 next to each source.
+          - mirror layout: write compressed files to RAW_DATA_COMPRESSED with same file names.
+        Does not overwrite originals. At the end, prints total elapsed time and data rate in MB/s.
         """
         valid = [p for p in file_list if p.lower().endswith(".h5")]
         if not valid:
             print("No valid .h5 files to compress.")
             return
+        if self.layout == "mirror":
+            print(
+                "Preparing RAW_DATA_COMPRESSED with non-compressed dataset content..."
+            )
+            self._mirror_non_compressed_dataset_content(valid)
 
         total_bytes = 0
         for f in valid:
@@ -130,8 +207,9 @@ class CompressorManager:
             if not ipath.lower().endswith(".h5"):
                 continue
 
-            base, _ = os.path.splitext(ipath)
-            compressed_path = f"{base}_{self.method}.h5"
+            compressed_path = resolve_compressed_path(
+                ipath, self.method, layout=self.layout
+            )
 
             if os.path.exists(compressed_path):
                 backup = ipath + ".bak"
@@ -184,9 +262,10 @@ class CompressorManager:
             if not ipath.lower().endswith(".h5"):
                 continue
 
-            base, _ = os.path.splitext(ipath)
             backup = ipath + ".bak"
-            method_path = f"{base}_{self.method}.h5"
+            method_path = resolve_compressed_path(
+                ipath, self.method, layout=self.layout
+            )
 
             if not os.path.exists(backup):
                 print(f"SKIP (no backup): {ipath}")
--- esrf_data_compressor-0.1.2/src/esrf_data_compressor/compressors/jp2k.py
+++ esrf_data_compressor-0.2.1/src/esrf_data_compressor/compressors/jp2k.py
@@ -54,8 +54,7 @@ class JP2KCompressor:
         )
 
     def _compress_3d(self, name: str, src_dset: h5py.Dataset, dst_grp: h5py.Group):
-        data = src_dset[()]
-        Z, Y, X = data.shape
+        Z, Y, X = src_dset.shape
 
         dst_dset = dst_grp.create_dataset(
             name,
@@ -70,7 +69,8 @@ class JP2KCompressor:
         t0 = time.perf_counter()
 
         for z in range(Z):
-            plane = data[z, :, :]
+            # Read one slice at a time to reduce peak RAM usage.
+            plane = src_dset[z, :, :]
             t1 = time.perf_counter()
             b2im = blosc2.asarray(
                 plane[np.newaxis, ...],
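
The `_compress_3d` change above streams the volume instead of materialising it with `src_dset[()]`. A minimal sketch of the same pattern (hypothetical file and dataset names):

    import h5py

    with h5py.File("frames.h5", "r") as f:
        dset = f["/entry/data"]      # assumed 3-D dataset
        Z, Y, X = dset.shape
        for z in range(Z):
            plane = dset[z, :, :]    # h5py reads only this slice from disk
            # hand `plane` to the per-frame compressor here
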
--- esrf_data_compressor-0.1.2/src/esrf_data_compressor/tests/test_cli.py
+++ esrf_data_compressor-0.2.1/src/esrf_data_compressor/tests/test_cli.py
@@ -109,7 +109,7 @@ def test_commands_with_non_empty_list(
     # Run command
     argv = [cmd, "-i", "report.txt"]
     if cmd == "compress":
-        argv += ["--cratio", "5", "--method", "jp2k"]
+        argv += ["--cratio", "5", "--method", "jp2k", "--layout", "sibling"]
     argv_runner(argv)
     out = capsys.readouterr().out
     assert msg_start in out
@@ -167,17 +167,71 @@ def test_empty_reports(argv_runner, monkeypatch, capsys, cmd, empty_msg, tmp_path
 def test_check_success_writes_report(argv_runner, monkeypatch, capsys, tmp_path):
     monkeypatch.setattr(cli, "parse_report", lambda rpt: ["f"])
 
-    def run(files, method, out):
+    def run(files, method, out, layout):
+        assert layout == "sibling"
         with open(out, "w") as f:
             f.write("ok")
 
     monkeypatch.setattr(cli, "run_ssim_check", run)
     report = tmp_path / "rpt.txt"
-    argv_runner(["check", "-i", str(report), "--method", "jp2k"])
+    argv_runner(["check", "-i", str(report), "--method", "jp2k", "--layout", "sibling"])
     out = capsys.readouterr().out
     assert "SSIM report written to" in out
 
 
+def test_compress_mirror_layout_creates_under_raw_data_compressed(
+    argv_runner, monkeypatch, tmp_path
+):
+    ds = tmp_path / "RAW_DATA" / "sampleA" / "ds1"
+    src = ds / "scan0001" / "f1.h5"
+    src.parent.mkdir(parents=True)
+    src.write_text("data")
+    base = ds / "dataset.h5"
+    base.write_text("base")
+    sample_sidecar = tmp_path / "RAW_DATA" / "sampleA" / "sample_sidecar.h5"
+    sample_sidecar.write_text("sidecar")
+    other_sample_sidecar = tmp_path / "RAW_DATA" / "sampleB" / "other_sidecar.h5"
+    other_sample_sidecar.parent.mkdir(parents=True)
+    other_sample_sidecar.write_text("other")
+    side = ds / "scan0002" / "meta.txt"
+    side.parent.mkdir(parents=True)
+    side.write_text("meta")
+    monkeypatch.setattr(cli, "parse_report", lambda rpt: [str(src)])
+    monkeypatch.setattr(
+        JP2KCompressorWrapper,
+        "compress_file",
+        lambda self, inp, out, **kw: open(out, "w").close(),
+    )
+
+    argv_runner(
+        [
+            "compress",
+            "-i",
+            "report.txt",
+            "--cratio",
+            "5",
+            "--method",
+            "jp2k",
+            "--layout",
+            "mirror",
+        ]
+    )
+
+    # The dataset base/filter file is mirrored under RAW_DATA_COMPRESSED.
+    assert (
+        tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "dataset.h5"
+    ).exists()
+    # Compressed file keeps the same source name under mirrored scan path.
+    assert (
+        tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "scan0001" / "f1.h5"
+    ).exists()
+    assert (
+        tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "scan0002" / "meta.txt"
+    ).exists()
+    assert (tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "sample_sidecar.h5").exists()
+    assert (tmp_path / "RAW_DATA_COMPRESSED" / "sampleB" / "other_sidecar.h5").exists()
+
+
 def test_overwrite_final_deletes_backups(argv_runner, monkeypatch, capsys, tmp_path):
     # Prepare a file and its backup
     (tmp_path / "f1.h5").write_text("current")
--- /dev/null
+++ esrf_data_compressor-0.2.1/src/esrf_data_compressor/tests/test_paths.py
@@ -0,0 +1,36 @@
+import pytest
+
+from esrf_data_compressor.utils.paths import (
+    find_dataset_base_h5,
+    resolve_compressed_path,
+    resolve_mirror_path,
+)
+
+
+def test_resolve_compressed_path_sibling():
+    p = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"
+    out = resolve_compressed_path(p, "jp2k", layout="sibling")
+    assert out == "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1_jp2k.h5"
+
+
+def test_resolve_compressed_path_mirror():
+    p = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"
+    out = resolve_compressed_path(p, "jp2k", layout="mirror")
+    assert out == "/data/visitor/e/bl/s/RAW_DATA_COMPRESSED/sample/ds/f1.h5"
+
+
+def test_resolve_mirror_path_requires_raw_data():
+    with pytest.raises(ValueError):
+        resolve_mirror_path("/tmp/no_raw_data_here/f1.h5")
+
+
+def test_find_dataset_base_h5(tmp_path):
+    ds = tmp_path / "RAW_DATA" / "sample" / "ds1"
+    scan = ds / "scan0001"
+    scan.mkdir(parents=True)
+    base = ds / "dataset.h5"
+    base.write_text("base")
+    src = scan / "frames.h5"
+    src.write_text("source")
+
+    assert find_dataset_base_h5(str(src)) == str(base)
--- esrf_data_compressor-0.1.2/src/esrf_data_compressor/tests/test_run_check.py
+++ esrf_data_compressor-0.2.1/src/esrf_data_compressor/tests/test_run_check.py
@@ -105,3 +105,21 @@ def test_ssim_error_handling(tmp_path, monkeypatch):
     # should include an ERROR line mentioning the exception message
     assert any("ERROR processing file pair" in line for line in lines)
     assert any("Error" in line for line in lines)
+
+
+def test_mirror_layout_finds_compressed_file(tmp_path, monkeypatch):
+    raw = tmp_path / "RAW_DATA" / "sample" / "ds" / "d3.h5"
+    comp = tmp_path / "RAW_DATA_COMPRESSED" / "sample" / "ds" / "d3.h5"
+    raw.parent.mkdir(parents=True)
+    comp.parent.mkdir(parents=True)
+    raw.write_text("r3")
+    comp.write_text("c3")
+    report = tmp_path / "report.txt"
+
+    monkeypatch.setattr(rs, "compute_ssim_for_file_pair", lambda o, c: ("d3", ["ok"]))
+
+    rs.run_ssim_check(
+        [str(raw)], method="method", report_path=str(report), layout="mirror"
+    )
+    lines = _read_report(report)
+    assert lines[2] == f"Compressed file: {comp}"
--- /dev/null
+++ esrf_data_compressor-0.2.1/src/esrf_data_compressor/utils/paths.py
@@ -0,0 +1,129 @@
+import os
+from pathlib import Path
+import re
+
+
+def _parse_slurm_cpus_env() -> int | None:
+    """
+    Return CPU count from SLURM env vars if available.
+    """
+    candidates = [
+        ("SLURM_CPUS_PER_TASK", None),
+        ("SLURM_CPUS_ON_NODE", None),
+        ("SLURM_JOB_CPUS_PER_NODE", "1"),
+        ("SLURM_TASKS_PER_NODE", None),
+    ]
+    for key, fallback in candidates:
+        val = os.environ.get(key)
+        if not val:
+            continue
+        if key == "SLURM_JOB_CPUS_PER_NODE":
+            # Formats like "32(x2)" or "32,32" or "32"
+            val = val.split(",")[0]
+            if "(x" in val:
+                val = val.split("(x", 1)[0]
+        if key == "SLURM_TASKS_PER_NODE":
+            # Often like "1" or "2(x3)"
+            if "(x" in val:
+                val = val.split("(x", 1)[0]
+        try:
+            n = int(val)
+            if n > 0:
+                return n
+        except ValueError:
+            if fallback is not None:
+                try:
+                    n = int(fallback)
+                    if n > 0:
+                        return n
+                except ValueError:
+                    pass
+    return None
+
+
+def get_available_cpus() -> int:
+    """
+    Use SLURM-provided CPU count when available; otherwise fall back to os.cpu_count().
+    """
+    slurm = _parse_slurm_cpus_env()
+    if slurm is not None:
+        return slurm
+    return os.cpu_count() or 1
+
+
+def resolve_mirror_path(
+    input_path: str,
+    *,
+    source_root: str = "RAW_DATA",
+    target_root: str = "RAW_DATA_COMPRESSED",
+) -> str:
+    """
+    Build a mirrored path under `target_root` by replacing the `source_root`
+    segment in `input_path`.
+    """
+    parts = Path(input_path).parts
+    if source_root not in parts:
+        raise ValueError(
+            f"Cannot mirror path '{input_path}': missing '{source_root}' segment."
+        )
+    idx = parts.index(source_root)
+    return str(Path(*parts[:idx], target_root, *parts[idx + 1 :]))
+
+
+def resolve_compressed_path(
+    input_path: str,
+    method: str,
+    *,
+    layout: str = "sibling",
+    source_root: str = "RAW_DATA",
+    target_root: str = "RAW_DATA_COMPRESSED",
+) -> str:
+    if layout == "sibling":
+        base_name = os.path.splitext(os.path.basename(input_path))[0]
+        compressed_name = f"{base_name}_{method}.h5"
+        return os.path.join(os.path.dirname(input_path), compressed_name)
+    if layout == "mirror":
+        # In mirror mode, compressed files keep the same file name as source.
+        return resolve_mirror_path(
+            input_path, source_root=source_root, target_root=target_root
+        )
+    raise ValueError(f"Unsupported layout: {layout}")
+
+
+def find_dataset_base_h5(
+    input_path: str,
+    *,
+    source_root: str = "RAW_DATA",
+) -> str | None:
+    """
+    Walk up from `input_path` to find the dataset directory that contains:
+      - exactly one .h5 file (the base/filter file)
+      - at least one scanXXXX subdirectory
+    Returns the absolute path to that .h5, or None when not found.
+    """
+    scan_re = re.compile(r"^scan\d{4}$", re.IGNORECASE)
+    p = Path(input_path).resolve()
+    parts = p.parts
+    if source_root not in parts:
+        return None
+
+    root_idx = parts.index(source_root)
+    cur = p.parent
+    while True:
+        if len(cur.parts) < root_idx + 1:
+            return None
+
+        try:
+            entries = list(cur.iterdir())
+        except OSError:
+            entries = []
+
+        h5_files = [e for e in entries if e.is_file() and e.suffix.lower() == ".h5"]
+        has_scan = any(e.is_dir() and scan_re.match(e.name) for e in entries)
+
+        if has_scan and len(h5_files) == 1:
+            return str(h5_files[0])
+
+        if len(cur.parts) == root_idx + 1:
+            return None
+        cur = cur.parent
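
A quick sketch of the new helpers in action (paths are illustrative and mirror the expectations in test_paths.py above):

    from esrf_data_compressor.utils.paths import (
        get_available_cpus,
        resolve_compressed_path,
        resolve_mirror_path,
    )

    p = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"
    resolve_compressed_path(p, "jp2k", layout="sibling")
    # "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1_jp2k.h5"
    resolve_compressed_path(p, "jp2k", layout="mirror")
    # "/data/visitor/e/bl/s/RAW_DATA_COMPRESSED/sample/ds/f1.h5"
    resolve_mirror_path("/tmp/no_raw_data_here/f1.h5")  # raises ValueError
    get_available_cpus()  # SLURM_* env vars when set, else os.cpu_count()
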
--- esrf_data_compressor-0.1.2/PKG-INFO
+++ esrf_data_compressor-0.2.1/src/esrf_data_compressor.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: esrf-data-compressor
-Version: 0.1.2
+Version: 0.2.1
 Summary: A library to compress ESRF data and reduce their footprint
 Author-email: ESRF <dau-pydev@esrf.fr>
 License: MIT License
@@ -79,12 +79,14 @@ Dynamic: license-file
 
 * **Parallel execution**
 
-  * Automatically factors CPU cores into worker processes × per-process threads
-  * By default, each worker runs up to 4 Blosc2 threads (or falls back to 1 thread if < 4 cores)
+  * Automatically factors CPU cores into worker processes × per-process threads
+  * By default, each worker runs up to 2 Blosc2 threads (or falls back to 1 thread if < 2 cores)
 
 * **Non-destructive workflow**
 
-  1. `compress` writes a sibling file `<basename>_<compression_method>.h5` next to each original
+  1. `compress` writes compressed files either:
+     - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
+     - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
  2. `check` computes SSIM (first and last frames) and writes a report
  3. `overwrite` (optional) swaps out the raw frame file (irreversible)
 
--- esrf_data_compressor-0.1.2/src/esrf_data_compressor.egg-info/SOURCES.txt
+++ esrf_data_compressor-0.2.1/src/esrf_data_compressor.egg-info/SOURCES.txt
@@ -20,8 +20,10 @@ src/esrf_data_compressor/tests/test_cli.py
 src/esrf_data_compressor/tests/test_finder.py
 src/esrf_data_compressor/tests/test_hdf5_helpers.py
 src/esrf_data_compressor/tests/test_jp2k.py
+src/esrf_data_compressor/tests/test_paths.py
 src/esrf_data_compressor/tests/test_run_check.py
 src/esrf_data_compressor/tests/test_ssim.py
 src/esrf_data_compressor/tests/test_utils.py
 src/esrf_data_compressor/utils/hdf5_helpers.py
+src/esrf_data_compressor/utils/paths.py
 src/esrf_data_compressor/utils/utils.py