esrf-data-compressor 0.1.1__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {esrf_data_compressor-0.1.1/src/esrf_data_compressor.egg-info → esrf_data_compressor-0.2.0}/PKG-INFO +4 -2
  2. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/README.md +4 -2
  3. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/pyproject.toml +2 -2
  4. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/checker/run_check.py +6 -5
  5. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/cli.py +51 -20
  6. esrf_data_compressor-0.2.0/src/esrf_data_compressor/compressors/base.py +271 -0
  7. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_cli.py +93 -7
  8. esrf_data_compressor-0.2.0/src/esrf_data_compressor/tests/test_paths.py +36 -0
  9. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_run_check.py +18 -0
  10. esrf_data_compressor-0.2.0/src/esrf_data_compressor/utils/paths.py +81 -0
  11. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0/src/esrf_data_compressor.egg-info}/PKG-INFO +4 -2
  12. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/SOURCES.txt +2 -0
  13. esrf_data_compressor-0.1.1/src/esrf_data_compressor/compressors/base.py +0 -167
  14. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/LICENSE +0 -0
  15. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/setup.cfg +0 -0
  16. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/__init__.py +0 -0
  17. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/checker/ssim.py +0 -0
  18. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/compressors/__init__.py +0 -0
  19. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/compressors/jp2k.py +0 -0
  20. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/finder/finder.py +0 -0
  21. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/__init__.py +0 -0
  22. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_finder.py +0 -0
  23. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_hdf5_helpers.py +0 -0
  24. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_jp2k.py +0 -0
  25. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_ssim.py +0 -0
  26. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/tests/test_utils.py +0 -0
  27. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/utils/hdf5_helpers.py +0 -0
  28. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor/utils/utils.py +0 -0
  29. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/dependency_links.txt +0 -0
  30. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/entry_points.txt +0 -0
  31. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/requires.txt +0 -0
  32. {esrf_data_compressor-0.1.1 → esrf_data_compressor-0.2.0}/src/esrf_data_compressor.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: esrf-data-compressor
- Version: 0.1.1
+ Version: 0.2.0
  Summary: A library to compress ESRF data and reduce their footprint
  Author-email: ESRF <dau-pydev@esrf.fr>
  License: MIT License
@@ -84,7 +84,9 @@ Dynamic: license-file
  
  * **Non-destructive workflow**
  
- 1. `compress` writes a sibling file `<basename>_<compression_method>.h5` next to each original
+ 1. `compress` writes compressed files either:
+    - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
+    - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
  2. `check` computes SSIM (first and last frames) and writes a report
  3. `overwrite` (optional) swaps out the raw frame file (irreversible)
  
@@ -23,7 +23,9 @@
  
  * **Non-destructive workflow**
  
- 1. `compress` writes a sibling file `<basename>_<compression_method>.h5` next to each original
+ 1. `compress` writes compressed files either:
+    - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
+    - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
  2. `check` computes SSIM (first and last frames) and writes a report
  3. `overwrite` (optional) swaps out the raw frame file (irreversible)
  
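Editor's note: for a quick sense of the new naming scheme, here is a minimal sketch of where each layout places its output. It is illustrative only (the sample path is hypothetical); the package's actual helper is `resolve_compressed_path()` in the new `src/esrf_data_compressor/utils/paths.py` shown later in this diff.

```python
# Sketch of the 0.2.0 output-layout naming; not the package's own code.
from pathlib import Path

def sketch_output_path(src: str, method: str, layout: str) -> str:
    p = Path(src)
    if layout == "sibling":
        # <basename>_<method>.h5 next to the original
        return str(p.with_name(f"{p.stem}_{method}.h5"))
    # mirror: swap the RAW_DATA segment for RAW_DATA_COMPRESSED, keep the name
    parts = list(p.parts)
    parts[parts.index("RAW_DATA")] = "RAW_DATA_COMPRESSED"
    return str(Path(*parts))

src = "/data/visitor/exp/bl/sess/RAW_DATA/sample/ds/f1.h5"  # hypothetical path
print(sketch_output_path(src, "jp2k", "sibling"))
# /data/visitor/exp/bl/sess/RAW_DATA/sample/ds/f1_jp2k.h5
print(sketch_output_path(src, "jp2k", "mirror"))
# /data/visitor/exp/bl/sess/RAW_DATA_COMPRESSED/sample/ds/f1.h5
```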
@@ -119,4 +121,4 @@ All noteworthy changes are recorded in [CHANGELOG.md](CHANGELOG.md). Version 0.1
  * Four-command CLI (`compress-hdf5 list`, `compress-hdf5 compress`, `compress-hdf5 check`, `compress-hdf5 overwrite`).
  * Parallelism with worker×thread auto-factoring.
  
- For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).
+ For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
  
  [project]
  name = "esrf-data-compressor"
- version = "0.1.1"
+ version = "0.2.0"
  authors = [{ name = "ESRF", email = "dau-pydev@esrf.fr" }]
  description = "A library to compress ESRF data and reduce their footprint"
  readme = { file = "README.md", content-type = "text/markdown" }
@@ -69,4 +69,4 @@ omit = ["*/tests/*"]
  
  [tool.isort]
  profile = "black"
- force_single_line = true
+ force_single_line = true
@@ -3,12 +3,15 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
  from tqdm import tqdm
  
  from esrf_data_compressor.checker.ssim import compute_ssim_for_file_pair
+ from esrf_data_compressor.utils.paths import resolve_compressed_path
  
  
- def run_ssim_check(raw_files: list[str], method: str, report_path: str) -> None:
+ def run_ssim_check(
+     raw_files: list[str], method: str, report_path: str, layout: str = "sibling"
+ ) -> None:
      """
      Given a list of raw HDF5 file paths, partitions into:
-       to_check → those with a sibling <stem>_<method>.h5
+       to_check → those with an expected compressed counterpart according to `layout`
        missing → those without one
  
      Writes a report to `report_path`:
@@ -21,9 +24,7 @@ def run_ssim_check(raw_files: list[str], method: str, report_path: str) -> None:
  
      # partition
      for orig in raw_files:
-         dirname, fname = os.path.dirname(orig), os.path.basename(orig)
-         stem, _ = os.path.splitext(fname)
-         comp_path = os.path.join(dirname, f"{stem}_{method}.h5")
+         comp_path = resolve_compressed_path(orig, method, layout=layout)
          if os.path.exists(comp_path):
              to_check.append((orig, comp_path))
          else:
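Editor's note: the partition step this hunk rewrites boils down to the following sketch (a simplification of `run_ssim_check`'s loop, assuming `resolve_compressed_path` behaves as defined in `utils/paths.py` later in this diff):

```python
import os
from esrf_data_compressor.utils.paths import resolve_compressed_path

def partition(raw_files: list[str], method: str, layout: str):
    """Split raw files into (orig, comp) pairs to check and originals missing a counterpart."""
    to_check, missing = [], []
    for orig in raw_files:
        comp_path = resolve_compressed_path(orig, method, layout=layout)
        if os.path.exists(comp_path):
            to_check.append((orig, comp_path))
        else:
            missing.append(orig)
    return to_check, missing
```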
@@ -46,13 +46,13 @@ def do_compress(args):
          exit_with_error(f"Failed to read report '{report}': {e}")
  
      if not files:
-         print("Nothing to compress (TO COMPRESS list is empty).")
+         print("Nothing to compress (TO COMPRESS list is empty).")
          return
  
      print(
-         f"Compressing {len(files)} file(s) from '{report}' using '{args.method}' method and ratio {args.cratio} …"
+         f"Compressing {len(files)} file(s) from '{report}' using '{args.method}' method, ratio {args.cratio}, layout '{args.layout}' …"
      )
-     mgr = CompressorManager(cratio=args.cratio, method=args.method)
+     mgr = CompressorManager(cratio=args.cratio, method=args.method, layout=args.layout)
      mgr.compress_files(files)
      print("Compression complete.\n")
  
@@ -65,15 +65,14 @@ def do_check(args):
          exit_with_error(f"Failed to read report '{report}': {e}")
  
      if not files:
-         print("Nothing to check (TO COMPRESS list is empty).")
+         print("Nothing to check (TO COMPRESS list is empty).")
          return
  
-     # We reuse run_ssim_check in its 3-arg form (raw_files, method, report_path)
      report_fname = f"{os.path.splitext(report)[0]}_{args.method}_ssim_report.txt"
      report_path = os.path.abspath(report_fname)
  
      try:
-         run_ssim_check(files, args.method, report_path)
+         run_ssim_check(files, args.method, report_path, layout=args.layout)
      except SystemExit as e:
          exit_with_error(str(e))
  
@@ -81,9 +80,6 @@ def do_check(args):
  
  
  def do_overwrite(args):
-     """
-     Overwrite TO COMPRESS files with their original sources.
-     """
      report = args.input or "file_list.txt"
      try:
          files = parse_report(report)
@@ -91,13 +87,26 @@
          exit_with_error(f"Failed to read report '{report}': {e}")
  
      if not files:
-         print("Nothing to overwrite (TO COMPRESS list is empty).")
+         print("Nothing to process (TO COMPRESS list is empty).")
          return
  
-     print(f"Overwriting {len(files)} file(s) from '{report}' …")
      mgr = CompressorManager()
+ 
+     if args.final:
+         print(f"Finalizing overwrite for {len(files)} file(s) from '{report}' …")
+         mgr.remove_backups(files)
+         print("Finalize step complete.\n")
+         return
+ 
+     if args.undo:
+         print(f"Undoing overwrite for {len(files)} file(s) from '{report}' …")
+         mgr.restore_backups(files)
+         print("Undo step complete.\n")
+         return
+ 
+     print(f"Overwriting {len(files)} file(s) from '{report}' …")
      mgr.overwrite_files(files)
-     print("Overwrite complete.\n")
+     print("Overwrite complete (backups kept).\n")
  
  
  def main():
@@ -106,7 +115,6 @@ def main():
      )
      sub = parser.add_subparsers(dest="command", required=True)
  
-     # list
      p = sub.add_parser("list", help="Report VDS sources → TO COMPRESS vs REMAINING")
      p.add_argument("experiment", help="Experiment ID")
      p.add_argument("beamline", nargs="?", help="Optional beamline")
@@ -115,13 +123,12 @@ def main():
      p.add_argument(
          "--filter",
          metavar="KEY:VAL[,KEY2:VAL2...]",
-         help="Datasetlevel attribute substring filters",
+         help="Dataset-level attribute substring filters",
      )
      p.add_argument("--output", help="Report file (default = file_list.txt)")
      p.set_defaults(func=do_list)
  
-     # compress
-     p = sub.add_parser("compress", help="Compress only the TO COMPRESS files")
+     p = sub.add_parser("compress", help="Compress only the TO COMPRESS files")
      p.add_argument(
          "--input",
          "-i",
@@ -135,23 +142,47 @@
          default="jp2k",
          help="Compression method",
      )
+     p.add_argument(
+         "--layout",
+         choices=["sibling", "mirror"],
+         default="mirror",
+         help="Output layout: sibling (next to each source) or mirror (under RAW_DATA_COMPRESSED, preserving source names).",
+     )
      p.set_defaults(func=do_compress)
  
-     # check
-     p = sub.add_parser("check", help="Generate SSIM report for TO COMPRESS files")
+     p = sub.add_parser("check", help="Generate SSIM report for TO COMPRESS files")
      p.add_argument(
          "--input", "-i", help="Report file to read (default = file_list.txt)"
      )
      p.add_argument(
          "--method", choices=["jp2k"], default="jp2k", help="Compression method"
      )
+     p.add_argument(
+         "--layout",
+         choices=["sibling", "mirror"],
+         default="mirror",
+         help="Location of compressed files to check.",
+     )
      p.set_defaults(func=do_check)
  
-     # overwrite
-     p = sub.add_parser("overwrite", help="Overwrite only TO COMPRESS files")
+     p = sub.add_parser(
+         "overwrite",
+         help="Swap in compressed files and keep backups; with --final or --undo, perform cleanup/restore only.",
+     )
      p.add_argument(
          "--input", "-i", help="Report file to read (default = file_list.txt)"
      )
+     group = p.add_mutually_exclusive_group()
+     group.add_argument(
+         "--final",
+         action="store_true",
+         help="Cleanup only: delete existing *.h5.bak backups after confirmation (no overwrite).",
+     )
+     group.add_argument(
+         "--undo",
+         action="store_true",
+         help="Restore only: move <file>.h5.bak back to <file>.h5 and preserve the current file as <file>_<method>.h5 when needed.",
+     )
      p.set_defaults(func=do_overwrite)
  
      args = parser.parse_args()
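Editor's note: taken together, the `overwrite` subcommand now has a three-phase lifecycle instead of a single irreversible swap. A minimal sketch of the intended call sequence, using the `CompressorManager` methods added in this release (the report content is hypothetical, and phases 2a/2b are alternatives, not a sequence):

```python
from esrf_data_compressor.compressors.base import CompressorManager

files = ["/path/to/RAW_DATA/sample/ds/scan0001/f1.h5"]  # hypothetical report content
mgr = CompressorManager()

# Phase 1 (`overwrite`): swap compressed files in, keeping <file>.h5.bak backups.
mgr.overwrite_files(files)

# Phase 2a (`overwrite --undo`): put the backups back, preserving the current
# file as <file>_<method>.h5 when it would otherwise be lost.
mgr.restore_backups(files)

# Phase 2b (`overwrite --final`): once satisfied, delete the .h5.bak backups
# (asks for interactive confirmation before removing anything).
mgr.remove_backups(files)
```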
@@ -0,0 +1,271 @@
+ import os
+ import shutil
+ from concurrent.futures import ProcessPoolExecutor, as_completed
+ from tqdm import tqdm
+
+ from esrf_data_compressor.compressors.jp2k import JP2KCompressorWrapper
+ from esrf_data_compressor.utils.paths import (
+     find_dataset_base_h5,
+     resolve_compressed_path,
+     resolve_mirror_path,
+ )
+
+
+ class Compressor:
+     """
+     Abstract base class. Subclasses must implement compress_file().
+     """
+
+     def compress_file(self, input_path: str, output_path: str, **kwargs):
+         raise NotImplementedError
+
+
+ class CompressorManager:
+     """
+     Manages parallel compression and overwrite.
+
+     Each worker process is given up to 4 Blosc2 threads (or fewer if the machine
+     has fewer than 4 cores). The number of worker processes is then
+     total_cores // threads_per_worker (at least 1). If the user explicitly
+     passes `workers`, we cap it to `total_cores`, then recompute threads_per_worker
+     = min(4, total_cores // workers).
+
+     Usage:
+         mgr = CompressorManager(cratio=10, method='jp2k')
+         mgr.compress_files([...])
+         mgr.overwrite_files([...])
+     """
+
+     def __init__(
+         self,
+         workers: int | None = None,
+         cratio: int = 10,
+         method: str = "jp2k",
+         layout: str = "sibling",
+     ):
+         total_cores = os.cpu_count() or 1
+         default_nthreads = 4 if total_cores >= 4 else 1
+         default_workers = max(1, total_cores // default_nthreads)
+
+         if workers is None:
+             w = default_workers
+             nthreads = default_nthreads
+         else:
+             w = min(workers, total_cores)
+             possible = total_cores // w
+             nthreads = min(possible, 4) if possible >= 1 else 1
+
+         self.workers = max(1, w)
+         self.nthreads = max(1, nthreads)
+         self.cratio = cratio
+         self.method = method
+         self.layout = layout
+
+         if self.method == "jp2k":
+             self.compressor = JP2KCompressorWrapper(
+                 cratio=cratio, nthreads=self.nthreads
+             )
+         else:
+             raise ValueError(f"Unsupported compression method: {self.method}")
+
+         print(f"Compression method: {self.method}")
+         print(f"Output layout: {self.layout}")
+         print(f"Total CPU cores: {total_cores}")
+         print(f"Worker processes: {self.workers}")
+         print(f"Threads per worker: {self.nthreads}")
+         print(f"Total threads: {self.workers * self.nthreads}")
+
+     def _compress_worker(self, ipath: str) -> tuple[str, str]:
+         """
+         Worker function for ProcessPoolExecutor: compress a single HDF5:
+           - sibling layout: <same_dir>/<basename>_<method>.h5
+           - mirror layout: mirror RAW_DATA tree under RAW_DATA_COMPRESSED
+         """
+         outp = resolve_compressed_path(ipath, self.method, layout=self.layout)
+         os.makedirs(os.path.dirname(outp), exist_ok=True)
+         self.compressor.compress_file(
+             ipath, outp, cratio=self.cratio, nthreads=self.nthreads
+         )
+         return ipath, "success"
+
+     def _mirror_non_compressed_dataset_content(self, file_list: list[str]) -> None:
+         source_targets = {os.path.realpath(p) for p in file_list}
+         mirror_roots: set[str] = set()
+         for ipath in file_list:
+             base_h5 = find_dataset_base_h5(ipath)
+             dataset_dir = (
+                 os.path.dirname(base_h5) if base_h5 else os.path.dirname(ipath)
+             )
+             # Mirror the parent sample folder too, so sidecar files next to
+             # dataset folders are preserved (e.g. RAW_DATA/<sample>/*.h5).
+             mirror_roots.add(os.path.dirname(dataset_dir))
+
+         for src_dir in sorted(mirror_roots):
+             try:
+                 dst_dir = resolve_mirror_path(src_dir)
+             except ValueError:
+                 print(f"WARNING: Cannot mirror folder outside RAW_DATA: '{src_dir}'")
+                 continue
+
+             for cur, dirs, files in os.walk(src_dir):
+                 rel_cur = os.path.relpath(cur, src_dir)
+                 target_cur = (
+                     dst_dir if rel_cur == "." else os.path.join(dst_dir, rel_cur)
+                 )
+                 os.makedirs(target_cur, exist_ok=True)
+
+                 for dname in dirs:
+                     os.makedirs(os.path.join(target_cur, dname), exist_ok=True)
+
+                 for fname in files:
+                     src_file = os.path.join(cur, fname)
+                     if os.path.realpath(src_file) in source_targets:
+                         # Do not copy raw files that will be produced by compression.
+                         continue
+                     dst_file = os.path.join(target_cur, fname)
+                     shutil.copy2(src_file, dst_file)
+
+     def compress_files(self, file_list: list[str]) -> None:
+         """
+         Compress each .h5 in file_list in parallel.
+           - sibling layout: produce <basename>_<method>.h5 next to each source.
+           - mirror layout: write compressed files to RAW_DATA_COMPRESSED with same file names.
+         Does not overwrite originals. At the end, prints total elapsed time and data rate in MB/s.
+         """
+         valid = [p for p in file_list if p.lower().endswith(".h5")]
+         if not valid:
+             print("No valid .h5 files to compress.")
+             return
+         if self.layout == "mirror":
+             print(
+                 "Preparing RAW_DATA_COMPRESSED with non-compressed dataset content..."
+             )
+             self._mirror_non_compressed_dataset_content(valid)
+
+         total_bytes = 0
+         for f in valid:
+             try:
+                 total_bytes += os.path.getsize(f)
+             except OSError:
+                 pass
+
+         import time
+
+         t0 = time.time()
+
+         with ProcessPoolExecutor(max_workers=self.workers) as executor:
+             futures = {executor.submit(self._compress_worker, p): p for p in valid}
+             for fut in tqdm(
+                 as_completed(futures),
+                 total=len(futures),
+                 desc=f"Compressing HDF5 files ({self.method})",
+                 unit="file",
+             ):
+                 pth = futures[fut]
+                 try:
+                     fut.result()
+                 except Exception as e:
+                     print(f"Failed to compress '{pth}': {e}")
+
+         elapsed = time.time() - t0
+         total_mb = total_bytes / (1024 * 1024)
+         rate_mb_s = total_mb / elapsed if elapsed > 0 else float("inf")
+         print(f"\nTotal elapsed time: {elapsed:.3f}s")
+         print(f"Data processed: {total_mb:.2f} MB ({rate_mb_s:.2f} MB/s)\n")
+
+     def overwrite_files(self, file_list: list[str]) -> None:
+         """
+         Overwrites files only if they have a compressed sibling:
+
+         1) Rename <file>.h5 → <file>.h5.bak
+         2) Rename <file>_<method>.h5 → <file>.h5
+
+         Backups (.h5.bak) are kept; use remove_backups() or restore_backups() afterwards.
+         """
+         for ipath in file_list:
+             if not ipath.lower().endswith(".h5"):
+                 continue
+
+             compressed_path = resolve_compressed_path(
+                 ipath, self.method, layout=self.layout
+             )
+
+             if os.path.exists(compressed_path):
+                 backup = ipath + ".bak"
+                 try:
+                     os.replace(ipath, backup)
+                     os.replace(compressed_path, ipath)
+                     print(f"Overwritten '{ipath}' (backup at '{backup}').")
+                 except Exception as e:
+                     print(f"ERROR overwriting '{ipath}': {e}")
+             else:
+                 print(f"SKIP (no compressed file): {ipath}")
+
+     def remove_backups(self, file_list: list[str]) -> None:
+         candidates = {p + ".bak" for p in file_list if p.lower().endswith(".h5")}
+         backups = [b for b in candidates if os.path.exists(b)]
+         if not backups:
+             print("No backup files to remove.")
+             return
+
+         total_bytes = 0
+         for b in backups:
+             try:
+                 total_bytes += os.path.getsize(b)
+             except OSError:
+                 pass
+         total_mb = total_bytes / (1024 * 1024)
+
+         print(
+             f"About to remove {len(backups)} backup file(s), ~{total_mb:.2f} MB total."
+         )
+         ans = input("Proceed? [y/N]: ").strip().lower()
+         if ans not in ("y", "yes"):
+             print("Backups kept.")
+             return
+
+         removed = 0
+         for b in backups:
+             try:
+                 os.remove(b)
+                 removed += 1
+             except Exception as e:
+                 print(f"ERROR deleting backup '{b}': {e}")
+
+         print(f"Deleted {removed} backup file(s).")
+
+     def restore_backups(self, file_list: list[str]) -> None:
+         restored = 0
+         preserved = 0
+         for ipath in file_list:
+             if not ipath.lower().endswith(".h5"):
+                 continue
+
+             backup = ipath + ".bak"
+             method_path = resolve_compressed_path(
+                 ipath, self.method, layout=self.layout
+             )
+
+             if not os.path.exists(backup):
+                 print(f"SKIP (no backup): {ipath}")
+                 continue
+
+             if os.path.exists(ipath) and not os.path.exists(method_path):
+                 try:
+                     os.replace(ipath, method_path)
+                     preserved += 1
+                     print(f"Preserved current file to '{method_path}'.")
+                 except Exception as e:
+                     print(f"ERROR preserving current '{ipath}' to '{method_path}': {e}")
+                     continue
+
+             try:
+                 os.replace(backup, ipath)
+                 restored += 1
+                 print(f"Restored '{ipath}' from backup.")
+             except Exception as e:
+                 print(f"ERROR restoring '{ipath}' from '{backup}': {e}")
+
+         print(
+             f"Restore complete. Restored: {restored}, preserved compressed copies: {preserved}."
+         )
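Editor's note: the worker×thread auto-factoring described in the docstring above is plain integer arithmetic. A standalone sketch that mirrors (but does not import) the constructor logic, with a few worked values:

```python
def factor_workers(total_cores: int, workers: int | None = None) -> tuple[int, int]:
    """Return (workers, threads_per_worker) so that workers * threads <= total_cores."""
    default_nthreads = 4 if total_cores >= 4 else 1
    if workers is None:
        w = max(1, total_cores // default_nthreads)
        nthreads = default_nthreads
    else:
        w = min(workers, total_cores)  # cap to available cores
        possible = total_cores // w
        nthreads = min(possible, 4) if possible >= 1 else 1
    return max(1, w), max(1, nthreads)

assert factor_workers(16) == (4, 4)           # 16 cores -> 4 workers x 4 threads
assert factor_workers(16, workers=8) == (8, 2)  # explicit workers -> threads recomputed
assert factor_workers(2) == (2, 1)            # fewer than 4 cores -> 1 thread each
```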
@@ -109,7 +109,7 @@ def test_commands_with_non_empty_list(
      # Run command
      argv = [cmd, "-i", "report.txt"]
      if cmd == "compress":
-         argv += ["--cratio", "5", "--method", "jp2k"]
+         argv += ["--cratio", "5", "--method", "jp2k", "--layout", "sibling"]
      argv_runner(argv)
      out = capsys.readouterr().out
      assert msg_start in out
@@ -119,12 +119,14 @@ def test_commands_with_non_empty_list(
      for f in files:
          comp = tmp_path / f.replace(".h5", "_jp2k.h5")
          assert comp.exists()
-     # For overwrite, verify original replaced and backup removed
+     # For overwrite, verify original replaced and backup KEPT
      if cmd == "overwrite":
          # f1 was overwritten, f2 was skipped
          assert (tmp_path / "f1.h5").exists()
-         # no backup remains
-         assert not (tmp_path / "f1.h5.bak").exists()
+         # backup remains by default
+         assert (tmp_path / "f1.h5.bak").exists()
+         # f2 had no compressed sibling → no backup
+         assert not (tmp_path / "f2.h5.bak").exists()
  
  
  def test_list_success_and_output_file(argv_runner, monkeypatch, capsys, tmp_path):
  def test_list_success_and_output_file(argv_runner, monkeypatch, capsys, tmp_path):
@@ -150,7 +152,7 @@ def test_list_success_and_output_file(argv_runner, monkeypatch, capsys, tmp_path
150
152
  [
151
153
  ("compress", "Nothing to compress"),
152
154
  ("check", "Nothing to check"),
153
- ("overwrite", "Nothing to overwrite"),
155
+ ("overwrite", "Nothing to process"),
154
156
  ],
155
157
  )
156
158
  def test_empty_reports(argv_runner, monkeypatch, capsys, cmd, empty_msg, tmp_path):
@@ -165,12 +167,96 @@ def test_empty_reports(argv_runner, monkeypatch, capsys, cmd, empty_msg, tmp_pat
  
  def test_check_success_writes_report(argv_runner, monkeypatch, capsys, tmp_path):
      monkeypatch.setattr(cli, "parse_report", lambda rpt: ["f"])
  
-     def run(files, method, out):
+     def run(files, method, out, layout):
+         assert layout == "sibling"
          with open(out, "w") as f:
              f.write("ok")
  
      monkeypatch.setattr(cli, "run_ssim_check", run)
      report = tmp_path / "rpt.txt"
-     argv_runner(["check", "-i", str(report), "--method", "jp2k"])
+     argv_runner(["check", "-i", str(report), "--method", "jp2k", "--layout", "sibling"])
      out = capsys.readouterr().out
      assert "SSIM report written to" in out
+
+
+ def test_compress_mirror_layout_creates_under_raw_data_compressed(
+     argv_runner, monkeypatch, tmp_path
+ ):
+     ds = tmp_path / "RAW_DATA" / "sampleA" / "ds1"
+     src = ds / "scan0001" / "f1.h5"
+     src.parent.mkdir(parents=True)
+     src.write_text("data")
+     base = ds / "dataset.h5"
+     base.write_text("base")
+     sample_sidecar = tmp_path / "RAW_DATA" / "sampleA" / "sample_sidecar.h5"
+     sample_sidecar.write_text("sidecar")
+     side = ds / "scan0002" / "meta.txt"
+     side.parent.mkdir(parents=True)
+     side.write_text("meta")
+     monkeypatch.setattr(cli, "parse_report", lambda rpt: [str(src)])
+     monkeypatch.setattr(
+         JP2KCompressorWrapper,
+         "compress_file",
+         lambda self, inp, out, **kw: open(out, "w").close(),
+     )
+
+     argv_runner(
+         [
+             "compress",
+             "-i",
+             "report.txt",
+             "--cratio",
+             "5",
+             "--method",
+             "jp2k",
+             "--layout",
+             "mirror",
+         ]
+     )
+
+     # The dataset base/filter file is mirrored under RAW_DATA_COMPRESSED.
+     assert (
+         tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "dataset.h5"
+     ).exists()
+     # Compressed file keeps the same source name under mirrored scan path.
+     assert (
+         tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "scan0001" / "f1.h5"
+     ).exists()
+     assert (
+         tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "scan0002" / "meta.txt"
+     ).exists()
+     assert (tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "sample_sidecar.h5").exists()
+
+
+ def test_overwrite_final_deletes_backups(argv_runner, monkeypatch, capsys, tmp_path):
+     # Prepare a file and its backup
+     (tmp_path / "f1.h5").write_text("current")
+     (tmp_path / "f1.h5.bak").write_text("backup")
+     # parse_report returns the original .h5 path(s)
+     monkeypatch.setattr(cli, "parse_report", lambda rpt: [str(tmp_path / "f1.h5")])
+     # auto-confirm deletion
+     monkeypatch.setattr("builtins.input", lambda *a, **k: "y")
+
+     argv_runner(["overwrite", "-i", "report.txt", "--final"])
+     out = capsys.readouterr().out
+     assert "About to remove" in out
+     assert not (tmp_path / "f1.h5.bak").exists()
+
+
+ def test_overwrite_undo_restores_and_preserves(
+     argv_runner, monkeypatch, capsys, tmp_path
+ ):
+     # Start with current file and a backup; no <method> file yet
+     (tmp_path / "f1.h5").write_text("CUR")
+     (tmp_path / "f1.h5.bak").write_text("BAK")
+     monkeypatch.setattr(cli, "parse_report", lambda rpt: [str(tmp_path / "f1.h5")])
+
+     argv_runner(["overwrite", "-i", "report.txt", "--undo"])
+     out = capsys.readouterr().out
+     assert "Undoing overwrite" in out
+     # Backup should have been restored to f1.h5
+     assert (tmp_path / "f1.h5").read_text() == "BAK"
+     # Previous current should have been preserved as f1_jp2k.h5
+     assert (tmp_path / "f1_jp2k.h5").read_text() == "CUR"
+     # .bak should be gone after restore (moved)
+     assert not (tmp_path / "f1.h5.bak").exists()
@@ -0,0 +1,36 @@
+ import pytest
+
+ from esrf_data_compressor.utils.paths import (
+     find_dataset_base_h5,
+     resolve_compressed_path,
+     resolve_mirror_path,
+ )
+
+
+ def test_resolve_compressed_path_sibling():
+     p = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"
+     out = resolve_compressed_path(p, "jp2k", layout="sibling")
+     assert out == "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1_jp2k.h5"
+
+
+ def test_resolve_compressed_path_mirror():
+     p = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"
+     out = resolve_compressed_path(p, "jp2k", layout="mirror")
+     assert out == "/data/visitor/e/bl/s/RAW_DATA_COMPRESSED/sample/ds/f1.h5"
+
+
+ def test_resolve_mirror_path_requires_raw_data():
+     with pytest.raises(ValueError):
+         resolve_mirror_path("/tmp/no_raw_data_here/f1.h5")
+
+
+ def test_find_dataset_base_h5(tmp_path):
+     ds = tmp_path / "RAW_DATA" / "sample" / "ds1"
+     scan = ds / "scan0001"
+     scan.mkdir(parents=True)
+     base = ds / "dataset.h5"
+     base.write_text("base")
+     src = scan / "frames.h5"
+     src.write_text("source")
+
+     assert find_dataset_base_h5(str(src)) == str(base)
@@ -105,3 +105,21 @@ def test_ssim_error_handling(tmp_path, monkeypatch):
      # should include an ERROR line mentioning the exception message
      assert any("ERROR processing file pair" in line for line in lines)
      assert any("Error" in line for line in lines)
+
+
+ def test_mirror_layout_finds_compressed_file(tmp_path, monkeypatch):
+     raw = tmp_path / "RAW_DATA" / "sample" / "ds" / "d3.h5"
+     comp = tmp_path / "RAW_DATA_COMPRESSED" / "sample" / "ds" / "d3.h5"
+     raw.parent.mkdir(parents=True)
+     comp.parent.mkdir(parents=True)
+     raw.write_text("r3")
+     comp.write_text("c3")
+     report = tmp_path / "report.txt"
+
+     monkeypatch.setattr(rs, "compute_ssim_for_file_pair", lambda o, c: ("d3", ["ok"]))
+
+     rs.run_ssim_check(
+         [str(raw)], method="method", report_path=str(report), layout="mirror"
+     )
+     lines = _read_report(report)
+     assert lines[2] == f"Compressed file: {comp}"
@@ -0,0 +1,81 @@
+ import os
+ from pathlib import Path
+ import re
+
+
+ def resolve_mirror_path(
+     input_path: str,
+     *,
+     source_root: str = "RAW_DATA",
+     target_root: str = "RAW_DATA_COMPRESSED",
+ ) -> str:
+     """
+     Build a mirrored path under `target_root` by replacing the `source_root`
+     segment in `input_path`.
+     """
+     parts = Path(input_path).parts
+     if source_root not in parts:
+         raise ValueError(
+             f"Cannot mirror path '{input_path}': missing '{source_root}' segment."
+         )
+     idx = parts.index(source_root)
+     return str(Path(*parts[:idx], target_root, *parts[idx + 1 :]))
+
+
+ def resolve_compressed_path(
+     input_path: str,
+     method: str,
+     *,
+     layout: str = "sibling",
+     source_root: str = "RAW_DATA",
+     target_root: str = "RAW_DATA_COMPRESSED",
+ ) -> str:
+     if layout == "sibling":
+         base_name = os.path.splitext(os.path.basename(input_path))[0]
+         compressed_name = f"{base_name}_{method}.h5"
+         return os.path.join(os.path.dirname(input_path), compressed_name)
+     if layout == "mirror":
+         # In mirror mode, compressed files keep the same file name as source.
+         return resolve_mirror_path(
+             input_path, source_root=source_root, target_root=target_root
+         )
+     raise ValueError(f"Unsupported layout: {layout}")
+
+
+ def find_dataset_base_h5(
+     input_path: str,
+     *,
+     source_root: str = "RAW_DATA",
+ ) -> str | None:
+     """
+     Walk up from `input_path` to find the dataset directory that contains:
+       - exactly one .h5 file (the base/filter file)
+       - at least one scanXXXX subdirectory
+     Returns the absolute path to that .h5, or None when not found.
+     """
+     scan_re = re.compile(r"^scan\d{4}$", re.IGNORECASE)
+     p = Path(input_path).resolve()
+     parts = p.parts
+     if source_root not in parts:
+         return None
+
+     root_idx = parts.index(source_root)
+     cur = p.parent
+     while True:
+         if len(cur.parts) < root_idx + 1:
+             return None
+
+         try:
+             entries = list(cur.iterdir())
+         except OSError:
+             entries = []
+
+         h5_files = [e for e in entries if e.is_file() and e.suffix.lower() == ".h5"]
+         has_scan = any(e.is_dir() and scan_re.match(e.name) for e in entries)
+
+         if has_scan and len(h5_files) == 1:
+             return str(h5_files[0])
+
+         if len(cur.parts) == root_idx + 1:
+             return None
+         cur = cur.parent
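Editor's note: a short usage sketch of these helpers (the paths are hypothetical; expected results follow the assertions in test_paths.py above):

```python
from esrf_data_compressor.utils.paths import (
    resolve_compressed_path,
    resolve_mirror_path,
)

src = "/data/visitor/e/bl/s/RAW_DATA/sample/ds/f1.h5"  # hypothetical path

# sibling: rename in place, tagging the method
print(resolve_compressed_path(src, "jp2k", layout="sibling"))
# -> /data/visitor/e/bl/s/RAW_DATA/sample/ds/f1_jp2k.h5

# mirror: same file name, RAW_DATA segment swapped for RAW_DATA_COMPRESSED
print(resolve_compressed_path(src, "jp2k", layout="mirror"))
# -> /data/visitor/e/bl/s/RAW_DATA_COMPRESSED/sample/ds/f1.h5

# paths without a RAW_DATA segment cannot be mirrored
try:
    resolve_mirror_path("/tmp/elsewhere/f1.h5")
except ValueError as e:
    print(e)
```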
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: esrf-data-compressor
- Version: 0.1.1
+ Version: 0.2.0
  Summary: A library to compress ESRF data and reduce their footprint
  Author-email: ESRF <dau-pydev@esrf.fr>
  License: MIT License
@@ -84,7 +84,9 @@
  
  * **Non-destructive workflow**
  
- 1. `compress` writes a sibling file `<basename>_<compression_method>.h5` next to each original
+ 1. `compress` writes compressed files either:
+    - next to each source as `<basename>_<compression_method>.h5` (`--layout sibling`), or
+    - under a mirrored `RAW_DATA_COMPRESSED` tree using the same source file names, while copying non-compressed folders/files (`--layout mirror`, default)
  2. `check` computes SSIM (first and last frames) and writes a report
  3. `overwrite` (optional) swaps out the raw frame file (irreversible)
  
@@ -20,8 +20,10 @@ src/esrf_data_compressor/tests/test_cli.py
  src/esrf_data_compressor/tests/test_finder.py
  src/esrf_data_compressor/tests/test_hdf5_helpers.py
  src/esrf_data_compressor/tests/test_jp2k.py
+ src/esrf_data_compressor/tests/test_paths.py
  src/esrf_data_compressor/tests/test_run_check.py
  src/esrf_data_compressor/tests/test_ssim.py
  src/esrf_data_compressor/tests/test_utils.py
  src/esrf_data_compressor/utils/hdf5_helpers.py
+ src/esrf_data_compressor/utils/paths.py
  src/esrf_data_compressor/utils/utils.py
@@ -1,167 +0,0 @@
- import os
- from concurrent.futures import ProcessPoolExecutor, as_completed
- from tqdm import tqdm
-
- from esrf_data_compressor.compressors.jp2k import JP2KCompressorWrapper
-
-
- class Compressor:
-     """
-     Abstract base class. Subclasses must implement compress_file().
-     """
-
-     def compress_file(self, input_path: str, output_path: str, **kwargs):
-         raise NotImplementedError
-
-
- class CompressorManager:
-     """
-     Manages parallel compression and overwrite.
-
-     Each worker process is given up to 4 Blosc2 threads (or fewer if the machine
-     has fewer than 4 cores). The number of worker processes is then
-     total_cores // threads_per_worker (at least 1). If the user explicitly
-     passes `workers`, we cap it to `total_cores`, then recompute threads_per_worker
-     = min(4, total_cores // workers).
-
-     Usage:
-         mgr = CompressorManager(cratio=10, method='jp2k')
-         mgr.compress_files([...])
-         mgr.overwrite_files([...])
-     """
-
-     def __init__(
-         self, workers: int | None = None, cratio: int = 10, method: str = "jp2k"
-     ):
-         total_cores = os.cpu_count() or 1
-
-         # Determine default threads per worker (4, or fewer if total_cores < 4)
-         if total_cores >= 4:
-             default_nthreads = 4
-         else:
-             default_nthreads = 1
-
-         # Default worker count
-         default_workers = max(1, total_cores // default_nthreads)
-
-         if workers is None:
-             # Use default workers and default_nthreads
-             w = default_workers
-             nthreads = default_nthreads
-         else:
-             # Cap workers to total_cores
-             w = min(workers, total_cores)
-             # Recompute threads per worker so that (w * nthreads) ≤ total_cores, up to 4
-             possible = total_cores // w
-             nthreads = min(possible, 4) if possible >= 1 else 1
-
-         self.workers = max(1, w)
-         self.nthreads = max(1, nthreads)
-         self.cratio = cratio
-         self.method = method
-
-         # Instantiate compressor based on method
-         if self.method == "jp2k":
-             self.compressor = JP2KCompressorWrapper(
-                 cratio=cratio, nthreads=self.nthreads
-             )
-         else:
-             raise ValueError(f"Unsupported compression method: {self.method}")
-
-         print(f"Compression method: {self.method}")
-         print(f"Total CPU cores: {total_cores}")
-         print(f"Worker processes: {self.workers}")
-         print(f"Threads per worker: {self.nthreads}")
-         print(f"Total threads: {self.workers * self.nthreads}")
-
-     def _compress_worker(self, ipath: str) -> tuple[str, str]:
-         """
-         Worker function for ProcessPoolExecutor: compress a single HDF5:
-           <ipath>.h5 → <same_dir>/<basename>_<method>.h5
-         """
-         base, _ = os.path.splitext(ipath)
-         outp = f"{base}_{self.method}.h5"
-         self.compressor.compress_file(
-             ipath, outp, cratio=self.cratio, nthreads=self.nthreads
-         )
-         return ipath, "success"
-
-     def compress_files(self, file_list: list[str]) -> None:
-         """
-         Compress each .h5 in file_list in parallel, producing <basename>_<method>.h5
-         next to each source file. Does not overwrite originals. At the end, prints
-         total elapsed time and data rate in MB/s.
-         """
-         valid = [p for p in file_list if p.lower().endswith(".h5")]
-         if not valid:
-             print("No valid .h5 files to compress.")
-             return
-
-         total_bytes = 0
-         for f in valid:
-             try:
-                 total_bytes += os.path.getsize(f)
-             except OSError:
-                 pass
-
-         import time
-
-         t0 = time.time()
-
-         with ProcessPoolExecutor(max_workers=self.workers) as executor:
-             futures = {executor.submit(self._compress_worker, p): p for p in valid}
-             for fut in tqdm(
-                 as_completed(futures),
-                 total=len(futures),
-                 desc=f"Compressing HDF5 files ({self.method})",
-                 unit="file",
-             ):
-                 pth = futures[fut]
-                 try:
-                     fut.result()
-                 except Exception as e:
-                     print(f"Failed to compress '{pth}': {e}")
-
-         t1 = time.time()
-         elapsed = t1 - t0
-         total_mb = total_bytes / (1024 * 1024)
-         rate_mb_s = total_mb / elapsed if elapsed > 0 else float("inf")
-         print(f"\nTotal elapsed time: {elapsed:.3f}s")
-         print(f"Data processed: {total_mb:.2f} MB ({rate_mb_s:.2f} MB/s)\n")
-
-     def overwrite_files(self, file_list: list[str]) -> None:
-         """
-         Overwrites files only if they have a compressed sibling:
-
-         1) Rename <file>.h5 → <file>.h5.bak
-         2) Rename <file>_<method>.h5 → <file>.h5
-
-         After processing all files, removes the backup .h5.bak files.
-         """
-         backups = []
-         for ipath in file_list:
-             if not ipath.lower().endswith(".h5"):
-                 continue
-
-             base, _ = os.path.splitext(ipath)
-             compressed_path = f"{base}_{self.method}.h5"
-
-             if os.path.exists(compressed_path):
-                 backup = ipath + ".bak"
-                 try:
-                     os.replace(ipath, backup)
-                     os.replace(compressed_path, ipath)
-                     backups.append(backup)
-                     print(f"Overwritten '{ipath}' (backup at '{backup}').")
-                 except Exception as e:
-                     print(f"ERROR overwriting '{ipath}': {e}")
-             else:
-                 print(f"SKIP (no compressed file): {ipath}")
-
-         # Remove all backup files
-         for backup in backups:
-             try:
-                 os.remove(backup)
-                 print(f"Deleted backup '{backup}'.")
-             except Exception as e:
-                 print(f"ERROR deleting backup '{backup}': {e}")