esrf-data-compressor 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- esrf_data_compressor/__init__.py +0 -0
- esrf_data_compressor/checker/run_check.py +77 -0
- esrf_data_compressor/checker/ssim.py +87 -0
- esrf_data_compressor/cli.py +193 -0
- esrf_data_compressor/compressors/__init__.py +0 -0
- esrf_data_compressor/compressors/base.py +271 -0
- esrf_data_compressor/compressors/jp2k.py +149 -0
- esrf_data_compressor/finder/finder.py +203 -0
- esrf_data_compressor/tests/__init__.py +0 -0
- esrf_data_compressor/tests/test_cli.py +262 -0
- esrf_data_compressor/tests/test_finder.py +70 -0
- esrf_data_compressor/tests/test_hdf5_helpers.py +9 -0
- esrf_data_compressor/tests/test_jp2k.py +87 -0
- esrf_data_compressor/tests/test_paths.py +36 -0
- esrf_data_compressor/tests/test_run_check.py +125 -0
- esrf_data_compressor/tests/test_ssim.py +106 -0
- esrf_data_compressor/tests/test_utils.py +64 -0
- esrf_data_compressor/utils/hdf5_helpers.py +18 -0
- esrf_data_compressor/utils/paths.py +81 -0
- esrf_data_compressor/utils/utils.py +34 -0
- esrf_data_compressor-0.2.0.dist-info/METADATA +185 -0
- esrf_data_compressor-0.2.0.dist-info/RECORD +26 -0
- esrf_data_compressor-0.2.0.dist-info/WHEEL +5 -0
- esrf_data_compressor-0.2.0.dist-info/entry_points.txt +2 -0
- esrf_data_compressor-0.2.0.dist-info/licenses/LICENSE +20 -0
- esrf_data_compressor-0.2.0.dist-info/top_level.txt +1 -0
esrf_data_compressor/__init__.py
File without changes
esrf_data_compressor/checker/run_check.py
@@ -0,0 +1,77 @@
import os
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

from esrf_data_compressor.checker.ssim import compute_ssim_for_file_pair
from esrf_data_compressor.utils.paths import resolve_compressed_path


def run_ssim_check(
    raw_files: list[str], method: str, report_path: str, layout: str = "sibling"
) -> None:
    """
    Given a list of raw HDF5 file paths, partitions into:
      to_check → those with an expected compressed counterpart according to `layout`
      missing  → those without one

    Writes a report to `report_path`:
      - '=== NOT COMPRESSED FILES ===' listing each missing
      - then for each to_check pair, computes SSIM in parallel and appends
        per-dataset SSIM lines under '=== <stem> ===' with full paths
    """
    to_check: list[tuple[str, str]] = []
    missing: list[str] = []

    # partition
    for orig in raw_files:
        comp_path = resolve_compressed_path(orig, method, layout=layout)
        if os.path.exists(comp_path):
            to_check.append((orig, comp_path))
        else:
            missing.append(orig)
    print(
        f"Found {len(to_check)} file pairs to check, {len(missing)} missing compressed files."
    )

    # write report
    with open(report_path, "w") as rpt:
        if missing:
            rpt.write("=== NOT COMPRESSED FILES ===\n")
            for orig in missing:
                rpt.write(f"{orig} :: NO COMPRESSED DATASET FOUND\n")
            rpt.write("\n")

        if not to_check:
            rpt.write("No file pairs to check (no compressed siblings found).\n")
            return

        # run SSIM in parallel
        n_workers = min(len(to_check), os.cpu_count() or 1)
        with ProcessPoolExecutor(max_workers=n_workers) as exe:
            futures = {
                exe.submit(compute_ssim_for_file_pair, orig, comp): (orig, comp)
                for orig, comp in to_check
            }

            for fut in tqdm(
                as_completed(futures),
                total=len(futures),
                desc="Checking SSIM (files)",
                unit="file",
            ):
                orig, comp = futures[fut]
                fname = os.path.basename(orig)
                comp_name = os.path.basename(comp)
                tqdm.write(f"Checking file: {fname} ↔ {comp_name}")
                try:
                    # get results
                    basename, lines = fut.result()
                    # write section with both file paths
                    rpt.write(f"=== {basename} ===\n")
                    rpt.write(f"Uncompressed file: {orig}\n")
                    rpt.write(f"Compressed file: {comp}\n")
                    for line in lines:
                        rpt.write(line + "\n")
                    rpt.write("\n")
                except Exception as e:
                    rpt.write(f"{orig} :: ERROR processing file pair: {e}\n\n")
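
For orientation, a minimal sketch of driving run_ssim_check directly, assuming two raw files whose jp2k counterparts live under RAW_DATA_COMPRESSED (the mirror layout); the file paths below are hypothetical placeholders, not taken from the package:

from esrf_data_compressor.checker.run_check import run_ssim_check

# Hypothetical raw files; in practice these come from a `list` report.
raw_files = [
    "/data/visitor/ma1234/id11/20240101/RAW_DATA/sample/dataset/scan_0001.h5",
    "/data/visitor/ma1234/id11/20240101/RAW_DATA/sample/dataset/scan_0002.h5",
]

# Writes '=== <stem> ===' sections (or a NOT COMPRESSED listing) to ssim_report.txt.
run_ssim_check(raw_files, method="jp2k", report_path="ssim_report.txt", layout="mirror")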
esrf_data_compressor/checker/ssim.py
@@ -0,0 +1,87 @@
# src/esrf_data_compressor/checker/ssim.py

import os
import numpy as np
import h5py
from skimage.metrics import structural_similarity as ssim


def _select_win_size(H: int, W: int) -> int:
    """
    Choose an odd, valid window size for SSIM given slice dimensions H×W.
    win_size = min(H, W, 7), made odd, at least 3.
    """
    win = min(H, W, 7)
    if win % 2 == 0:
        win -= 1
    return max(win, 3)


def compute_ssim_for_dataset_pair(
    orig_path: str, comp_path: str, dataset_relpath: str
) -> tuple[float, float]:
    """
    Given two HDF5 files and the relative 3D dataset path (e.g., 'entry_0000/ESRF-ID11/marana/data'),
    compute SSIM on the first (z=0) and last (z=Z-1) slices.
    Returns (ssim_first, ssim_last). If a slice is constant, SSIM = 1.0.
    """
    with h5py.File(orig_path, "r") as fo, h5py.File(comp_path, "r") as fc:
        ds_o = fo[dataset_relpath]
        ds_c = fc[dataset_relpath]

        # Ensure both datasets are 3D
        if ds_o.ndim != 3 or ds_c.ndim != 3:
            raise IndexError(
                f"Dataset '{dataset_relpath}' is not 3D (orig: {ds_o.ndim}D, comp: {ds_c.ndim}D)"
            )

        first_o = ds_o[0].astype(np.float64)
        last_o = ds_o[-1].astype(np.float64)
        first_c = ds_c[0].astype(np.float64)
        last_c = ds_c[-1].astype(np.float64)

        H, W = first_o.shape
        win = _select_win_size(H, W)

        def _slice_ssim(a: np.ndarray, b: np.ndarray) -> float:
            amin, amax = a.min(), a.max()
            if amax == amin:
                return 1.0
            dr = amax - amin
            return ssim(a, b, data_range=dr, win_size=win)

        s0 = _slice_ssim(first_o, first_c)
        s1 = _slice_ssim(last_o, last_c)
        return s0, s1


def compute_ssim_for_file_pair(orig_path: str, comp_path: str) -> tuple[str, list[str]]:
    """
    Compute SSIM for every 3D dataset under `orig_path` vs. `comp_path`.
    Returns (basename, [report_lines…]), where each line is either:
      "<dataset_relpath>: SSIM_first=… SSIM_last=…" or an error message.
    """
    basename = os.path.basename(orig_path)
    report_lines: list[str] = []

    with h5py.File(orig_path, "r") as fo:
        ds_paths: list[str] = []

        def visitor(name, obj):
            if isinstance(obj, h5py.Dataset) and obj.ndim == 3:
                ds_paths.append(name)

        fo.visititems(visitor)

    if not ds_paths:
        report_lines.append(f"No 3D datasets found in {basename}")
        return basename, report_lines

    for ds in ds_paths:
        try:
            s0, s1 = compute_ssim_for_dataset_pair(orig_path, comp_path, ds)
            report_lines.append(f"{ds}: SSIM_first={s0:.4f} SSIM_last={s1:.4f}")
        except Exception as e:
            report_lines.append(f"{ds}: ERROR computing SSIM: {e}")

    return basename, report_lines
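
A quick sanity sketch of the window-size rule above (min(H, W, 7), forced odd, floored at 3); note the floor of 3 implicitly assumes slices are at least 3×3:

from esrf_data_compressor.checker.ssim import _select_win_size

# (H, W) -> chosen SSIM window size
for h, w in [(2048, 2048), (6, 2048), (4, 4), (1, 1)]:
    print((h, w), "->", _select_win_size(h, w))
# (2048, 2048) -> 7, (6, 2048) -> 5, (4, 4) -> 3, (1, 1) -> 3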
esrf_data_compressor/cli.py
@@ -0,0 +1,193 @@
import os
import argparse

from esrf_data_compressor.finder.finder import find_vds_files, write_report
from esrf_data_compressor.compressors.base import CompressorManager
from esrf_data_compressor.checker.run_check import run_ssim_check
from esrf_data_compressor.utils.hdf5_helpers import exit_with_error
from esrf_data_compressor.utils.utils import parse_report


def get_path_components(args):
    comps = [args.experiment]
    if args.beamline:
        comps.append(args.beamline)
    if args.session:
        comps.append(args.session)
    return comps


def do_list(args):
    """
    1) Discover datasets under RAW_DATA/<components...>
    2) Apply dataset-level filters (--filter key:val[,key2:val2...])
    3) Extract VDS source files from every dataset
    4) Write two-section report:
         ## TO COMPRESS ##  (sources from matching datasets)
         ## REMAINING ##    (sources from non-matching datasets)
    """
    comps = get_path_components(args)
    try:
        to_c, rem = find_vds_files(comps, base_root=args.root, filter_expr=args.filter)
    except SystemExit as e:
        exit_with_error(str(e))

    report_path = args.output or "file_list.txt"
    write_report(to_c, rem, report_path)
    print(f"Report written to {report_path}")


def do_compress(args):
    report = args.input
    if not report:
        exit_with_error("The --input report file must be specified for compress")
    try:
        files = parse_report(report)
    except Exception as e:
        exit_with_error(f"Failed to read report '{report}': {e}")

    if not files:
        print("Nothing to compress (TO COMPRESS list is empty).")
        return

    print(
        f"Compressing {len(files)} file(s) from '{report}' using '{args.method}' method, ratio {args.cratio}, layout '{args.layout}' …"
    )
    mgr = CompressorManager(cratio=args.cratio, method=args.method, layout=args.layout)
    mgr.compress_files(files)
    print("Compression complete.\n")


def do_check(args):
    report = args.input or "file_list.txt"
    try:
        files = parse_report(report)
    except Exception as e:
        exit_with_error(f"Failed to read report '{report}': {e}")

    if not files:
        print("Nothing to check (TO COMPRESS list is empty).")
        return

    report_fname = f"{os.path.splitext(report)[0]}_{args.method}_ssim_report.txt"
    report_path = os.path.abspath(report_fname)

    try:
        run_ssim_check(files, args.method, report_path, layout=args.layout)
    except SystemExit as e:
        exit_with_error(str(e))

    print(f"SSIM report written to {report_path}\n")


def do_overwrite(args):
    report = args.input or "file_list.txt"
    try:
        files = parse_report(report)
    except Exception as e:
        exit_with_error(f"Failed to read report '{report}': {e}")

    if not files:
        print("Nothing to process (TO COMPRESS list is empty).")
        return

    mgr = CompressorManager()

    if args.final:
        print(f"Finalizing overwrite for {len(files)} file(s) from '{report}' …")
        mgr.remove_backups(files)
        print("Finalize step complete.\n")
        return

    if args.undo:
        print(f"Undoing overwrite for {len(files)} file(s) from '{report}' …")
        mgr.restore_backups(files)
        print("Undo step complete.\n")
        return

    print(f"Overwriting {len(files)} file(s) from '{report}' …")
    mgr.overwrite_files(files)
    print("Overwrite complete (backups kept).\n")


def main():
    parser = argparse.ArgumentParser(
        description="List, compress, check or overwrite ESRF HDF5 VDS sources."
    )
    sub = parser.add_subparsers(dest="command", required=True)

    p = sub.add_parser("list", help="Report VDS sources → TO COMPRESS vs REMAINING")
    p.add_argument("experiment", help="Experiment ID")
    p.add_argument("beamline", nargs="?", help="Optional beamline")
    p.add_argument("session", nargs="?", help="Optional session")
    p.add_argument("--root", default="/data/visitor", help="Base directory")
    p.add_argument(
        "--filter",
        metavar="KEY:VAL[,KEY2:VAL2...]",
        help="Dataset-level attribute substring filters",
    )
    p.add_argument("--output", help="Report file (default = file_list.txt)")
    p.set_defaults(func=do_list)

    p = sub.add_parser("compress", help="Compress only the TO COMPRESS files")
    p.add_argument(
        "--input",
        "-i",
        required=True,
        help="Report file to read (must be produced by `list`)",
    )
    p.add_argument("--cratio", type=int, default=10, help="Compression ratio")
    p.add_argument(
        "--method",
        choices=["jp2k"],
        default="jp2k",
        help="Compression method",
    )
    p.add_argument(
        "--layout",
        choices=["sibling", "mirror"],
        default="mirror",
        help="Output layout: sibling (next to each source) or mirror (under RAW_DATA_COMPRESSED, preserving source names).",
    )
    p.set_defaults(func=do_compress)

    p = sub.add_parser("check", help="Generate SSIM report for TO COMPRESS files")
    p.add_argument(
        "--input", "-i", help="Report file to read (default = file_list.txt)"
    )
    p.add_argument(
        "--method", choices=["jp2k"], default="jp2k", help="Compression method"
    )
    p.add_argument(
        "--layout",
        choices=["sibling", "mirror"],
        default="mirror",
        help="Location of compressed files to check.",
    )
    p.set_defaults(func=do_check)

    p = sub.add_parser(
        "overwrite",
        help="Swap in compressed files and keep backups; with --final or --undo, perform cleanup/restore only.",
    )
    p.add_argument(
        "--input", "-i", help="Report file to read (default = file_list.txt)"
    )
    group = p.add_mutually_exclusive_group()
    group.add_argument(
        "--final",
        action="store_true",
        help="Cleanup only: delete existing *.h5.bak backups after confirmation (no overwrite).",
    )
    group.add_argument(
        "--undo",
        action="store_true",
        help="Restore only: move <file>.h5.bak back to <file>.h5 and preserve the current file as <file>_<method>.h5 when needed.",
    )
    p.set_defaults(func=do_overwrite)

    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()
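
A sketch of the intended end-to-end flow, driven through main() with sys.argv set by hand. The console-script name, experiment/beamline/session values, and the filter usage are all assumptions for illustration (the actual entry-point name lives in entry_points.txt); shell equivalents are in the comments:

import sys
from esrf_data_compressor.cli import main

# 1) Write file_list.txt; roughly: esrf-data-compressor list ma1234 id11 20240101
sys.argv = ["esrf-data-compressor", "list", "ma1234", "id11", "20240101"]
main()

# 2) Compress the TO COMPRESS files into the mirrored RAW_DATA_COMPRESSED tree.
sys.argv = ["esrf-data-compressor", "compress", "-i", "file_list.txt",
            "--cratio", "10", "--layout", "mirror"]
main()

# 3) Produce file_list_jp2k_ssim_report.txt comparing raw vs. compressed slices.
sys.argv = ["esrf-data-compressor", "check", "-i", "file_list.txt"]
main()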
esrf_data_compressor/compressors/__init__.py
File without changes
esrf_data_compressor/compressors/base.py
@@ -0,0 +1,271 @@
import os
import shutil
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

from esrf_data_compressor.compressors.jp2k import JP2KCompressorWrapper
from esrf_data_compressor.utils.paths import (
    find_dataset_base_h5,
    resolve_compressed_path,
    resolve_mirror_path,
)


class Compressor:
    """
    Abstract base class. Subclasses must implement compress_file().
    """

    def compress_file(self, input_path: str, output_path: str, **kwargs):
        raise NotImplementedError


class CompressorManager:
    """
    Manages parallel compression and overwrite.

    Each worker process is given up to 4 Blosc2 threads (or fewer if the machine
    has fewer than 4 cores). The number of worker processes is then
    total_cores // threads_per_worker (at least 1). If the user explicitly
    passes `workers`, we cap it to `total_cores`, then recompute
    threads_per_worker = min(4, total_cores // workers).

    Usage:
        mgr = CompressorManager(cratio=10, method='jp2k')
        mgr.compress_files([...])
        mgr.overwrite_files([...])
    """

    def __init__(
        self,
        workers: int | None = None,
        cratio: int = 10,
        method: str = "jp2k",
        layout: str = "sibling",
    ):
        total_cores = os.cpu_count() or 1
        default_nthreads = 4 if total_cores >= 4 else 1
        default_workers = max(1, total_cores // default_nthreads)

        if workers is None:
            w = default_workers
            nthreads = default_nthreads
        else:
            w = min(workers, total_cores)
            possible = total_cores // w
            nthreads = min(possible, 4) if possible >= 1 else 1

        self.workers = max(1, w)
        self.nthreads = max(1, nthreads)
        self.cratio = cratio
        self.method = method
        self.layout = layout

        if self.method == "jp2k":
            self.compressor = JP2KCompressorWrapper(
                cratio=cratio, nthreads=self.nthreads
            )
        else:
            raise ValueError(f"Unsupported compression method: {self.method}")

        print(f"Compression method: {self.method}")
        print(f"Output layout: {self.layout}")
        print(f"Total CPU cores: {total_cores}")
        print(f"Worker processes: {self.workers}")
        print(f"Threads per worker: {self.nthreads}")
        print(f"Total threads: {self.workers * self.nthreads}")

    def _compress_worker(self, ipath: str) -> tuple[str, str]:
        """
        Worker function for ProcessPoolExecutor: compress a single HDF5:
          - sibling layout: <same_dir>/<basename>_<method>.h5
          - mirror layout: mirror RAW_DATA tree under RAW_DATA_COMPRESSED
        """
        outp = resolve_compressed_path(ipath, self.method, layout=self.layout)
        os.makedirs(os.path.dirname(outp), exist_ok=True)
        self.compressor.compress_file(
            ipath, outp, cratio=self.cratio, nthreads=self.nthreads
        )
        return ipath, "success"

    def _mirror_non_compressed_dataset_content(self, file_list: list[str]) -> None:
        source_targets = {os.path.realpath(p) for p in file_list}
        mirror_roots: set[str] = set()
        for ipath in file_list:
            base_h5 = find_dataset_base_h5(ipath)
            dataset_dir = (
                os.path.dirname(base_h5) if base_h5 else os.path.dirname(ipath)
            )
            # Mirror the parent sample folder too, so sidecar files next to
            # dataset folders are preserved (e.g. RAW_DATA/<sample>/*.h5).
            mirror_roots.add(os.path.dirname(dataset_dir))

        for src_dir in sorted(mirror_roots):
            try:
                dst_dir = resolve_mirror_path(src_dir)
            except ValueError:
                print(f"WARNING: Cannot mirror folder outside RAW_DATA: '{src_dir}'")
                continue

            for cur, dirs, files in os.walk(src_dir):
                rel_cur = os.path.relpath(cur, src_dir)
                target_cur = (
                    dst_dir if rel_cur == "." else os.path.join(dst_dir, rel_cur)
                )
                os.makedirs(target_cur, exist_ok=True)

                for dname in dirs:
                    os.makedirs(os.path.join(target_cur, dname), exist_ok=True)

                for fname in files:
                    src_file = os.path.join(cur, fname)
                    if os.path.realpath(src_file) in source_targets:
                        # Do not copy raw files that will be produced by compression.
                        continue
                    dst_file = os.path.join(target_cur, fname)
                    shutil.copy2(src_file, dst_file)

    def compress_files(self, file_list: list[str]) -> None:
        """
        Compress each .h5 in file_list in parallel.
          - sibling layout: produce <basename>_<method>.h5 next to each source.
          - mirror layout: write compressed files to RAW_DATA_COMPRESSED with same file names.
        Does not overwrite originals. At the end, prints total elapsed time and data rate in MB/s.
        """
        valid = [p for p in file_list if p.lower().endswith(".h5")]
        if not valid:
            print("No valid .h5 files to compress.")
            return
        if self.layout == "mirror":
            print(
                "Preparing RAW_DATA_COMPRESSED with non-compressed dataset content..."
            )
            self._mirror_non_compressed_dataset_content(valid)

        total_bytes = 0
        for f in valid:
            try:
                total_bytes += os.path.getsize(f)
            except OSError:
                pass

        import time

        t0 = time.time()

        with ProcessPoolExecutor(max_workers=self.workers) as executor:
            futures = {executor.submit(self._compress_worker, p): p for p in valid}
            for fut in tqdm(
                as_completed(futures),
                total=len(futures),
                desc=f"Compressing HDF5 files ({self.method})",
                unit="file",
            ):
                pth = futures[fut]
                try:
                    fut.result()
                except Exception as e:
                    print(f"Failed to compress '{pth}': {e}")

        elapsed = time.time() - t0
        total_mb = total_bytes / (1024 * 1024)
        rate_mb_s = total_mb / elapsed if elapsed > 0 else float("inf")
        print(f"\nTotal elapsed time: {elapsed:.3f}s")
        print(f"Data processed: {total_mb:.2f} MB ({rate_mb_s:.2f} MB/s)\n")

    def overwrite_files(self, file_list: list[str]) -> None:
        """
        Overwrites files only if they have a compressed sibling:

          1) Rename <file>.h5 → <file>.h5.bak
          2) Rename <file>_<method>.h5 → <file>.h5

        Backups (<file>.h5.bak) are kept; remove_backups() (the CLI's
        'overwrite --final' step) deletes them later after confirmation.
        """
        for ipath in file_list:
            if not ipath.lower().endswith(".h5"):
                continue

            compressed_path = resolve_compressed_path(
                ipath, self.method, layout=self.layout
            )

            if os.path.exists(compressed_path):
                backup = ipath + ".bak"
                try:
                    os.replace(ipath, backup)
                    os.replace(compressed_path, ipath)
                    print(f"Overwritten '{ipath}' (backup at '{backup}').")
                except Exception as e:
                    print(f"ERROR overwriting '{ipath}': {e}")
            else:
                print(f"SKIP (no compressed file): {ipath}")

    def remove_backups(self, file_list: list[str]) -> None:
        candidates = {p + ".bak" for p in file_list if p.lower().endswith(".h5")}
        backups = [b for b in candidates if os.path.exists(b)]
        if not backups:
            print("No backup files to remove.")
            return

        total_bytes = 0
        for b in backups:
            try:
                total_bytes += os.path.getsize(b)
            except OSError:
                pass
        total_mb = total_bytes / (1024 * 1024)

        print(
            f"About to remove {len(backups)} backup file(s), ~{total_mb:.2f} MB total."
        )
        ans = input("Proceed? [y/N]: ").strip().lower()
        if ans not in ("y", "yes"):
            print("Backups kept.")
            return

        removed = 0
        for b in backups:
            try:
                os.remove(b)
                removed += 1
            except Exception as e:
                print(f"ERROR deleting backup '{b}': {e}")

        print(f"Deleted {removed} backup file(s).")

    def restore_backups(self, file_list: list[str]) -> None:
        restored = 0
        preserved = 0
        for ipath in file_list:
            if not ipath.lower().endswith(".h5"):
                continue

            backup = ipath + ".bak"
            method_path = resolve_compressed_path(
                ipath, self.method, layout=self.layout
            )

            if not os.path.exists(backup):
                print(f"SKIP (no backup): {ipath}")
                continue

            if os.path.exists(ipath) and not os.path.exists(method_path):
                try:
                    os.replace(ipath, method_path)
                    preserved += 1
                    print(f"Preserved current file to '{method_path}'.")
                except Exception as e:
                    print(f"ERROR preserving current '{ipath}' to '{method_path}': {e}")
                    continue

            try:
                os.replace(backup, ipath)
                restored += 1
                print(f"Restored '{ipath}' from backup.")
            except Exception as e:
                print(f"ERROR restoring '{ipath}' from '{backup}': {e}")

        print(
            f"Restore complete. Restored: {restored}, preserved compressed copies: {preserved}."
        )
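
To make the worker/thread heuristic in CompressorManager's docstring concrete, a worked example for a hypothetical 32-core machine, mirroring the arithmetic in __init__ (the real code uses os.cpu_count() or 1 instead of a constant):

total_cores = 32  # hypothetical core count

# Default: up to 4 Blosc2 threads per worker, workers = total_cores // 4.
default_nthreads = 4 if total_cores >= 4 else 1
default_workers = max(1, total_cores // default_nthreads)
print(default_workers, default_nthreads)  # -> 8 4 (8 workers x 4 threads = 32)

# Explicit workers=16: cap to total_cores, then threads = min(4, 32 // 16) = 2,
# keeping workers * threads at or below the core count (16 x 2 = 32).
w = min(16, total_cores)
possible = total_cores // w
nthreads = min(possible, 4) if possible >= 1 else 1
print(w, nthreads)  # -> 16 2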