esrf-data-compressor 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {esrf_data_compressor-0.2.0/src/esrf_data_compressor.egg-info → esrf_data_compressor-0.2.1}/PKG-INFO +3 -3
  2. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/README.md +2 -2
  3. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/pyproject.toml +1 -1
  4. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/checker/run_check.py +2 -2
  5. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/base.py +38 -17
  6. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/jp2k.py +3 -3
  7. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_cli.py +4 -0
  8. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/utils/paths.py +48 -0
  9. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1/src/esrf_data_compressor.egg-info}/PKG-INFO +3 -3
  10. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/LICENSE +0 -0
  11. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/setup.cfg +0 -0
  12. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/__init__.py +0 -0
  13. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/checker/ssim.py +0 -0
  14. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/cli.py +0 -0
  15. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/__init__.py +0 -0
  16. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/finder/finder.py +0 -0
  17. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/__init__.py +0 -0
  18. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_finder.py +0 -0
  19. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_hdf5_helpers.py +0 -0
  20. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_jp2k.py +0 -0
  21. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_paths.py +0 -0
  22. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_run_check.py +0 -0
  23. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_ssim.py +0 -0
  24. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_utils.py +0 -0
  25. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/utils/hdf5_helpers.py +0 -0
  26. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/utils/utils.py +0 -0
  27. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/SOURCES.txt +0 -0
  28. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/dependency_links.txt +0 -0
  29. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/entry_points.txt +0 -0
  30. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/requires.txt +0 -0
  31. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: esrf-data-compressor
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: A library to compress ESRF data and reduce their footprint
5
5
  Author-email: ESRF <dau-pydev@esrf.fr>
6
6
  License: MIT License
@@ -79,8 +79,8 @@ Dynamic: license-file
79
79
 
80
80
  * **Parallel execution**
81
81
 
82
- * Automatically factors CPU cores into worker processes × per-process threads
83
- * By default, each worker runs up to 4 Blosc2 threads (or falls back to 1 thread if < 4 cores)
82
+ * Automatically factors CPU cores into worker processes × per-process threads
83
+ * By default, each worker runs up to 2 Blosc2 threads (or falls back to 1 thread if < 2 cores)
84
84
 
85
85
  * **Non-destructive workflow**
86
86
 
@@ -18,8 +18,8 @@
18
18
 
19
19
  * **Parallel execution**
20
20
 
21
- * Automatically factors CPU cores into worker processes × per-process threads
22
- * By default, each worker runs up to 4 Blosc2 threads (or falls back to 1 thread if < 4 cores)
21
+ * Automatically factors CPU cores into worker processes × per-process threads
22
+ * By default, each worker runs up to 2 Blosc2 threads (or falls back to 1 thread if < 2 cores)
23
23
 
24
24
  * **Non-destructive workflow**
25
25
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "esrf-data-compressor"
7
- version = "0.2.0"
7
+ version = "0.2.1"
8
8
  authors = [{ name = "ESRF", email = "dau-pydev@esrf.fr" }]
9
9
  description = "A library to compress ESRF data and reduce their footprint"
10
10
  readme = { file = "README.md", content-type = "text/markdown" }
@@ -3,7 +3,7 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
3
3
  from tqdm import tqdm
4
4
 
5
5
  from esrf_data_compressor.checker.ssim import compute_ssim_for_file_pair
6
- from esrf_data_compressor.utils.paths import resolve_compressed_path
6
+ from esrf_data_compressor.utils.paths import get_available_cpus, resolve_compressed_path
7
7
 
8
8
 
9
9
  def run_ssim_check(
@@ -46,7 +46,7 @@ def run_ssim_check(
46
46
  return
47
47
 
48
48
  # run SSIM in parallel
49
- n_workers = min(len(to_check), os.cpu_count() or 1)
49
+ n_workers = min(len(to_check), get_available_cpus())
50
50
  with ProcessPoolExecutor(max_workers=n_workers) as exe:
51
51
  futures = {
52
52
  exe.submit(compute_ssim_for_file_pair, orig, comp): (orig, comp)
@@ -1,11 +1,12 @@
1
1
  import os
2
2
  import shutil
3
- from concurrent.futures import ProcessPoolExecutor, as_completed
3
+ from pathlib import Path
4
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
4
5
  from tqdm import tqdm
5
6
 
6
7
  from esrf_data_compressor.compressors.jp2k import JP2KCompressorWrapper
7
8
  from esrf_data_compressor.utils.paths import (
8
- find_dataset_base_h5,
9
+ get_available_cpus,
9
10
  resolve_compressed_path,
10
11
  resolve_mirror_path,
11
12
  )
@@ -24,11 +25,11 @@ class CompressorManager:
24
25
  """
25
26
  Manages parallel compression and overwrite.
26
27
 
27
- Each worker process is given up to 4 Blosc2 threads (or fewer if the machine
28
+ Each worker process is given up to 2 Blosc2 threads (or fewer if the machine
28
29
  has fewer than 2 cores). The number of worker processes is then
29
30
  total_cores // threads_per_worker (at least 1). If the user explicitly
30
31
  passes `workers`, we cap it to `total_cores`, then recompute threads_per_worker
31
- = min(4, total_cores // workers).
32
+ = min(2, total_cores // workers).
32
33
 
33
34
  Usage:
34
35
  mgr = CompressorManager(cratio=10, method='jp2k')
@@ -43,8 +44,8 @@ class CompressorManager:
43
44
  method: str = "jp2k",
44
45
  layout: str = "sibling",
45
46
  ):
46
- total_cores = os.cpu_count() or 1
47
- default_nthreads = 4 if total_cores >= 4 else 1
47
+ total_cores = get_available_cpus()
48
+ default_nthreads = 2 if total_cores >= 2 else 1
48
49
  default_workers = max(1, total_cores // default_nthreads)
49
50
 
50
51
  if workers is None:
@@ -53,7 +54,7 @@ class CompressorManager:
53
54
  else:
54
55
  w = min(workers, total_cores)
55
56
  possible = total_cores // w
56
- nthreads = min(possible, 4) if possible >= 1 else 1
57
+ nthreads = min(possible, 2) if possible >= 1 else 1
57
58
 
58
59
  self.workers = max(1, w)
59
60
  self.nthreads = max(1, nthreads)
@@ -75,6 +76,14 @@ class CompressorManager:
75
76
  print(f"Threads per worker: {self.nthreads}")
76
77
  print(f"Total threads: {self.workers * self.nthreads}")
77
78
 
79
+ @staticmethod
80
+ def _find_raw_root(path: str) -> str | None:
81
+ p = Path(os.path.abspath(path))
82
+ parts = p.parts
83
+ if "RAW_DATA" not in parts:
84
+ return None
85
+ return str(Path(*parts[: parts.index("RAW_DATA") + 1]))
86
+
78
87
  def _compress_worker(self, ipath: str) -> tuple[str, str]:
79
88
  """
80
89
  Worker function for ProcessPoolExecutor: compress a single HDF5:
@@ -90,17 +99,14 @@ class CompressorManager:
90
99
 
91
100
  def _mirror_non_compressed_dataset_content(self, file_list: list[str]) -> None:
92
101
  source_targets = {os.path.realpath(p) for p in file_list}
93
- mirror_roots: set[str] = set()
102
+ raw_roots: set[str] = set()
94
103
  for ipath in file_list:
95
- base_h5 = find_dataset_base_h5(ipath)
96
- dataset_dir = (
97
- os.path.dirname(base_h5) if base_h5 else os.path.dirname(ipath)
98
- )
99
- # Mirror the parent sample folder too, so sidecar files next to
100
- # dataset folders are preserved (e.g. RAW_DATA/<sample>/*.h5).
101
- mirror_roots.add(os.path.dirname(dataset_dir))
104
+ raw_root = self._find_raw_root(ipath)
105
+ if raw_root:
106
+ raw_roots.add(raw_root)
102
107
 
103
- for src_dir in sorted(mirror_roots):
108
+ copy_tasks: list[tuple[str, str]] = []
109
+ for src_dir in sorted(raw_roots):
104
110
  try:
105
111
  dst_dir = resolve_mirror_path(src_dir)
106
112
  except ValueError:
@@ -123,7 +129,22 @@ class CompressorManager:
123
129
  # Do not copy raw files that will be produced by compression.
124
130
  continue
125
131
  dst_file = os.path.join(target_cur, fname)
126
- shutil.copy2(src_file, dst_file)
132
+ copy_tasks.append((src_file, dst_file))
133
+
134
+ if not copy_tasks:
135
+ return
136
+
137
+ max_workers = min(len(copy_tasks), max(1, get_available_cpus()), 8)
138
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
139
+ futures = {
140
+ executor.submit(shutil.copy2, s, d): (s, d) for s, d in copy_tasks
141
+ }
142
+ for fut in as_completed(futures):
143
+ src_file, dst_file = futures[fut]
144
+ try:
145
+ fut.result()
146
+ except Exception as e:
147
+ print(f"WARNING: Failed to copy '{src_file}' → '{dst_file}': {e}")
127
148
 
128
149
  def compress_files(self, file_list: list[str]) -> None:
129
150
  """
@@ -54,8 +54,7 @@ class JP2KCompressor:
54
54
  )
55
55
 
56
56
  def _compress_3d(self, name: str, src_dset: h5py.Dataset, dst_grp: h5py.Group):
57
- data = src_dset[()]
58
- Z, Y, X = data.shape
57
+ Z, Y, X = src_dset.shape
59
58
 
60
59
  dst_dset = dst_grp.create_dataset(
61
60
  name,
@@ -70,7 +69,8 @@ class JP2KCompressor:
70
69
  t0 = time.perf_counter()
71
70
 
72
71
  for z in range(Z):
73
- plane = data[z, :, :]
72
+ # Read one slice at a time to reduce peak RAM usage.
73
+ plane = src_dset[z, :, :]
74
74
  t1 = time.perf_counter()
75
75
  b2im = blosc2.asarray(
76
76
  plane[np.newaxis, ...],
@@ -190,6 +190,9 @@ def test_compress_mirror_layout_creates_under_raw_data_compressed(
190
190
  base.write_text("base")
191
191
  sample_sidecar = tmp_path / "RAW_DATA" / "sampleA" / "sample_sidecar.h5"
192
192
  sample_sidecar.write_text("sidecar")
193
+ other_sample_sidecar = tmp_path / "RAW_DATA" / "sampleB" / "other_sidecar.h5"
194
+ other_sample_sidecar.parent.mkdir(parents=True)
195
+ other_sample_sidecar.write_text("other")
193
196
  side = ds / "scan0002" / "meta.txt"
194
197
  side.parent.mkdir(parents=True)
195
198
  side.write_text("meta")
@@ -226,6 +229,7 @@ def test_compress_mirror_layout_creates_under_raw_data_compressed(
226
229
  tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "scan0002" / "meta.txt"
227
230
  ).exists()
228
231
  assert (tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "sample_sidecar.h5").exists()
232
+ assert (tmp_path / "RAW_DATA_COMPRESSED" / "sampleB" / "other_sidecar.h5").exists()
229
233
 
230
234
 
231
235
  def test_overwrite_final_deletes_backups(argv_runner, monkeypatch, capsys, tmp_path):
@@ -3,6 +3,54 @@ from pathlib import Path
3
3
  import re
4
4
 
5
5
 
6
+ def _parse_slurm_cpus_env() -> int | None:
7
+ """
8
+ Return CPU count from SLURM env vars if available.
9
+ """
10
+ candidates = [
11
+ ("SLURM_CPUS_PER_TASK", None),
12
+ ("SLURM_CPUS_ON_NODE", None),
13
+ ("SLURM_JOB_CPUS_PER_NODE", "1"),
14
+ ("SLURM_TASKS_PER_NODE", None),
15
+ ]
16
+ for key, fallback in candidates:
17
+ val = os.environ.get(key)
18
+ if not val:
19
+ continue
20
+ if key == "SLURM_JOB_CPUS_PER_NODE":
21
+ # Formats like "32(x2)" or "32,32" or "32"
22
+ val = val.split(",")[0]
23
+ if "(x" in val:
24
+ val = val.split("(x", 1)[0]
25
+ if key == "SLURM_TASKS_PER_NODE":
26
+ # Often like "1" or "2(x3)"
27
+ if "(x" in val:
28
+ val = val.split("(x", 1)[0]
29
+ try:
30
+ n = int(val)
31
+ if n > 0:
32
+ return n
33
+ except ValueError:
34
+ if fallback is not None:
35
+ try:
36
+ n = int(fallback)
37
+ if n > 0:
38
+ return n
39
+ except ValueError:
40
+ pass
41
+ return None
42
+
43
+
44
+ def get_available_cpus() -> int:
45
+ """
46
+ Use SLURM-provided CPU count when available; otherwise fall back to os.cpu_count().
47
+ """
48
+ slurm = _parse_slurm_cpus_env()
49
+ if slurm is not None:
50
+ return slurm
51
+ return os.cpu_count() or 1
52
+
53
+
6
54
  def resolve_mirror_path(
7
55
  input_path: str,
8
56
  *,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: esrf-data-compressor
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: A library to compress ESRF data and reduce their footprint
5
5
  Author-email: ESRF <dau-pydev@esrf.fr>
6
6
  License: MIT License
@@ -79,8 +79,8 @@ Dynamic: license-file
79
79
 
80
80
  * **Parallel execution**
81
81
 
82
- * Automatically factors CPU cores into worker processes × per-process threads
83
- * By default, each worker runs up to 4 Blosc2 threads (or falls back to 1 thread if < 4 cores)
82
+ * Automatically factors CPU cores into worker processes × per-process threads
83
+ * By default, each worker runs up to 2 Blosc2 threads (or falls back to 1 thread if < 2 cores)
84
84
 
85
85
  * **Non-destructive workflow**
86
86