esrf-data-compressor 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {esrf_data_compressor-0.2.0/src/esrf_data_compressor.egg-info → esrf_data_compressor-0.2.1}/PKG-INFO +3 -3
  2. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/README.md +2 -2
  3. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/pyproject.toml +1 -1
  4. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/checker/run_check.py +2 -2
  5. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/base.py +38 -17
  6. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/jp2k.py +3 -3
  7. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_cli.py +4 -0
  8. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/utils/paths.py +48 -0
  9. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1/src/esrf_data_compressor.egg-info}/PKG-INFO +3 -3
  10. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/LICENSE +0 -0
  11. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/setup.cfg +0 -0
  12. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/__init__.py +0 -0
  13. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/checker/ssim.py +0 -0
  14. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/cli.py +0 -0
  15. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/__init__.py +0 -0
  16. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/finder/finder.py +0 -0
  17. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/__init__.py +0 -0
  18. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_finder.py +0 -0
  19. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_hdf5_helpers.py +0 -0
  20. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_jp2k.py +0 -0
  21. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_paths.py +0 -0
  22. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_run_check.py +0 -0
  23. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_ssim.py +0 -0
  24. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_utils.py +0 -0
  25. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/utils/hdf5_helpers.py +0 -0
  26. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/utils/utils.py +0 -0
  27. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/SOURCES.txt +0 -0
  28. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/dependency_links.txt +0 -0
  29. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/entry_points.txt +0 -0
  30. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/requires.txt +0 -0
  31. {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: esrf-data-compressor
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: A library to compress ESRF data and reduce their footprint
5
5
  Author-email: ESRF <dau-pydev@esrf.fr>
6
6
  License: MIT License
@@ -79,8 +79,8 @@ Dynamic: license-file
79
79
 
80
80
  * **Parallel execution**
81
81
 
82
- * Automatically factors CPU cores into worker processes × per-process threads
83
- * By default, each worker runs up to 4 Blosc2 threads (or falls back to 1 thread if < 4 cores)
82
+ * Automatically factors CPU cores into worker processes × per-process threads
83
+ * By default, each worker runs up to 2 Blosc2 threads (or falls back to 1 thread if < 2 cores)
84
84
 
85
85
  * **Non-destructive workflow**
86
86
 
@@ -18,8 +18,8 @@
18
18
 
19
19
  * **Parallel execution**
20
20
 
21
- * Automatically factors CPU cores into worker processes × per-process threads
22
- * By default, each worker runs up to 4 Blosc2 threads (or falls back to 1 thread if < 4 cores)
21
+ * Automatically factors CPU cores into worker processes × per-process threads
22
+ * By default, each worker runs up to 2 Blosc2 threads (or falls back to 1 thread if < 2 cores)
23
23
 
24
24
  * **Non-destructive workflow**
25
25
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "esrf-data-compressor"
7
- version = "0.2.0"
7
+ version = "0.2.1"
8
8
  authors = [{ name = "ESRF", email = "dau-pydev@esrf.fr" }]
9
9
  description = "A library to compress ESRF data and reduce their footprint"
10
10
  readme = { file = "README.md", content-type = "text/markdown" }
@@ -3,7 +3,7 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
3
3
  from tqdm import tqdm
4
4
 
5
5
  from esrf_data_compressor.checker.ssim import compute_ssim_for_file_pair
6
- from esrf_data_compressor.utils.paths import resolve_compressed_path
6
+ from esrf_data_compressor.utils.paths import get_available_cpus, resolve_compressed_path
7
7
 
8
8
 
9
9
  def run_ssim_check(
@@ -46,7 +46,7 @@ def run_ssim_check(
46
46
  return
47
47
 
48
48
  # run SSIM in parallel
49
- n_workers = min(len(to_check), os.cpu_count() or 1)
49
+ n_workers = min(len(to_check), get_available_cpus())
50
50
  with ProcessPoolExecutor(max_workers=n_workers) as exe:
51
51
  futures = {
52
52
  exe.submit(compute_ssim_for_file_pair, orig, comp): (orig, comp)
@@ -1,11 +1,12 @@
1
1
  import os
2
2
  import shutil
3
- from concurrent.futures import ProcessPoolExecutor, as_completed
3
+ from pathlib import Path
4
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
4
5
  from tqdm import tqdm
5
6
 
6
7
  from esrf_data_compressor.compressors.jp2k import JP2KCompressorWrapper
7
8
  from esrf_data_compressor.utils.paths import (
8
- find_dataset_base_h5,
9
+ get_available_cpus,
9
10
  resolve_compressed_path,
10
11
  resolve_mirror_path,
11
12
  )
@@ -24,11 +25,11 @@ class CompressorManager:
24
25
  """
25
26
  Manages parallel compression and overwrite.
26
27
 
27
- Each worker process is given up to 4 Blosc2 threads (or fewer if the machine
28
+ Each worker process is given up to 2 Blosc2 threads (or fewer if the machine
28
29
  has fewer than 2 cores). The number of worker processes is then
29
30
  total_cores // threads_per_worker (at least 1). If the user explicitly
30
31
  passes `workers`, we cap it to `total_cores`, then recompute threads_per_worker
31
- = min(4, total_cores // workers).
32
+ = min(2, total_cores // workers).
32
33
 
33
34
  Usage:
34
35
  mgr = CompressorManager(cratio=10, method='jp2k')
@@ -43,8 +44,8 @@ class CompressorManager:
43
44
  method: str = "jp2k",
44
45
  layout: str = "sibling",
45
46
  ):
46
- total_cores = os.cpu_count() or 1
47
- default_nthreads = 4 if total_cores >= 4 else 1
47
+ total_cores = get_available_cpus()
48
+ default_nthreads = 2 if total_cores >= 2 else 1
48
49
  default_workers = max(1, total_cores // default_nthreads)
49
50
 
50
51
  if workers is None:
@@ -53,7 +54,7 @@ class CompressorManager:
53
54
  else:
54
55
  w = min(workers, total_cores)
55
56
  possible = total_cores // w
56
- nthreads = min(possible, 4) if possible >= 1 else 1
57
+ nthreads = min(possible, 2) if possible >= 1 else 1
57
58
 
58
59
  self.workers = max(1, w)
59
60
  self.nthreads = max(1, nthreads)
@@ -75,6 +76,14 @@ class CompressorManager:
75
76
  print(f"Threads per worker: {self.nthreads}")
76
77
  print(f"Total threads: {self.workers * self.nthreads}")
77
78
 
79
+ @staticmethod
80
+ def _find_raw_root(path: str) -> str | None:
81
+ p = Path(os.path.abspath(path))
82
+ parts = p.parts
83
+ if "RAW_DATA" not in parts:
84
+ return None
85
+ return str(Path(*parts[: parts.index("RAW_DATA") + 1]))
86
+
78
87
  def _compress_worker(self, ipath: str) -> tuple[str, str]:
79
88
  """
80
89
  Worker function for ProcessPoolExecutor: compress a single HDF5:
@@ -90,17 +99,14 @@ class CompressorManager:
90
99
 
91
100
  def _mirror_non_compressed_dataset_content(self, file_list: list[str]) -> None:
92
101
  source_targets = {os.path.realpath(p) for p in file_list}
93
- mirror_roots: set[str] = set()
102
+ raw_roots: set[str] = set()
94
103
  for ipath in file_list:
95
- base_h5 = find_dataset_base_h5(ipath)
96
- dataset_dir = (
97
- os.path.dirname(base_h5) if base_h5 else os.path.dirname(ipath)
98
- )
99
- # Mirror the parent sample folder too, so sidecar files next to
100
- # dataset folders are preserved (e.g. RAW_DATA/<sample>/*.h5).
101
- mirror_roots.add(os.path.dirname(dataset_dir))
104
+ raw_root = self._find_raw_root(ipath)
105
+ if raw_root:
106
+ raw_roots.add(raw_root)
102
107
 
103
- for src_dir in sorted(mirror_roots):
108
+ copy_tasks: list[tuple[str, str]] = []
109
+ for src_dir in sorted(raw_roots):
104
110
  try:
105
111
  dst_dir = resolve_mirror_path(src_dir)
106
112
  except ValueError:
@@ -123,7 +129,22 @@ class CompressorManager:
123
129
  # Do not copy raw files that will be produced by compression.
124
130
  continue
125
131
  dst_file = os.path.join(target_cur, fname)
126
- shutil.copy2(src_file, dst_file)
132
+ copy_tasks.append((src_file, dst_file))
133
+
134
+ if not copy_tasks:
135
+ return
136
+
137
+ max_workers = min(len(copy_tasks), max(1, get_available_cpus()), 8)
138
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
139
+ futures = {
140
+ executor.submit(shutil.copy2, s, d): (s, d) for s, d in copy_tasks
141
+ }
142
+ for fut in as_completed(futures):
143
+ src_file, dst_file = futures[fut]
144
+ try:
145
+ fut.result()
146
+ except Exception as e:
147
+ print(f"WARNING: Failed to copy '{src_file}' → '{dst_file}': {e}")
127
148
 
128
149
  def compress_files(self, file_list: list[str]) -> None:
129
150
  """
@@ -54,8 +54,7 @@ class JP2KCompressor:
54
54
  )
55
55
 
56
56
  def _compress_3d(self, name: str, src_dset: h5py.Dataset, dst_grp: h5py.Group):
57
- data = src_dset[()]
58
- Z, Y, X = data.shape
57
+ Z, Y, X = src_dset.shape
59
58
 
60
59
  dst_dset = dst_grp.create_dataset(
61
60
  name,
@@ -70,7 +69,8 @@ class JP2KCompressor:
70
69
  t0 = time.perf_counter()
71
70
 
72
71
  for z in range(Z):
73
- plane = data[z, :, :]
72
+ # Read one slice at a time to reduce peak RAM usage.
73
+ plane = src_dset[z, :, :]
74
74
  t1 = time.perf_counter()
75
75
  b2im = blosc2.asarray(
76
76
  plane[np.newaxis, ...],
@@ -190,6 +190,9 @@ def test_compress_mirror_layout_creates_under_raw_data_compressed(
190
190
  base.write_text("base")
191
191
  sample_sidecar = tmp_path / "RAW_DATA" / "sampleA" / "sample_sidecar.h5"
192
192
  sample_sidecar.write_text("sidecar")
193
+ other_sample_sidecar = tmp_path / "RAW_DATA" / "sampleB" / "other_sidecar.h5"
194
+ other_sample_sidecar.parent.mkdir(parents=True)
195
+ other_sample_sidecar.write_text("other")
193
196
  side = ds / "scan0002" / "meta.txt"
194
197
  side.parent.mkdir(parents=True)
195
198
  side.write_text("meta")
@@ -226,6 +229,7 @@ def test_compress_mirror_layout_creates_under_raw_data_compressed(
226
229
  tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "scan0002" / "meta.txt"
227
230
  ).exists()
228
231
  assert (tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "sample_sidecar.h5").exists()
232
+ assert (tmp_path / "RAW_DATA_COMPRESSED" / "sampleB" / "other_sidecar.h5").exists()
229
233
 
230
234
 
231
235
  def test_overwrite_final_deletes_backups(argv_runner, monkeypatch, capsys, tmp_path):
@@ -3,6 +3,54 @@ from pathlib import Path
3
3
  import re
4
4
 
5
5
 
6
+ def _parse_slurm_cpus_env() -> int | None:
7
+ """
8
+ Return CPU count from SLURM env vars if available.
9
+ """
10
+ candidates = [
11
+ ("SLURM_CPUS_PER_TASK", None),
12
+ ("SLURM_CPUS_ON_NODE", None),
13
+ ("SLURM_JOB_CPUS_PER_NODE", "1"),
14
+ ("SLURM_TASKS_PER_NODE", None),
15
+ ]
16
+ for key, fallback in candidates:
17
+ val = os.environ.get(key)
18
+ if not val:
19
+ continue
20
+ if key == "SLURM_JOB_CPUS_PER_NODE":
21
+ # Formats like "32(x2)" or "32,32" or "32"
22
+ val = val.split(",")[0]
23
+ if "(x" in val:
24
+ val = val.split("(x", 1)[0]
25
+ if key == "SLURM_TASKS_PER_NODE":
26
+ # Often like "1" or "2(x3)"
27
+ if "(x" in val:
28
+ val = val.split("(x", 1)[0]
29
+ try:
30
+ n = int(val)
31
+ if n > 0:
32
+ return n
33
+ except ValueError:
34
+ if fallback is not None:
35
+ try:
36
+ n = int(fallback)
37
+ if n > 0:
38
+ return n
39
+ except ValueError:
40
+ pass
41
+ return None
42
+
43
+
44
+ def get_available_cpus() -> int:
45
+ """
46
+ Use SLURM-provided CPU count when available; otherwise fall back to os.cpu_count().
47
+ """
48
+ slurm = _parse_slurm_cpus_env()
49
+ if slurm is not None:
50
+ return slurm
51
+ return os.cpu_count() or 1
52
+
53
+
6
54
  def resolve_mirror_path(
7
55
  input_path: str,
8
56
  *,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: esrf-data-compressor
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: A library to compress ESRF data and reduce their footprint
5
5
  Author-email: ESRF <dau-pydev@esrf.fr>
6
6
  License: MIT License
@@ -79,8 +79,8 @@ Dynamic: license-file
79
79
 
80
80
  * **Parallel execution**
81
81
 
82
- * Automatically factors CPU cores into worker processes × per-process threads
83
- * By default, each worker runs up to 4 Blosc2 threads (or falls back to 1 thread if < 4 cores)
82
+ * Automatically factors CPU cores into worker processes × per-process threads
83
+ * By default, each worker runs up to 2 Blosc2 threads (or falls back to 1 thread if < 2 cores)
84
84
 
85
85
  * **Non-destructive workflow**
86
86