esrf-data-compressor 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {esrf_data_compressor-0.2.0/src/esrf_data_compressor.egg-info → esrf_data_compressor-0.2.1}/PKG-INFO +3 -3
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/README.md +2 -2
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/pyproject.toml +1 -1
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/checker/run_check.py +2 -2
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/base.py +38 -17
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/jp2k.py +3 -3
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_cli.py +4 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/utils/paths.py +48 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1/src/esrf_data_compressor.egg-info}/PKG-INFO +3 -3
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/LICENSE +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/setup.cfg +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/__init__.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/checker/ssim.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/cli.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/__init__.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/finder/finder.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/__init__.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_finder.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_hdf5_helpers.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_jp2k.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_paths.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_run_check.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_ssim.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_utils.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/utils/hdf5_helpers.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/utils/utils.py +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/SOURCES.txt +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/dependency_links.txt +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/entry_points.txt +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/requires.txt +0 -0
- {esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/top_level.txt +0 -0
{esrf_data_compressor-0.2.0/src/esrf_data_compressor.egg-info → esrf_data_compressor-0.2.1}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: esrf-data-compressor
-Version: 0.2.0
+Version: 0.2.1
 Summary: A library to compress ESRF data and reduce their footprint
 Author-email: ESRF <dau-pydev@esrf.fr>
 License: MIT License
@@ -79,8 +79,8 @@ Dynamic: license-file
 
 * **Parallel execution**
 
-
-
+* Automatically factors CPU cores into worker processes × per-process threads
+* By default, each worker runs up to 2 Blosc2 threads (or falls back to 1 thread if < 2 cores)
 
 * **Non-destructive workflow**
 
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/README.md
RENAMED
@@ -18,8 +18,8 @@
 
 * **Parallel execution**
 
-
-
+* Automatically factors CPU cores into worker processes × per-process threads
+* By default, each worker runs up to 2 Blosc2 threads (or falls back to 1 thread if < 2 cores)
 
 * **Non-destructive workflow**
 
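The two new README bullets describe how the compressor splits the CPU budget into worker processes and per-process Blosc2 threads. As a rough illustration of that default split, here is a standalone sketch of the same arithmetic that lands in compressors/base.py below (illustrative helper, not part of the package's API):

# Sketch only: mirrors the default worker/thread split described in the README bullets.
def default_split(total_cores: int) -> tuple[int, int]:
    threads_per_worker = 2 if total_cores >= 2 else 1    # up to 2 Blosc2 threads per worker
    workers = max(1, total_cores // threads_per_worker)  # remaining cores become worker processes
    return workers, threads_per_worker

print(default_split(8))  # (4, 2): 4 worker processes x 2 Blosc2 threads each
print(default_split(1))  # (1, 1): a single core falls back to 1 thread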
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/pyproject.toml
RENAMED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "esrf-data-compressor"
-version = "0.2.0"
+version = "0.2.1"
 authors = [{ name = "ESRF", email = "dau-pydev@esrf.fr" }]
 description = "A library to compress ESRF data and reduce their footprint"
 readme = { file = "README.md", content-type = "text/markdown" }
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/checker/run_check.py
RENAMED
@@ -3,7 +3,7 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
 from tqdm import tqdm
 
 from esrf_data_compressor.checker.ssim import compute_ssim_for_file_pair
-from esrf_data_compressor.utils.paths import resolve_compressed_path
+from esrf_data_compressor.utils.paths import get_available_cpus, resolve_compressed_path
 
 
 def run_ssim_check(
@@ -46,7 +46,7 @@ def run_ssim_check(
         return
 
     # run SSIM in parallel
-    n_workers = min(len(to_check),
+    n_workers = min(len(to_check), get_available_cpus())
     with ProcessPoolExecutor(max_workers=n_workers) as exe:
         futures = {
             exe.submit(compute_ssim_for_file_pair, orig, comp): (orig, comp)
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/base.py
RENAMED
@@ -1,11 +1,12 @@
 import os
 import shutil
-from
+from pathlib import Path
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
 from tqdm import tqdm
 
 from esrf_data_compressor.compressors.jp2k import JP2KCompressorWrapper
 from esrf_data_compressor.utils.paths import (
-
+    get_available_cpus,
     resolve_compressed_path,
     resolve_mirror_path,
 )
@@ -24,11 +25,11 @@ class CompressorManager:
     """
     Manages parallel compression and overwrite.
 
-    Each worker process is given up to
+    Each worker process is given up to 2 Blosc2 threads (or fewer if the machine
     has fewer than 4 cores). The number of worker processes is then
     total_cores // threads_per_worker (at least 1). If the user explicitly
     passes `workers`, we cap it to `total_cores`, then recompute threads_per_worker
-    = min(
+    = min(2, total_cores // workers).
 
     Usage:
         mgr = CompressorManager(cratio=10, method='jp2k')
@@ -43,8 +44,8 @@ class CompressorManager:
         method: str = "jp2k",
         layout: str = "sibling",
     ):
-        total_cores =
-        default_nthreads =
+        total_cores = get_available_cpus()
+        default_nthreads = 2 if total_cores >= 2 else 1
         default_workers = max(1, total_cores // default_nthreads)
 
         if workers is None:
@@ -53,7 +54,7 @@ class CompressorManager:
         else:
             w = min(workers, total_cores)
             possible = total_cores // w
-            nthreads = min(possible,
+            nthreads = min(possible, 2) if possible >= 1 else 1
 
         self.workers = max(1, w)
         self.nthreads = max(1, nthreads)
@@ -75,6 +76,14 @@ class CompressorManager:
         print(f"Threads per worker: {self.nthreads}")
         print(f"Total threads: {self.workers * self.nthreads}")
 
+    @staticmethod
+    def _find_raw_root(path: str) -> str | None:
+        p = Path(os.path.abspath(path))
+        parts = p.parts
+        if "RAW_DATA" not in parts:
+            return None
+        return str(Path(*parts[: parts.index("RAW_DATA") + 1]))
+
     def _compress_worker(self, ipath: str) -> tuple[str, str]:
         """
         Worker function for ProcessPoolExecutor: compress a single HDF5:
@@ -90,17 +99,14 @@ class CompressorManager:
 
     def _mirror_non_compressed_dataset_content(self, file_list: list[str]) -> None:
         source_targets = {os.path.realpath(p) for p in file_list}
-
+        raw_roots: set[str] = set()
         for ipath in file_list:
-
-
-
-            )
-            # Mirror the parent sample folder too, so sidecar files next to
-            # dataset folders are preserved (e.g. RAW_DATA/<sample>/*.h5).
-            mirror_roots.add(os.path.dirname(dataset_dir))
+            raw_root = self._find_raw_root(ipath)
+            if raw_root:
+                raw_roots.add(raw_root)
 
-
+        copy_tasks: list[tuple[str, str]] = []
+        for src_dir in sorted(raw_roots):
             try:
                 dst_dir = resolve_mirror_path(src_dir)
             except ValueError:
@@ -123,7 +129,22 @@
                 # Do not copy raw files that will be produced by compression.
                 continue
             dst_file = os.path.join(target_cur, fname)
-
+            copy_tasks.append((src_file, dst_file))
+
+        if not copy_tasks:
+            return
+
+        max_workers = min(len(copy_tasks), max(1, get_available_cpus()), 8)
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = {
+                executor.submit(shutil.copy2, s, d): (s, d) for s, d in copy_tasks
+            }
+            for fut in as_completed(futures):
+                src_file, dst_file = futures[fut]
+                try:
+                    fut.result()
+                except Exception as e:
+                    print(f"WARNING: Failed to copy '{src_file}' → '{dst_file}': {e}")
 
     def compress_files(self, file_list: list[str]) -> None:
         """
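The updated CompressorManager docstring also spells out the case where the caller passes `workers` explicitly: the request is capped to the available cores and the per-worker thread count is recomputed. A worked sketch of that rule follows (illustrative helper with example values, not the class itself):

# Sketch of the explicit-`workers` arithmetic from CompressorManager.__init__.
def split_for_requested_workers(total_cores: int, workers: int) -> tuple[int, int]:
    w = min(workers, total_cores)    # never exceed the available cores
    possible = total_cores // w      # cores left over per worker process
    nthreads = min(possible, 2) if possible >= 1 else 1
    return max(1, w), max(1, nthreads)

print(split_for_requested_workers(8, 2))   # (2, 2): 2 workers, 2 Blosc2 threads each
print(split_for_requested_workers(8, 8))   # (8, 1): one thread per worker
print(split_for_requested_workers(4, 16))  # (4, 1): request capped at the 4 available cores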
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/jp2k.py
RENAMED
@@ -54,8 +54,7 @@ class JP2KCompressor:
         )
 
     def _compress_3d(self, name: str, src_dset: h5py.Dataset, dst_grp: h5py.Group):
-
-        Z, Y, X = data.shape
+        Z, Y, X = src_dset.shape
 
         dst_dset = dst_grp.create_dataset(
             name,
@@ -70,7 +69,8 @@ class JP2KCompressor:
         t0 = time.perf_counter()
 
         for z in range(Z):
-
+            # Read one slice at a time to reduce peak RAM usage.
+            plane = src_dset[z, :, :]
             t1 = time.perf_counter()
             b2im = blosc2.asarray(
                 plane[np.newaxis, ...],
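The jp2k change reads one (Y, X) plane per loop iteration instead of materialising the whole (Z, Y, X) volume up front, so peak memory stays at a single slice. A self-contained sketch of that h5py access pattern (file and dataset names are made up for illustration):

import h5py
import numpy as np

# Hypothetical file/dataset names; only the per-slice read pattern matters here.
with h5py.File("volume.h5", "r") as f:
    dset = f["/entry/data"]              # shaped (Z, Y, X)
    Z, Y, X = dset.shape
    running_max = np.zeros((Y, X), dtype=dset.dtype)
    for z in range(Z):
        plane = dset[z, :, :]            # h5py reads only this slice from disk
        np.maximum(running_max, plane, out=running_max)
# Peak memory is one (Y, X) plane plus the accumulator, rather than the
# full Z*Y*X array that dset[...] would load at once.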
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_cli.py
RENAMED
@@ -190,6 +190,9 @@ def test_compress_mirror_layout_creates_under_raw_data_compressed(
     base.write_text("base")
     sample_sidecar = tmp_path / "RAW_DATA" / "sampleA" / "sample_sidecar.h5"
     sample_sidecar.write_text("sidecar")
+    other_sample_sidecar = tmp_path / "RAW_DATA" / "sampleB" / "other_sidecar.h5"
+    other_sample_sidecar.parent.mkdir(parents=True)
+    other_sample_sidecar.write_text("other")
     side = ds / "scan0002" / "meta.txt"
     side.parent.mkdir(parents=True)
     side.write_text("meta")
@@ -226,6 +229,7 @@ def test_compress_mirror_layout_creates_under_raw_data_compressed(
         tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "ds1" / "scan0002" / "meta.txt"
     ).exists()
     assert (tmp_path / "RAW_DATA_COMPRESSED" / "sampleA" / "sample_sidecar.h5").exists()
+    assert (tmp_path / "RAW_DATA_COMPRESSED" / "sampleB" / "other_sidecar.h5").exists()
 
 
 def test_overwrite_final_deletes_backups(argv_runner, monkeypatch, capsys, tmp_path):
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/utils/paths.py
RENAMED
@@ -3,6 +3,54 @@ from pathlib import Path
 import re
 
 
+def _parse_slurm_cpus_env() -> int | None:
+    """
+    Return CPU count from SLURM env vars if available.
+    """
+    candidates = [
+        ("SLURM_CPUS_PER_TASK", None),
+        ("SLURM_CPUS_ON_NODE", None),
+        ("SLURM_JOB_CPUS_PER_NODE", "1"),
+        ("SLURM_TASKS_PER_NODE", None),
+    ]
+    for key, fallback in candidates:
+        val = os.environ.get(key)
+        if not val:
+            continue
+        if key == "SLURM_JOB_CPUS_PER_NODE":
+            # Formats like "32(x2)" or "32,32" or "32"
+            val = val.split(",")[0]
+            if "(x" in val:
+                val = val.split("(x", 1)[0]
+        if key == "SLURM_TASKS_PER_NODE":
+            # Often like "1" or "2(x3)"
+            if "(x" in val:
+                val = val.split("(x", 1)[0]
+        try:
+            n = int(val)
+            if n > 0:
+                return n
+        except ValueError:
+            if fallback is not None:
+                try:
+                    n = int(fallback)
+                    if n > 0:
+                        return n
+                except ValueError:
+                    pass
+    return None
+
+
+def get_available_cpus() -> int:
+    """
+    Use SLURM-provided CPU count when available; otherwise fall back to os.cpu_count().
+    """
+    slurm = _parse_slurm_cpus_env()
+    if slurm is not None:
+        return slurm
+    return os.cpu_count() or 1
+
+
 def resolve_mirror_path(
     input_path: str,
     *,
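The new get_available_cpus() prefers SLURM's CPU allocation over os.cpu_count(), and _parse_slurm_cpus_env() normalises the string formats SLURM uses (e.g. "32", "32(x2)", "32,32"). A quick usage sketch, assuming the helpers are imported from esrf_data_compressor.utils.paths as added in this release and that no other SLURM variables are already set in the environment:

import os
from esrf_data_compressor.utils.paths import get_available_cpus

# SLURM_JOB_CPUS_PER_NODE may carry a repeat suffix; the parser keeps the
# first node's count and strips "(xN)".
os.environ["SLURM_JOB_CPUS_PER_NODE"] = "32(x2)"
print(get_available_cpus())   # 32

del os.environ["SLURM_JOB_CPUS_PER_NODE"]
print(get_available_cpus())   # falls back to os.cpu_count() (or 1)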
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1/src/esrf_data_compressor.egg-info}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: esrf-data-compressor
-Version: 0.2.0
+Version: 0.2.1
 Summary: A library to compress ESRF data and reduce their footprint
 Author-email: ESRF <dau-pydev@esrf.fr>
 License: MIT License
@@ -79,8 +79,8 @@ Dynamic: license-file
 
 * **Parallel execution**
 
-
-
+* Automatically factors CPU cores into worker processes × per-process threads
+* By default, each worker runs up to 2 Blosc2 threads (or falls back to 1 thread if < 2 cores)
 
 * **Non-destructive workflow**
 
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/LICENSE
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/setup.cfg
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/__init__.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/checker/ssim.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/cli.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/compressors/__init__.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/finder/finder.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/__init__.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_finder.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_hdf5_helpers.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_jp2k.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_paths.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_run_check.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_ssim.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/tests/test_utils.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/utils/hdf5_helpers.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor/utils/utils.py
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/SOURCES.txt
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/dependency_links.txt
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/entry_points.txt
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/requires.txt
RENAMED
File without changes
{esrf_data_compressor-0.2.0 → esrf_data_compressor-0.2.1}/src/esrf_data_compressor.egg-info/top_level.txt
RENAMED
File without changes