esrf-data-compressor 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- esrf_data_compressor-0.1.0/LICENSE +20 -0
- esrf_data_compressor-0.1.0/PKG-INFO +183 -0
- esrf_data_compressor-0.1.0/README.md +122 -0
- esrf_data_compressor-0.1.0/pyproject.toml +72 -0
- esrf_data_compressor-0.1.0/setup.cfg +4 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/__init__.py +0 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/checker/run_check.py +76 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/checker/ssim.py +87 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/cli.py +162 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/compressors/__init__.py +0 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/compressors/base.py +167 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/compressors/jp2k.py +149 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/finder/finder.py +173 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/tests/__init__.py +0 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/tests/test_cli.py +176 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/tests/test_finder.py +70 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/tests/test_hdf5_helpers.py +9 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/tests/test_jp2k.py +87 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/tests/test_run_check.py +107 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/tests/test_ssim.py +106 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/tests/test_utils.py +64 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/utils/hdf5_helpers.py +18 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor/utils/utils.py +34 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor.egg-info/PKG-INFO +183 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor.egg-info/SOURCES.txt +27 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor.egg-info/dependency_links.txt +1 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor.egg-info/entry_points.txt +2 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor.egg-info/requires.txt +20 -0
- esrf_data_compressor-0.1.0/src/esrf_data_compressor.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
**Copyright (c) 2025 European Synchrotron Radiation Facility**
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
|
7
|
+
the Software without restriction, including without limitation the rights to
|
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
|
9
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
|
10
|
+
subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
|
17
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
18
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
|
19
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
20
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: esrf-data-compressor
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A library to compress ESRF data and reduce their footprint
|
|
5
|
+
Author-email: ESRF <dau-pydev@esrf.fr>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
**Copyright (c) 2025 European Synchrotron Radiation Facility**
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
11
|
+
this software and associated documentation files (the "Software"), to deal in
|
|
12
|
+
the Software without restriction, including without limitation the rights to
|
|
13
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
|
14
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
|
15
|
+
subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
|
22
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
23
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
|
24
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
25
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
26
|
+
|
|
27
|
+
Project-URL: Homepage, https://gitlab.esrf.fr/dau/esrf-data-compressor
|
|
28
|
+
Project-URL: Documentation, https://esrf-data-compressor.readthedocs.io/
|
|
29
|
+
Project-URL: Repository, https://gitlab.esrf.fr/dau/esrf-data-compressor
|
|
30
|
+
Project-URL: Issues, https://gitlab.esrf.fr/dau/esrf-data-compressor/issues
|
|
31
|
+
Project-URL: Changelog, https://gitlab.esrf.fr/dau/esrf-data-compressor/-/blob/main/CHANGELOG.md
|
|
32
|
+
Keywords: ESRF,pathlib
|
|
33
|
+
Classifier: Intended Audience :: Science/Research
|
|
34
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
35
|
+
Classifier: Programming Language :: Python :: 3
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
40
|
+
Requires-Python: >=3.10
|
|
41
|
+
Description-Content-Type: text/markdown
|
|
42
|
+
License-File: LICENSE
|
|
43
|
+
Requires-Dist: h5py
|
|
44
|
+
Requires-Dist: hdf5plugin
|
|
45
|
+
Requires-Dist: blosc2-grok
|
|
46
|
+
Requires-Dist: scikit-image
|
|
47
|
+
Requires-Dist: tqdm
|
|
48
|
+
Provides-Extra: test
|
|
49
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
50
|
+
Provides-Extra: dev
|
|
51
|
+
Requires-Dist: esrf-data-compressor[test]; extra == "dev"
|
|
52
|
+
Requires-Dist: black>=22; extra == "dev"
|
|
53
|
+
Requires-Dist: flake8>=4.0; extra == "dev"
|
|
54
|
+
Requires-Dist: ruff; extra == "dev"
|
|
55
|
+
Provides-Extra: doc
|
|
56
|
+
Requires-Dist: sphinx>=6.0; extra == "doc"
|
|
57
|
+
Requires-Dist: sphinxcontrib-mermaid>=0.7; extra == "doc"
|
|
58
|
+
Requires-Dist: sphinx-autodoc-typehints>=1.16; extra == "doc"
|
|
59
|
+
Requires-Dist: pydata-sphinx-theme; extra == "doc"
|
|
60
|
+
Dynamic: license-file
|
|
61
|
+
|
|
62
|
+
# ESRF Data Compressor
|
|
63
|
+
|
|
64
|
+
**ESRF Data Compressor** is a command-line tool and Python library designed to compress large ESRF HDF5 datasets (3D volumes) and verify data consistency via SSIM. The default compression backend uses Blosc2 + Grok (JPEG2000).
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## Features
|
|
69
|
+
|
|
70
|
+
* **Discover raw HDF5 dataset files** under an experiment’s `RAW_DATA`
|
|
71
|
+
|
|
72
|
+
* Goes through the HDF5 Virtual Datasets to find the data to compress
|
|
73
|
+
* Allows to filter down scan by scan based on the value of a key
|
|
74
|
+
|
|
75
|
+
* **Slice-by-slice compression**
|
|
76
|
+
|
|
77
|
+
* Uses Blosc2 + Grok (JPEG2000) on every slice of each 3D dataset (axis 0)
|
|
78
|
+
* User-configurable compression ratio (e.g. `--cratio 10`)
|
|
79
|
+
|
|
80
|
+
* **Parallel execution**
|
|
81
|
+
|
|
82
|
+
* Automatically factors CPU cores into worker processes × per-process threads
|
|
83
|
+
* By default, each worker runs up to 4 Blosc2 threads (or falls back to 1 thread if < 4 cores)
|
|
84
|
+
|
|
85
|
+
* **Non-destructive workflow**
|
|
86
|
+
|
|
87
|
+
1. `compress` writes a sibling file `<basename>_<compression_method>.h5` next to each original
|
|
88
|
+
2. `check` computes SSIM (first and last frames) and writes a report
|
|
89
|
+
3. `overwrite` (optional) swaps out the raw frame file (irreversible)
|
|
90
|
+
|
|
91
|
+
* **Four simple CLI subcommands**
|
|
92
|
+
|
|
93
|
+
* `list` Show all raw HDF5 files to be processed
|
|
94
|
+
* `compress` Generate compressed siblings
|
|
95
|
+
* `check` Produce a per-dataset SSIM report between raw & compressed
|
|
96
|
+
* `overwrite` Atomically replace each raw frame file (irreversible)
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## Installation
|
|
101
|
+
|
|
102
|
+
### From PyPI
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
pip install esrf-data-compressor
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Once installed, the `compress-hdf5` command will be available in your `PATH`.
|
|
109
|
+
|
|
110
|
+
### From Source (for development)
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
git clone https://gitlab.esrf.fr/dau/esrf-data-compressor.git
|
|
114
|
+
cd esrf-data-compressor
|
|
115
|
+
|
|
116
|
+
# (Optional) Create & activate a virtual environment
|
|
117
|
+
python -m venv venv
|
|
118
|
+
source venv/bin/activate
|
|
119
|
+
|
|
120
|
+
# Install build dependencies & the package itself
|
|
121
|
+
pip install .
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## Documentation
|
|
127
|
+
|
|
128
|
+
Full documentation is available online:
|
|
129
|
+
[ESRF Data Compressor Docs](https://esrf-data-compressor.readthedocs.io/en/latest/index.html)
|
|
130
|
+
|
|
131
|
+
## Contributing & Development
|
|
132
|
+
|
|
133
|
+
* **Clone** the repository:
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
git clone https://gitlab.esrf.fr/dau/esrf-data-compressor.git
|
|
137
|
+
cd esrf-data-compressor
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
* **Install** dependencies (in a virtual environment):
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
python -m venv venv
|
|
144
|
+
source venv/bin/activate
|
|
145
|
+
pip install -e ".[dev]"
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
* **Run tests** with coverage:
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
pytest -v --cov=esrf_data_compressor --cov-report=term-missing
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
* **Style:**
|
|
155
|
+
|
|
156
|
+
* `black .`
|
|
157
|
+
* `flake8 .`
|
|
158
|
+
* `ruff .`
|
|
159
|
+
|
|
160
|
+
* **Build docs** (Sphinx + pydata theme):
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
sphinx-build doc build/html
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## License
|
|
169
|
+
|
|
170
|
+
This project is licensed under the [MIT License](LICENSE). See `LICENSE` for full text.
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## Changelog
|
|
175
|
+
|
|
176
|
+
All noteworthy changes are recorded in [CHANGELOG.md](CHANGELOG.md). Version 0.1.0 marks the first public release with:
|
|
177
|
+
|
|
178
|
+
* Initial implementation of Blosc2 + Grok (JPEG2000) compression for 3D HDF5 datasets.
|
|
179
|
+
* SSIM-based integrity check (first & last slice).
|
|
180
|
+
* Four-command CLI (`list`, `compress`, `check`, `overwrite`).
|
|
181
|
+
* Parallelism with worker×thread auto-factoring.
|
|
182
|
+
|
|
183
|
+
For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# ESRF Data Compressor
|
|
2
|
+
|
|
3
|
+
**ESRF Data Compressor** is a command-line tool and Python library designed to compress large ESRF HDF5 datasets (3D volumes) and verify data consistency via SSIM. The default compression backend uses Blosc2 + Grok (JPEG2000).
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
* **Discover raw HDF5 dataset files** under an experiment’s `RAW_DATA`
|
|
10
|
+
|
|
11
|
+
* Goes through the HDF5 Virtual Datasets to find the data to compress
|
|
12
|
+
* Allows to filter down scan by scan based on the value of a key
|
|
13
|
+
|
|
14
|
+
* **Slice-by-slice compression**
|
|
15
|
+
|
|
16
|
+
* Uses Blosc2 + Grok (JPEG2000) on every slice of each 3D dataset (axis 0)
|
|
17
|
+
* User-configurable compression ratio (e.g. `--cratio 10`)
|
|
18
|
+
|
|
19
|
+
* **Parallel execution**
|
|
20
|
+
|
|
21
|
+
* Automatically factors CPU cores into worker processes × per-process threads
|
|
22
|
+
* By default, each worker runs up to 4 Blosc2 threads (or falls back to 1 thread if < 4 cores)
|
|
23
|
+
|
|
24
|
+
* **Non-destructive workflow**
|
|
25
|
+
|
|
26
|
+
1. `compress` writes a sibling file `<basename>_<compression_method>.h5` next to each original
|
|
27
|
+
2. `check` computes SSIM (first and last frames) and writes a report
|
|
28
|
+
3. `overwrite` (optional) swaps out the raw frame file (irreversible)
|
|
29
|
+
|
|
30
|
+
* **Four simple CLI subcommands**
|
|
31
|
+
|
|
32
|
+
* `list` Show all raw HDF5 files to be processed
|
|
33
|
+
* `compress` Generate compressed siblings
|
|
34
|
+
* `check` Produce a per-dataset SSIM report between raw & compressed
|
|
35
|
+
* `overwrite` Atomically replace each raw frame file (irreversible)
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
### From PyPI
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install esrf-data-compressor
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Once installed, the `compress-hdf5` command will be available in your `PATH`.
|
|
48
|
+
|
|
49
|
+
### From Source (for development)
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
git clone https://gitlab.esrf.fr/dau/esrf-data-compressor.git
|
|
53
|
+
cd esrf-data-compressor
|
|
54
|
+
|
|
55
|
+
# (Optional) Create & activate a virtual environment
|
|
56
|
+
python -m venv venv
|
|
57
|
+
source venv/bin/activate
|
|
58
|
+
|
|
59
|
+
# Install build dependencies & the package itself
|
|
60
|
+
pip install .
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Documentation
|
|
66
|
+
|
|
67
|
+
Full documentation is available online:
|
|
68
|
+
[ESRF Data Compressor Docs](https://esrf-data-compressor.readthedocs.io/en/latest/index.html)
|
|
69
|
+
|
|
70
|
+
## Contributing & Development
|
|
71
|
+
|
|
72
|
+
* **Clone** the repository:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
git clone https://gitlab.esrf.fr/dau/esrf-data-compressor.git
|
|
76
|
+
cd esrf-data-compressor
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
* **Install** dependencies (in a virtual environment):
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
python -m venv venv
|
|
83
|
+
source venv/bin/activate
|
|
84
|
+
pip install -e ".[dev]"
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
* **Run tests** with coverage:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
pytest -v --cov=esrf_data_compressor --cov-report=term-missing
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
* **Style:**
|
|
94
|
+
|
|
95
|
+
* `black .`
|
|
96
|
+
* `flake8 .`
|
|
97
|
+
* `ruff .`
|
|
98
|
+
|
|
99
|
+
* **Build docs** (Sphinx + pydata theme):
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
sphinx-build doc build/html
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## License
|
|
108
|
+
|
|
109
|
+
This project is licensed under the [MIT License](LICENSE). See `LICENSE` for full text.
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Changelog
|
|
114
|
+
|
|
115
|
+
All noteworthy changes are recorded in [CHANGELOG.md](CHANGELOG.md). Version 0.1.0 marks the first public release with:
|
|
116
|
+
|
|
117
|
+
* Initial implementation of Blosc2 + Grok (JPEG2000) compression for 3D HDF5 datasets.
|
|
118
|
+
* SSIM-based integrity check (first & last slice).
|
|
119
|
+
* Four-command CLI (`list`, `compress`, `check`, `overwrite`).
|
|
120
|
+
* Parallelism with worker×thread auto-factoring.
|
|
121
|
+
|
|
122
|
+
For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "esrf-data-compressor"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
authors = [{ name = "ESRF", email = "dau-pydev@esrf.fr" }]
|
|
9
|
+
description = "A library to compress ESRF data and reduce their footprint"
|
|
10
|
+
readme = { file = "README.md", content-type = "text/markdown" }
|
|
11
|
+
license = { file = "LICENSE" }
|
|
12
|
+
keywords = ["ESRF", "pathlib"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Intended Audience :: Science/Research",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.10",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Programming Language :: Python :: 3.13",
|
|
21
|
+
]
|
|
22
|
+
requires-python = ">=3.10" #because of blosc2
|
|
23
|
+
dependencies = [
|
|
24
|
+
"h5py",
|
|
25
|
+
"hdf5plugin",
|
|
26
|
+
"blosc2-grok",
|
|
27
|
+
"scikit-image",
|
|
28
|
+
"tqdm"
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Homepage = "https://gitlab.esrf.fr/dau/esrf-data-compressor"
|
|
33
|
+
Documentation = "https://esrf-data-compressor.readthedocs.io/"
|
|
34
|
+
Repository = "https://gitlab.esrf.fr/dau/esrf-data-compressor"
|
|
35
|
+
Issues = "https://gitlab.esrf.fr/dau/esrf-data-compressor/issues"
|
|
36
|
+
Changelog = "https://gitlab.esrf.fr/dau/esrf-data-compressor/-/blob/main/CHANGELOG.md"
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
test = [
|
|
40
|
+
"pytest >=7.0"
|
|
41
|
+
]
|
|
42
|
+
dev = [
|
|
43
|
+
"esrf-data-compressor[test]",
|
|
44
|
+
"black >=22",
|
|
45
|
+
"flake8 >=4.0",
|
|
46
|
+
"ruff"
|
|
47
|
+
]
|
|
48
|
+
doc = [
|
|
49
|
+
"sphinx >=6.0",
|
|
50
|
+
"sphinxcontrib-mermaid >=0.7",
|
|
51
|
+
"sphinx-autodoc-typehints >=1.16",
|
|
52
|
+
"pydata-sphinx-theme"
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
[project.scripts]
|
|
56
|
+
compress-hdf5 = "esrf_data_compressor.cli:main"
|
|
57
|
+
|
|
58
|
+
[tool.setuptools]
|
|
59
|
+
package-dir = { "" = "src" }
|
|
60
|
+
|
|
61
|
+
[tool.setuptools.packages.find]
|
|
62
|
+
where = ["src"]
|
|
63
|
+
|
|
64
|
+
[tool.setuptools.package-data]
|
|
65
|
+
"*" = ["*.json", "*.ipynb"]
|
|
66
|
+
|
|
67
|
+
[tool.coverage.run]
|
|
68
|
+
omit = ["*/tests/*"]
|
|
69
|
+
|
|
70
|
+
[tool.isort]
|
|
71
|
+
profile = "black"
|
|
72
|
+
force_single_line = true
|
|
File without changes
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
3
|
+
from tqdm import tqdm
|
|
4
|
+
|
|
5
|
+
from esrf_data_compressor.checker.ssim import compute_ssim_for_file_pair
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def run_ssim_check(raw_files: list[str], method: str, report_path: str) -> None:
    """
    Compare every raw HDF5 file with its compressed sibling and write a report.

    Each entry of ``raw_files`` is expected to have a sibling file named
    ``<stem>_<method>.h5`` in the same directory.  Files without such a
    sibling are listed first in the report under
    '=== NOT COMPRESSED FILES ==='.  For each existing pair, per-dataset
    SSIM values are computed in a process pool and appended under
    '=== <stem> ===' sections together with the full paths of both files.
    """
    pairs: list[tuple[str, str]] = []
    uncompressed: list[str] = []

    # Partition the inputs into (raw, compressed) pairs vs. files whose
    # compressed sibling does not exist on disk.
    for raw_path in raw_files:
        folder, fname = os.path.split(raw_path)
        stem = os.path.splitext(fname)[0]
        sibling = os.path.join(folder, f"{stem}_{method}.h5")
        if os.path.exists(sibling):
            pairs.append((raw_path, sibling))
        else:
            uncompressed.append(raw_path)
    print(
        f"Found {len(pairs)} file pairs to check, {len(uncompressed)} missing compressed files."
    )

    with open(report_path, "w") as rpt:
        if uncompressed:
            rpt.write("=== NOT COMPRESSED FILES ===\n")
            for raw_path in uncompressed:
                rpt.write(f"{raw_path} :: NO COMPRESSED DATASET FOUND\n")
            rpt.write("\n")

        if not pairs:
            rpt.write("No file pairs to check (no compressed siblings found).\n")
            return

        # One worker per pair, capped by the number of available CPUs.
        workers = min(len(pairs), os.cpu_count() or 1)
        with ProcessPoolExecutor(max_workers=workers) as pool:
            future_map = {
                pool.submit(compute_ssim_for_file_pair, raw, comp): (raw, comp)
                for raw, comp in pairs
            }

            for fut in tqdm(
                as_completed(future_map),
                total=len(future_map),
                desc="Checking SSIM (files)",
                unit="file",
            ):
                raw, comp = future_map[fut]
                raw_name = os.path.basename(raw)
                comp_name = os.path.basename(comp)
                tqdm.write(f"Checking file: {raw_name} ↔ {comp_name}")
                try:
                    basename, lines = fut.result()
                    # Section header, the two full file paths, then the
                    # per-dataset SSIM lines produced by the worker.
                    rpt.write(f"=== {basename} ===\n")
                    rpt.write(f"Uncompressed file: {raw}\n")
                    rpt.write(f"Compressed file: {comp}\n")
                    for line in lines:
                        rpt.write(line + "\n")
                    rpt.write("\n")
                except Exception as e:
                    rpt.write(f"{raw} :: ERROR processing file pair: {e}\n\n")
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# src/esrf_data_compressor/checker/ssim.py
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import numpy as np
|
|
5
|
+
import h5py
|
|
6
|
+
from skimage.metrics import structural_similarity as ssim
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _select_win_size(H: int, W: int) -> int:
|
|
10
|
+
"""
|
|
11
|
+
Choose an odd, valid window size for SSIM given slice dimensions H×W.
|
|
12
|
+
win_size = min(H, W, 7), made odd, at least 3.
|
|
13
|
+
"""
|
|
14
|
+
win = min(H, W, 7)
|
|
15
|
+
if win % 2 == 0:
|
|
16
|
+
win -= 1
|
|
17
|
+
return max(win, 3)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def compute_ssim_for_dataset_pair(
    orig_path: str, comp_path: str, dataset_relpath: str
) -> tuple[float, float]:
    """
    Compute SSIM between the first and last slices of one 3D dataset pair.

    Opens both HDF5 files, reads the dataset at ``dataset_relpath``
    (e.g. 'entry_0000/ESRF-ID11/marana/data') from each, and compares
    slice 0 and slice Z-1 along axis 0.

    A slice whose original values are all equal is reported as SSIM = 1.0.

    :raises IndexError: if either dataset is not three-dimensional.
    :returns: (ssim_first, ssim_last)
    """
    with h5py.File(orig_path, "r") as fo, h5py.File(comp_path, "r") as fc:
        ref_ds = fo[dataset_relpath]
        cmp_ds = fc[dataset_relpath]

        # Both sides must be 3D volumes before slicing along axis 0.
        if ref_ds.ndim != 3 or cmp_ds.ndim != 3:
            raise IndexError(
                f"Dataset '{dataset_relpath}' is not 3D (orig: {ref_ds.ndim}D, comp: {cmp_ds.ndim}D)"
            )

        ref_first = ref_ds[0].astype(np.float64)
        ref_last = ref_ds[-1].astype(np.float64)
        cmp_first = cmp_ds[0].astype(np.float64)
        cmp_last = cmp_ds[-1].astype(np.float64)

        H, W = ref_first.shape
        win = _select_win_size(H, W)

        def _pair_ssim(a: np.ndarray, b: np.ndarray) -> float:
            # A constant original slice has zero data range; by contract it
            # is treated as a perfect match rather than handed to skimage.
            lo, hi = a.min(), a.max()
            if hi == lo:
                return 1.0
            return ssim(a, b, data_range=hi - lo, win_size=win)

        return _pair_ssim(ref_first, cmp_first), _pair_ssim(ref_last, cmp_last)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def compute_ssim_for_file_pair(orig_path: str, comp_path: str) -> tuple[str, list[str]]:
    """
    Run the SSIM check on every 3D dataset of a raw/compressed file pair.

    Walks the raw file to collect the relative paths of all 3D datasets,
    then computes first/last-slice SSIM for each against the compressed
    file.

    :returns: (basename of the raw file, report lines); each line is either
        "<dataset_relpath>: SSIM_first=… SSIM_last=…" or an error message.
    """
    basename = os.path.basename(orig_path)
    report_lines: list[str] = []

    # Collect the relative paths of every 3D dataset in the raw file.
    targets: list[str] = []
    with h5py.File(orig_path, "r") as fo:

        def _collect(name, obj):
            if isinstance(obj, h5py.Dataset) and obj.ndim == 3:
                targets.append(name)

        fo.visititems(_collect)

    if not targets:
        report_lines.append(f"No 3D datasets found in {basename}")
        return basename, report_lines

    for ds in targets:
        try:
            s0, s1 = compute_ssim_for_dataset_pair(orig_path, comp_path, ds)
            report_lines.append(f"{ds}: SSIM_first={s0:.4f} SSIM_last={s1:.4f}")
        except Exception as e:
            report_lines.append(f"{ds}: ERROR computing SSIM: {e}")

    return basename, report_lines
|