esrf-data-compressor 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {esrf_data_compressor-0.1.0/src/esrf_data_compressor.egg-info → esrf_data_compressor-0.1.2}/PKG-INFO +6 -6
  2. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/README.md +5 -5
  3. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/pyproject.toml +1 -1
  4. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/cli.py +36 -17
  5. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/compressors/base.py +67 -21
  6. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/finder/finder.py +71 -41
  7. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/tests/test_cli.py +40 -4
  8. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2/src/esrf_data_compressor.egg-info}/PKG-INFO +6 -6
  9. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/LICENSE +0 -0
  10. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/setup.cfg +0 -0
  11. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/__init__.py +0 -0
  12. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/checker/run_check.py +0 -0
  13. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/checker/ssim.py +0 -0
  14. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/compressors/__init__.py +0 -0
  15. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/compressors/jp2k.py +0 -0
  16. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/tests/__init__.py +0 -0
  17. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/tests/test_finder.py +0 -0
  18. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/tests/test_hdf5_helpers.py +0 -0
  19. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/tests/test_jp2k.py +0 -0
  20. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/tests/test_run_check.py +0 -0
  21. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/tests/test_ssim.py +0 -0
  22. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/tests/test_utils.py +0 -0
  23. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/utils/hdf5_helpers.py +0 -0
  24. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/utils/utils.py +0 -0
  25. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor.egg-info/SOURCES.txt +0 -0
  26. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor.egg-info/dependency_links.txt +0 -0
  27. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor.egg-info/entry_points.txt +0 -0
  28. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor.egg-info/requires.txt +0 -0
  29. {esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor.egg-info/top_level.txt +0 -0

{esrf_data_compressor-0.1.0/src/esrf_data_compressor.egg-info → esrf_data_compressor-0.1.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: esrf-data-compressor
-Version: 0.1.0
+Version: 0.1.2
 Summary: A library to compress ESRF data and reduce their footprint
 Author-email: ESRF <dau-pydev@esrf.fr>
 License: MIT License
@@ -90,10 +90,10 @@ Dynamic: license-file
 
 * **Four simple CLI subcommands**
 
-  * `list`  Show all raw HDF5 files to be processed
-  * `compress` Generate compressed siblings
-  * `check`  Produce a per-dataset SSIM report between raw & compressed
-  * `overwrite` Atomically replace each raw frame file (irreversible)
+  * `compress-hdf5 list`  Show all raw HDF5 files to be processed
+  * `compress-hdf5 compress` Generate compressed siblings
+  * `compress-hdf5 check`  Produce a per-dataset SSIM report between raw & compressed
+  * `compress-hdf5 overwrite` Atomically replace each raw frame file (irreversible)
 
 ---
 
@@ -177,7 +177,7 @@ All noteworthy changes are recorded in [CHANGELOG.md](CHANGELOG.md). Version 0.1
 
 * Initial implementation of Blosc2 + Grok (JPEG2000) compression for 3D HDF5 datasets.
 * SSIM-based integrity check (first & last slice).
-* Four-command CLI (`list`, `compress`, `check`, `overwrite`).
+* Four-command CLI (`compress-hdf5 list`, `compress-hdf5 compress`, `compress-hdf5 check`, `compress-hdf5 overwrite`).
 * Parallelism with worker×thread auto-factoring.
 
 For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).

{esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/README.md

@@ -29,10 +29,10 @@
 
 * **Four simple CLI subcommands**
 
-  * `list`  Show all raw HDF5 files to be processed
-  * `compress` Generate compressed siblings
-  * `check`  Produce a per-dataset SSIM report between raw & compressed
-  * `overwrite` Atomically replace each raw frame file (irreversible)
+  * `compress-hdf5 list`  Show all raw HDF5 files to be processed
+  * `compress-hdf5 compress` Generate compressed siblings
+  * `compress-hdf5 check`  Produce a per-dataset SSIM report between raw & compressed
+  * `compress-hdf5 overwrite` Atomically replace each raw frame file (irreversible)
 
 ---
 
@@ -116,7 +116,7 @@ All noteworthy changes are recorded in [CHANGELOG.md](CHANGELOG.md). Version 0.1
 
 * Initial implementation of Blosc2 + Grok (JPEG2000) compression for 3D HDF5 datasets.
 * SSIM-based integrity check (first & last slice).
-* Four-command CLI (`list`, `compress`, `check`, `overwrite`).
+* Four-command CLI (`compress-hdf5 list`, `compress-hdf5 compress`, `compress-hdf5 check`, `compress-hdf5 overwrite`).
 * Parallelism with worker×thread auto-factoring.
 
 For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).

{esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "esrf-data-compressor"
-version = "0.1.0"
+version = "0.1.2"
 authors = [{ name = "ESRF", email = "dau-pydev@esrf.fr" }]
 description = "A library to compress ESRF data and reduce their footprint"
 readme = { file = "README.md", content-type = "text/markdown" }

{esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/cli.py

@@ -46,7 +46,7 @@ def do_compress(args):
         exit_with_error(f"Failed to read report '{report}': {e}")
 
     if not files:
-        print("Nothing to compress (TO COMPRESS list is empty).")
+        print("Nothing to compress (TO COMPRESS list is empty).")
         return
 
     print(
@@ -65,10 +65,9 @@ def do_check(args):
         exit_with_error(f"Failed to read report '{report}': {e}")
 
     if not files:
-        print("Nothing to check (TO COMPRESS list is empty).")
+        print("Nothing to check (TO COMPRESS list is empty).")
         return
 
-    # We reuse run_ssim_check in its 3‑arg form (raw_files, method, report_path)
     report_fname = f"{os.path.splitext(report)[0]}_{args.method}_ssim_report.txt"
     report_path = os.path.abspath(report_fname)
 
@@ -81,9 +80,6 @@
 
 
 def do_overwrite(args):
-    """
-    Overwrite TO COMPRESS files with their original sources.
-    """
     report = args.input or "file_list.txt"
     try:
         files = parse_report(report)
@@ -91,13 +87,26 @@ def do_overwrite(args):
         exit_with_error(f"Failed to read report '{report}': {e}")
 
     if not files:
-        print("Nothing to overwrite (TO COMPRESS list is empty).")
+        print("Nothing to process (TO COMPRESS list is empty).")
        return
 
-    print(f"Overwriting {len(files)} file(s) from '{report}' …")
     mgr = CompressorManager()
+
+    if args.final:
+        print(f"Finalizing overwrite for {len(files)} file(s) from '{report}' …")
+        mgr.remove_backups(files)
+        print("Finalize step complete.\n")
+        return
+
+    if args.undo:
+        print(f"Undoing overwrite for {len(files)} file(s) from '{report}' …")
+        mgr.restore_backups(files)
+        print("Undo step complete.\n")
+        return
+
+    print(f"Overwriting {len(files)} file(s) from '{report}' …")
     mgr.overwrite_files(files)
-    print("Overwrite complete.\n")
+    print("Overwrite complete (backups kept).\n")
 
 
 def main():
@@ -106,7 +115,6 @@
     )
     sub = parser.add_subparsers(dest="command", required=True)
 
-    # list
     p = sub.add_parser("list", help="Report VDS sources → TO COMPRESS vs REMAINING")
     p.add_argument("experiment", help="Experiment ID")
     p.add_argument("beamline", nargs="?", help="Optional beamline")
@@ -115,13 +123,12 @@
     p.add_argument(
         "--filter",
         metavar="KEY:VAL[,KEY2:VAL2...]",
-        help="Datasetlevel attribute substring filters",
+        help="Dataset-level attribute substring filters",
     )
     p.add_argument("--output", help="Report file (default = file_list.txt)")
     p.set_defaults(func=do_list)
 
-    # compress
-    p = sub.add_parser("compress", help="Compress only the TO COMPRESS files")
+    p = sub.add_parser("compress", help="Compress only the TO COMPRESS files")
     p.add_argument(
         "--input",
         "-i",
@@ -137,8 +144,7 @@
     )
     p.set_defaults(func=do_compress)
 
-    # check
-    p = sub.add_parser("check", help="Generate SSIM report for TO COMPRESS files")
+    p = sub.add_parser("check", help="Generate SSIM report for TO COMPRESS files")
     p.add_argument(
         "--input", "-i", help="Report file to read (default = file_list.txt)"
     )
@@ -147,11 +153,24 @@
     )
     p.set_defaults(func=do_check)
 
-    # overwrite
-    p = sub.add_parser("overwrite", help="Overwrite only TO COMPRESS files")
+    p = sub.add_parser(
+        "overwrite",
+        help="Swap in compressed files and keep backups; with --final or --undo, perform cleanup/restore only.",
+    )
     p.add_argument(
         "--input", "-i", help="Report file to read (default = file_list.txt)"
     )
+    group = p.add_mutually_exclusive_group()
+    group.add_argument(
+        "--final",
+        action="store_true",
+        help="Cleanup only: delete existing *.h5.bak backups after confirmation (no overwrite).",
+    )
+    group.add_argument(
+        "--undo",
+        action="store_true",
+        help="Restore only: move <file>.h5.bak back to <file>.h5 and preserve the current file as <file>_<method>.h5 when needed.",
+    )
    p.set_defaults(func=do_overwrite)
 
    args = parser.parse_args()
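
Review note: `--final` and `--undo` are wired through an `argparse` mutually exclusive group, so combining them fails at parse time rather than inside `do_overwrite`. A minimal, runnable sketch of that pattern (illustration only, not the packaged parser):

```python
import argparse

# Minimal sketch of the new flag wiring: --final and --undo cannot be combined.
parser = argparse.ArgumentParser(prog="compress-hdf5")
sub = parser.add_subparsers(dest="command", required=True)
p = sub.add_parser("overwrite")
p.add_argument("--input", "-i")
group = p.add_mutually_exclusive_group()
group.add_argument("--final", action="store_true")
group.add_argument("--undo", action="store_true")

print(parser.parse_args(["overwrite", "--undo"]))
# -> Namespace(command='overwrite', final=False, input=None, undo=True)
# parser.parse_args(["overwrite", "--undo", "--final"]) exits with
# "argument --final: not allowed with argument --undo"
```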

{esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/compressors/base.py

@@ -34,24 +34,14 @@ class CompressorManager:
         self, workers: int | None = None, cratio: int = 10, method: str = "jp2k"
     ):
         total_cores = os.cpu_count() or 1
-
-        # Determine default threads per worker (4, or fewer if total_cores < 4)
-        if total_cores >= 4:
-            default_nthreads = 4
-        else:
-            default_nthreads = 1
-
-        # Default worker count
+        default_nthreads = 4 if total_cores >= 4 else 1
         default_workers = max(1, total_cores // default_nthreads)
 
         if workers is None:
-            # Use default workers and default_nthreads
             w = default_workers
             nthreads = default_nthreads
         else:
-            # Cap workers to total_cores
             w = min(workers, total_cores)
-            # Recompute threads per worker so that (w * nthreads) ≤ total_cores, up to 4
             possible = total_cores // w
             nthreads = min(possible, 4) if possible >= 1 else 1

@@ -60,7 +50,6 @@
         self.cratio = cratio
         self.method = method
 
-        # Instantiate compressor based on method
         if self.method == "jp2k":
             self.compressor = JP2KCompressorWrapper(
                 cratio=cratio, nthreads=self.nthreads
@@ -122,8 +111,7 @@
         except Exception as e:
             print(f"Failed to compress '{pth}': {e}")
 
-        t1 = time.time()
-        elapsed = t1 - t0
+        elapsed = time.time() - t0
         total_mb = total_bytes / (1024 * 1024)
         rate_mb_s = total_mb / elapsed if elapsed > 0 else float("inf")
         print(f"\nTotal elapsed time: {elapsed:.3f}s")
@@ -138,7 +126,6 @@
 
         After processing all files, removes the backup .h5.bak files.
         """
-        backups = []
         for ipath in file_list:
             if not ipath.lower().endswith(".h5"):
                 continue
@@ -151,17 +138,76 @@
                 try:
                     os.replace(ipath, backup)
                     os.replace(compressed_path, ipath)
-                    backups.append(backup)
                     print(f"Overwritten '{ipath}' (backup at '{backup}').")
                 except Exception as e:
                     print(f"ERROR overwriting '{ipath}': {e}")
             else:
                 print(f"SKIP (no compressed file): {ipath}")
 
-        # Remove all backup files
-        for backup in backups:
+    def remove_backups(self, file_list: list[str]) -> None:
+        candidates = {p + ".bak" for p in file_list if p.lower().endswith(".h5")}
+        backups = [b for b in candidates if os.path.exists(b)]
+        if not backups:
+            print("No backup files to remove.")
+            return
+
+        total_bytes = 0
+        for b in backups:
+            try:
+                total_bytes += os.path.getsize(b)
+            except OSError:
+                pass
+        total_mb = total_bytes / (1024 * 1024)
+
+        print(
+            f"About to remove {len(backups)} backup file(s), ~{total_mb:.2f} MB total."
+        )
+        ans = input("Proceed? [y/N]: ").strip().lower()
+        if ans not in ("y", "yes"):
+            print("Backups kept.")
+            return
+
+        removed = 0
+        for b in backups:
             try:
-                os.remove(backup)
-                print(f"Deleted backup '{backup}'.")
+                os.remove(b)
+                removed += 1
             except Exception as e:
-                print(f"ERROR deleting backup '{backup}': {e}")
+                print(f"ERROR deleting backup '{b}': {e}")
+
+        print(f"Deleted {removed} backup file(s).")
+
+    def restore_backups(self, file_list: list[str]) -> None:
+        restored = 0
+        preserved = 0
+        for ipath in file_list:
+            if not ipath.lower().endswith(".h5"):
+                continue
+
+            base, _ = os.path.splitext(ipath)
+            backup = ipath + ".bak"
+            method_path = f"{base}_{self.method}.h5"
+
+            if not os.path.exists(backup):
+                print(f"SKIP (no backup): {ipath}")
+                continue
+
+            if os.path.exists(ipath) and not os.path.exists(method_path):
+                try:
+                    os.replace(ipath, method_path)
+                    preserved += 1
+                    print(f"Preserved current file to '{method_path}'.")
+                except Exception as e:
+                    print(f"ERROR preserving current '{ipath}' to '{method_path}': {e}")
+                    continue
+
+            try:
+                os.replace(backup, ipath)
+                restored += 1
+                print(f"Restored '{ipath}' from backup.")
+            except Exception as e:
+                print(f"ERROR restoring '{ipath}' from '{backup}': {e}")
+
+        print(
+            f"Restore complete. Restored: {restored}, preserved compressed copies: {preserved}."
+        )
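
Review note: overwrite and undo are each a pair of `os.replace` swaps, so both directions stay atomic per file on the same filesystem. A round-trip sketch with plain files (illustration of the semantics only, not the `CompressorManager` API):

```python
import os
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as d:
    raw = Path(d, "scan.h5")
    comp = Path(d, "scan_jp2k.h5")
    bak = Path(d, "scan.h5.bak")
    raw.write_text("RAW")
    comp.write_text("JP2K")

    # overwrite: keep the original as .bak, swap the compressed file in
    os.replace(raw, bak)
    os.replace(comp, raw)
    assert raw.read_text() == "JP2K" and bak.exists()

    # undo: preserve the compressed copy, then restore the original
    os.replace(raw, comp)
    os.replace(bak, raw)
    assert raw.read_text() == "RAW" and comp.read_text() == "JP2K"
    # --final would simply os.remove(bak) after confirmation
```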

{esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/finder/finder.py

@@ -3,7 +3,19 @@ import sys
 import re
 import h5py
 import h5py.h5d as h5d
-from typing import List, Tuple, Optional
+from typing import List, Tuple, Optional, Set, Dict
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+try:
+    from tqdm.auto import tqdm
+except Exception:
+
+    def tqdm(iterable=None, **kwargs):
+        return iterable if iterable is not None else range(0)
+
+
+JP2K_FILTER_ID = 32026
+DATASET_CHECK_PATH = "/entry_0000/measurement/data"
 
 
 def discover_datasets(path_components: List[str], base_root: str) -> List[str]:
@@ -43,8 +55,27 @@ def discover_datasets(path_components: List[str], base_root: str) -> List[str]:
     return sorted(datasets)
 
 
+def _file_has_jp2k_filter(file_path: str) -> bool:
+    try:
+        with h5py.File(file_path, "r", locking=False) as src:
+            obj = src.get(DATASET_CHECK_PATH)
+            if not isinstance(obj, h5py.Dataset):
+                return False
+            plist = obj.id.get_create_plist()
+            for j in range(plist.get_nfilters()):
+                if plist.get_filter(j)[0] == JP2K_FILTER_ID:
+                    return True
+            return False
+    except Exception:
+        return False
+
+
 def find_vds_files(
-    path_components: List[str], base_root: str, filter_expr: Optional[str]
+    path_components: List[str],
+    base_root: str,
+    filter_expr: Optional[str],
+    *,
+    max_workers: Optional[int] = None,
 ) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
     """
     Discover each dataset HDF5, then for each top-level group (e.g. "1.1"):
@@ -59,7 +90,6 @@
 
     Returns two lists of (vds_source_path, reason).
     """
-    # parse filter tokens
     filters: List[Tuple[List[str], str]] = []
     if filter_expr:
         for tok in filter_expr.split(","):
@@ -77,13 +107,16 @@
 
     datasets = discover_datasets(path_components, base_root)
 
-    for cont_path in datasets:
-        with h5py.File(cont_path, "r") as f:
+    unique_files: Set[str] = set()
+    occurrences: List[Tuple[str, bool, str]] = []
+
+    for cont_path in tqdm(datasets, desc="Scan datasets", unit="file"):
+        cont_dir = os.path.dirname(cont_path)
+        with h5py.File(cont_path, "r", locking=False) as f:
             for grp_name, grp in f.items():
                 if not isinstance(grp, h5py.Group):
                     continue
 
-                # determine if group matches filter criteria
                 group_matched = False
                 reason = ""
                 for parts, desired in filters:
@@ -102,55 +135,52 @@
                 if not filters:
                     reason = f"{grp_name}/<no filter>"
 
-                # harvest VDS sources under this group, detecting existing compression
                 def visitor(name, obj):
                     if not isinstance(obj, h5py.Dataset):
                         return
                     plist = obj.id.get_create_plist()
                     if plist.get_layout() != h5d.VIRTUAL:
                         return
-
                    for i in range(plist.get_virtual_count()):
                        fn = plist.get_virtual_filename(i)
+                        if isinstance(fn, bytes):
+                            fn = fn.decode("utf-8", "ignore")
                        if not os.path.isabs(fn):
-                            fn = os.path.abspath(
-                                os.path.join(os.path.dirname(cont_path), fn)
-                            )
-
-                        # inspect file for Blosc2/Grok (JP2K) compression filter
-                        already_compressed = False
-                        try:
-                            with h5py.File(fn, "r") as src:
-                                comp_flag = [False]
-
-                                def _check(name2, obj2):
-                                    if isinstance(obj2, h5py.Dataset):
-                                        plist2 = obj2.id.get_create_plist()
-                                        for j in range(plist2.get_nfilters()):
-                                            fid = plist2.get_filter(j)[0]
-                                            if fid == 32026:
-                                                comp_flag[0] = True
-                                                return
-
-                                src.visititems(_check)
-                                already_compressed = comp_flag[0]
-                        except Exception:
-                            already_compressed = False
-
-                        if already_compressed:
-                            rem_reason = f"{grp_name}/<already compressed>"
-                            remaining.append((fn, rem_reason))
-                        else:
-                            if group_matched:
-                                to_compress.append((fn, reason))
-                            else:
-                                remaining.append((fn, reason))
+                            fn = os.path.abspath(os.path.join(cont_dir, fn))
+                        unique_files.add(fn)
+                        occurrences.append((fn, group_matched, reason))
 
                 grp.visititems(visitor)
 
-    if not to_compress and not remaining:
+    if not occurrences:
         sys.exit(f"ERROR: No VDS sources found under {base_root}/{path_components}")
 
+    fps = sorted(unique_files)
+    if max_workers is None:
+        cpu = os.cpu_count() or 8
+        max_workers = min(16, max(4, cpu))
+
+    compressed_map: Dict[str, bool] = {}
+    with ProcessPoolExecutor(max_workers=max_workers) as ex:
+        futs = {ex.submit(_file_has_jp2k_filter, fp): fp for fp in fps}
+        for fut in tqdm(
+            as_completed(futs), total=len(fps), desc="Check compression", unit="file"
+        ):
+            fp = futs[fut]
+            try:
+                compressed_map[fp] = bool(fut.result())
+            except Exception:
+                compressed_map[fp] = False
+
+    for fp, matched, reason in occurrences:
+        if compressed_map.get(fp, False):
+            remaining.append((fp, "<already compressed>"))
+        else:
+            if matched:
+                to_compress.append((fp, reason))
+            else:
+                remaining.append((fp, reason))
+
     return to_compress, remaining

{esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2}/src/esrf_data_compressor/tests/test_cli.py

@@ -119,12 +119,14 @@ def test_commands_with_non_empty_list(
     for f in files:
         comp = tmp_path / f.replace(".h5", "_jp2k.h5")
         assert comp.exists()
-    # For overwrite, verify original replaced and backup removed
+    # For overwrite, verify original replaced and backup KEPT
     if cmd == "overwrite":
         # f1 was overwritten, f2 was skipped
         assert (tmp_path / "f1.h5").exists()
-        # no backup remains
-        assert not (tmp_path / "f1.h5.bak").exists()
+        # backup remains by default
+        assert (tmp_path / "f1.h5.bak").exists()
+        # f2 had no compressed sibling → no backup
+        assert not (tmp_path / "f2.h5.bak").exists()
 
 
 def test_list_success_and_output_file(argv_runner, monkeypatch, capsys, tmp_path):
@@ -150,7 +152,7 @@ def test_list_success_and_output_file(argv_runner, monkeypatch, capsys, tmp_path
     [
         ("compress", "Nothing to compress"),
         ("check", "Nothing to check"),
-        ("overwrite", "Nothing to overwrite"),
+        ("overwrite", "Nothing to process"),
     ],
 )
 def test_empty_reports(argv_runner, monkeypatch, capsys, cmd, empty_msg, tmp_path):
@@ -174,3 +176,37 @@ def test_check_success_writes_report(argv_runner, monkeypatch, capsys, tmp_path)
     argv_runner(["check", "-i", str(report), "--method", "jp2k"])
     out = capsys.readouterr().out
     assert "SSIM report written to" in out
+
+
+def test_overwrite_final_deletes_backups(argv_runner, monkeypatch, capsys, tmp_path):
+    # Prepare a file and its backup
+    (tmp_path / "f1.h5").write_text("current")
+    (tmp_path / "f1.h5.bak").write_text("backup")
+    # parse_report returns the original .h5 path(s)
+    monkeypatch.setattr(cli, "parse_report", lambda rpt: [str(tmp_path / "f1.h5")])
+    # auto-confirm deletion
+    monkeypatch.setattr("builtins.input", lambda *a, **k: "y")
+
+    argv_runner(["overwrite", "-i", "report.txt", "--final"])
+    out = capsys.readouterr().out
+    assert "About to remove" in out
+    assert not (tmp_path / "f1.h5.bak").exists()
+
+
+def test_overwrite_undo_restores_and_preserves(
+    argv_runner, monkeypatch, capsys, tmp_path
+):
+    # Start with current file and a backup; no <method> file yet
+    (tmp_path / "f1.h5").write_text("CUR")
+    (tmp_path / "f1.h5.bak").write_text("BAK")
+    monkeypatch.setattr(cli, "parse_report", lambda rpt: [str(tmp_path / "f1.h5")])
+
+    argv_runner(["overwrite", "-i", "report.txt", "--undo"])
+    out = capsys.readouterr().out
+    assert "Undoing overwrite" in out
+    # Backup should have been restored to f1.h5
+    assert (tmp_path / "f1.h5").read_text() == "BAK"
+    # Previous current should have been preserved as f1_jp2k.h5
+    assert (tmp_path / "f1_jp2k.h5").read_text() == "CUR"
+    # .bak should be gone after restore (moved)
+    assert not (tmp_path / "f1.h5.bak").exists()

{esrf_data_compressor-0.1.0 → esrf_data_compressor-0.1.2/src/esrf_data_compressor.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: esrf-data-compressor
-Version: 0.1.0
+Version: 0.1.2
 Summary: A library to compress ESRF data and reduce their footprint
 Author-email: ESRF <dau-pydev@esrf.fr>
 License: MIT License
@@ -90,10 +90,10 @@ Dynamic: license-file
 
 * **Four simple CLI subcommands**
 
-  * `list`  Show all raw HDF5 files to be processed
-  * `compress` Generate compressed siblings
-  * `check`  Produce a per-dataset SSIM report between raw & compressed
-  * `overwrite` Atomically replace each raw frame file (irreversible)
+  * `compress-hdf5 list`  Show all raw HDF5 files to be processed
+  * `compress-hdf5 compress` Generate compressed siblings
+  * `compress-hdf5 check`  Produce a per-dataset SSIM report between raw & compressed
+  * `compress-hdf5 overwrite` Atomically replace each raw frame file (irreversible)
 
 ---
 
@@ -177,7 +177,7 @@ All noteworthy changes are recorded in [CHANGELOG.md](CHANGELOG.md). Version 0.1
 
 * Initial implementation of Blosc2 + Grok (JPEG2000) compression for 3D HDF5 datasets.
 * SSIM-based integrity check (first & last slice).
-* Four-command CLI (`list`, `compress`, `check`, `overwrite`).
+* Four-command CLI (`compress-hdf5 list`, `compress-hdf5 compress`, `compress-hdf5 check`, `compress-hdf5 overwrite`).
 * Parallelism with worker×thread auto-factoring.
 
 For more details, see the full history in [CHANGELOG.md](CHANGELOG.md).