mlarray 0.0.51__tar.gz → 0.0.52__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. {mlarray-0.0.51 → mlarray-0.0.52}/PKG-INFO +10 -2
  2. {mlarray-0.0.51 → mlarray-0.0.52}/README.md +9 -1
  3. mlarray-0.0.52/bench/.gitignore +2 -0
  4. mlarray-0.0.52/bench/README.md +56 -0
  5. mlarray-0.0.52/bench/bench_convert_nii_to_mla_random_read.py +586 -0
  6. mlarray-0.0.52/bench/bench_io_blosc2_layouts.py +1178 -0
  7. mlarray-0.0.52/bench/helper/print_mla_layouts.py +85 -0
  8. mlarray-0.0.52/docs/cli.md +34 -0
  9. mlarray-0.0.52/mlarray/blosc2_layout_strategies.py +766 -0
  10. mlarray-0.0.52/mlarray/cli.py +133 -0
  11. {mlarray-0.0.51 → mlarray-0.0.52}/mlarray/mlarray.py +62 -118
  12. {mlarray-0.0.51 → mlarray-0.0.52}/mlarray.egg-info/PKG-INFO +10 -2
  13. {mlarray-0.0.51 → mlarray-0.0.52}/mlarray.egg-info/SOURCES.txt +8 -0
  14. mlarray-0.0.52/tests/test_cli.py +139 -0
  15. {mlarray-0.0.51 → mlarray-0.0.52}/tests/test_optimization.py +38 -0
  16. mlarray-0.0.51/docs/cli.md +0 -29
  17. mlarray-0.0.51/mlarray/cli.py +0 -60
  18. {mlarray-0.0.51 → mlarray-0.0.52}/.github/workflows/workflow.yml +0 -0
  19. {mlarray-0.0.51 → mlarray-0.0.52}/.gitignore +0 -0
  20. {mlarray-0.0.51 → mlarray-0.0.52}/LICENSE +0 -0
  21. {mlarray-0.0.51 → mlarray-0.0.52}/MANIFEST.in +0 -0
  22. {mlarray-0.0.51 → mlarray-0.0.52}/assets/banner.png +0 -0
  23. {mlarray-0.0.51 → mlarray-0.0.52}/assets/banner.png~ +0 -0
  24. {mlarray-0.0.51 → mlarray-0.0.52}/docs/api.md +0 -0
  25. {mlarray-0.0.51 → mlarray-0.0.52}/docs/index.md +0 -0
  26. {mlarray-0.0.51 → mlarray-0.0.52}/docs/optimization.md +0 -0
  27. {mlarray-0.0.51 → mlarray-0.0.52}/docs/schema.md +0 -0
  28. {mlarray-0.0.51 → mlarray-0.0.52}/docs/usage.md +0 -0
  29. {mlarray-0.0.51 → mlarray-0.0.52}/docs/why.md +0 -0
  30. {mlarray-0.0.51 → mlarray-0.0.52}/examples/example_asarray.py +0 -0
  31. {mlarray-0.0.51 → mlarray-0.0.52}/examples/example_bboxes_only.py +0 -0
  32. {mlarray-0.0.51 → mlarray-0.0.52}/examples/example_channel.py +0 -0
  33. {mlarray-0.0.51 → mlarray-0.0.52}/examples/example_compress_decompress.py +0 -0
  34. {mlarray-0.0.51 → mlarray-0.0.52}/examples/example_compressed_vs_uncompressed.py +0 -0
  35. {mlarray-0.0.51 → mlarray-0.0.52}/examples/example_in_memory_constructors.py +0 -0
  36. {mlarray-0.0.51 → mlarray-0.0.52}/examples/example_metadata_only.py +0 -0
  37. {mlarray-0.0.51 → mlarray-0.0.52}/examples/example_non_spatial.py +0 -0
  38. {mlarray-0.0.51 → mlarray-0.0.52}/examples/example_open.py +0 -0
  39. {mlarray-0.0.51 → mlarray-0.0.52}/examples/example_save_load.py +0 -0
  40. {mlarray-0.0.51 → mlarray-0.0.52}/mkdocs.yml +0 -0
  41. {mlarray-0.0.51 → mlarray-0.0.52}/mlarray/__init__.py +0 -0
  42. {mlarray-0.0.51 → mlarray-0.0.52}/mlarray/meta.py +0 -0
  43. {mlarray-0.0.51 → mlarray-0.0.52}/mlarray/utils.py +0 -0
  44. {mlarray-0.0.51 → mlarray-0.0.52}/mlarray.egg-info/dependency_links.txt +0 -0
  45. {mlarray-0.0.51 → mlarray-0.0.52}/mlarray.egg-info/entry_points.txt +0 -0
  46. {mlarray-0.0.51 → mlarray-0.0.52}/mlarray.egg-info/requires.txt +0 -0
  47. {mlarray-0.0.51 → mlarray-0.0.52}/mlarray.egg-info/top_level.txt +0 -0
  48. {mlarray-0.0.51 → mlarray-0.0.52}/pyproject.toml +0 -0
  49. {mlarray-0.0.51 → mlarray-0.0.52}/setup.cfg +0 -0
  50. {mlarray-0.0.51 → mlarray-0.0.52}/tests/test_asarray.py +0 -0
  51. {mlarray-0.0.51 → mlarray-0.0.52}/tests/test_bboxes.py +0 -0
  52. {mlarray-0.0.51 → mlarray-0.0.52}/tests/test_compress_decompress.py +0 -0
  53. {mlarray-0.0.51 → mlarray-0.0.52}/tests/test_constructors.py +0 -0
  54. {mlarray-0.0.51 → mlarray-0.0.52}/tests/test_create.py +0 -0
  55. {mlarray-0.0.51 → mlarray-0.0.52}/tests/test_meta_safety.py +0 -0
  56. {mlarray-0.0.51 → mlarray-0.0.52}/tests/test_metadata.py +0 -0
  57. {mlarray-0.0.51 → mlarray-0.0.52}/tests/test_open.py +0 -0
  58. {mlarray-0.0.51 → mlarray-0.0.52}/tests/test_usage.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mlarray
3
- Version: 0.0.51
3
+ Version: 0.0.52
4
4
  Summary: Array format specialized for Machine Learning with Blosc2 backend and standardized metadata.
5
5
  Author-email: Karol Gotkowski <karol.gotkowski@dkfz.de>
6
6
  License: MIT
@@ -236,10 +236,18 @@ mlarray_header sample.mla
236
236
 
237
237
  ### mlarray_convert
238
238
 
239
- Convert a NIfTI or NRRD file to MLArray and copy metadata.
239
+ Convert between MLArray and NIfTI/NRRD files.
240
+
241
+ When converting from NIfTI/NRRD to MLArray, source metadata is copied into
242
+ `meta.source`.
243
+
244
+ When converting from MLArray to NIfTI/NRRD, only `meta.source` is copied into
245
+ the output header. Spatial metadata (`spacing`, `origin`, `direction`) is set
246
+ explicitly from `meta.spatial`.
240
247
 
241
248
  ```bash
242
249
  mlarray_convert sample.nii.gz output.mla
250
+ mlarray_convert sample.mla output.nii.gz
243
251
  ```
244
252
 
245
253
  ## Contributing
@@ -202,10 +202,18 @@ mlarray_header sample.mla
202
202
 
203
203
  ### mlarray_convert
204
204
 
205
- Convert a NIfTI or NRRD file to MLArray and copy metadata.
205
+ Convert between MLArray and NIfTI/NRRD files.
206
+
207
+ When converting from NIfTI/NRRD to MLArray, source metadata is copied into
208
+ `meta.source`.
209
+
210
+ When converting from MLArray to NIfTI/NRRD, only `meta.source` is copied into
211
+ the output header. Spatial metadata (`spacing`, `origin`, `direction`) is set
212
+ explicitly from `meta.spatial`.
206
213
 
207
214
  ```bash
208
215
  mlarray_convert sample.nii.gz output.mla
216
+ mlarray_convert sample.mla output.nii.gz
209
217
  ```
210
218
 
211
219
  ## Contributing
@@ -0,0 +1,2 @@
1
+ data/
2
+ results/
@@ -0,0 +1,56 @@
1
+ # Benchmark Scripts
2
+
3
+ This folder contains benchmarking scripts for MLArray IO/layout experiments.
4
+
5
+ ## `bench_io_blosc2_layouts.py`
6
+
7
+ Benchmarks IO throughput across:
8
+
9
+ - layout method(s) based on `comp_blosc2_params` (currently baseline copy only)
10
+ - image size tiers (`small`, `medium`, `large`, `very_large`)
11
+ - 2D / 3D / 4D-total array cases with spatial axes and an optional non-spatial axis
12
+ - multiple patch sizes (2D and 3D patch vectors)
13
+ - `MLArray.open(...)` mode/mmap combinations
14
+ - operations:
15
+ - `read_full`
16
+ - `read_patch_random`
17
+ - `write_patch_random`
18
+ - warm and cold cache runs
19
+
20
+ Outputs are printed to console and written to:
21
+
22
+ - `bench/results/bench_io_blosc2_layouts.csv`
23
+ - `bench/results/bench_io_blosc2_layouts.json`
24
+
25
+ ### Example
26
+
27
+ ```bash
28
+ python bench/bench_io_blosc2_layouts.py \
29
+ --tiers small medium \
30
+ --runs 3 \
31
+ --cache-mode both \
32
+ --nthreads 1
33
+ ```
34
+
35
+ If you hit native segfaults in Blosc2 during long runs, isolate each measured run
36
+ in a subprocess (slower, but robust):
37
+
38
+ ```bash
39
+ python bench/bench_io_blosc2_layouts.py \
40
+ --tiers small medium \
41
+ --runs 3 \
42
+ --cache-mode both \
43
+ --nthreads 1 \
44
+ --isolate-runs
45
+ ```
46
+
47
+ ### Cold cache note (Linux)
48
+
49
+ For cold-cache read measurements, the script drops the Linux page cache **after the dataset has been created on disk and immediately before measured open/read runs**.
50
+
51
+ This requires root:
52
+
53
+ - run as root, or
54
+ - run via `sudo`
55
+
56
+ If cache dropping fails, those runs are recorded with an error status in the results.
@@ -0,0 +1,586 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import os
7
+ import statistics
8
+ import time
9
+ from pathlib import Path
10
+ from typing import Any, Callable, Optional, Union
11
+
12
+ import numpy as np
13
+ from medvol import MedVol
14
+ from tqdmp import tqdmp
15
+ from tqdm import tqdm
16
+
17
+ from mlarray import MLArray
18
+
19
+ try:
20
+ from mlarray.blosc2_layout_strategies import (
21
+ comp_blosc2_params_baseline,
22
+ comp_blosc2_params_generalized,
23
+ comp_blosc2_params_spatial_only,
24
+ comp_blosc2_params_spatial_only_magnitude,
25
+ )
26
+ except ModuleNotFoundError:
27
+ from mlarray.blosc2_layout_strategies import (
28
+ comp_blosc2_params_baseline,
29
+ comp_blosc2_params_generalized,
30
+ comp_blosc2_params_spatial_only,
31
+ comp_blosc2_params_spatial_only_magnitude,
32
+ )
33
+
34
+
35
# Registry of layout strategies: keys are the names offered as --layout-algo choices; each value is called with image_size, patch_size, spatial_axis_mask and bytes_per_pixel and returns (chunk_size, block_size).
+ LAYOUT_ALGOS: dict[str, Callable[..., tuple[list[int], list[int]]]] = {
36
+ "baseline": comp_blosc2_params_baseline,
37
+ "generalized": comp_blosc2_params_generalized,
38
+ "spatial_only": comp_blosc2_params_spatial_only,
39
+ "spatial_only_magnitude": comp_blosc2_params_spatial_only_magnitude,
40
+ }
41
+
42
+
43
def parse_args() -> argparse.Namespace:
    """Define the benchmark's command-line options and parse ``sys.argv``.

    Returns:
        The populated ``argparse.Namespace``. ``--layout-algo``,
        ``--input-dir`` and ``--output-dir`` are mandatory; every other option
        has a default.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Convert all .nii.gz files in a dataset directory to .mla with a selected "
            "Blosc2 layout algorithm, then run random-read benchmarking."
        )
    )

    # One entry per option, registered in declaration order so that the
    # generated --help output keeps the same ordering.
    option_table: list[tuple[tuple[str, ...], dict[str, Any]]] = [
        (
            ("--layout-algo",),
            {
                "required": True,
                "choices": sorted(LAYOUT_ALGOS.keys()),
                "help": "Layout algorithm name.",
            },
        ),
        (
            ("--input-dir",),
            {
                "required": True,
                "type": Path,
                "help": "Input dataset directory containing .nii.gz files (searched recursively).",
            },
        ),
        (
            ("--output-dir",),
            {
                "required": True,
                "type": Path,
                "help": "Output directory for .mla files and benchmark results.",
            },
        ),
        (
            ("--patch-size",),
            {
                "type": int,
                "default": 192,
                "help": "Isotropic spatial patch size used for layout and random-read benchmarking.",
            },
        ),
        (
            ("--convert-processes",),
            {
                "type": int,
                "default": max(1, (os.cpu_count() or 1)),
                "help": "Number of processes for conversion via tqdmp.",
            },
        ),
        (
            ("--random-reads",),
            {
                "type": int,
                "default": 10000,
                "help": (
                    "Total number of random read operations. "
                    "Each operation samples one random file and reads one random patch."
                ),
            },
        ),
        (
            ("--seed",),
            {
                "type": int,
                "default": None,
                "help": "Base random seed for random-read benchmarking. If omitted, randomness is nondeterministic.",
            },
        ),
        (
            # --recreate is an alias; both flags set ``overwrite``.
            ("--overwrite", "--recreate"),
            {
                "dest": "overwrite",
                "action": "store_true",
                "help": (
                    "Force fresh conversion of all files (overwrite existing .mla files). "
                    "By default, valid existing outputs are reused."
                ),
            },
        ),
        (
            ("--mmap-mode",),
            {
                "default": "r",
                "choices": ["r", "c", "r+", "none"],
                "help": "mmap_mode used by MLArray.open during random-read benchmark.",
            },
        ),
        (
            ("--open-mode",),
            {
                "default": "r",
                "choices": ["r", "a"],
                "help": "mode used by MLArray.open during random-read benchmark.",
            },
        ),
        (
            ("--dparams-nthreads",),
            {
                "type": int,
                "default": 1,
                "help": "Blosc2 dparams nthreads for MLArray.open in read benchmark.",
            },
        ),
        (
            ("--disable-progress",),
            {
                "action": "store_true",
                "help": "Disable tqdm progress bars.",
            },
        ),
    ]
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
129
+
130
+
131
def infer_spatial_axis_mask(shape: tuple[int, ...]) -> list[bool]:
    """Return a per-axis flag list marking which axes of ``shape`` are spatial.

    2D and 3D arrays are treated as fully spatial. 4D arrays are treated as
    channel/modality-first (MedVol convention), so only the leading axis is
    non-spatial.

    Raises:
        RuntimeError: If ``shape`` has a rank other than 2, 3 or 4.
    """
    rank = len(shape)
    if rank in (2, 3):
        return [True] * rank
    if rank == 4:
        # Leading axis carries channels/modalities, the remaining three are spatial.
        return [False] + [True] * 3
    raise RuntimeError(f"Unsupported dimensionality for this script: {rank}")
141
+
142
+
143
def output_path_for_input(input_dir: Path, output_dir: Path, input_path: Path) -> Path:
    """Map an input file to its ``.mla`` destination, mirroring the directory layout.

    The path of ``input_path`` relative to ``input_dir`` is re-rooted under
    ``output_dir``. A trailing ``.nii.gz`` is replaced as a whole by ``.mla``;
    for any other name only the last suffix is swapped.

    Raises:
        ValueError: If ``input_path`` is not located under ``input_dir``
            (raised by ``Path.relative_to``).
    """
    nifti_suffix = ".nii.gz"
    relative = input_path.relative_to(input_dir)
    relative_posix = relative.as_posix()

    if not relative_posix.endswith(nifti_suffix):
        return output_dir / str(relative.with_suffix(".mla"))

    # Strip the compound suffix manually; with_suffix() would only drop ".gz".
    stem = relative_posix[: -len(nifti_suffix)]
    return output_dir / (stem + ".mla")
151
+
152
+
153
def convert_one(
    job: Union[str, dict[str, Any]],
    *,
    input_dir: str,
    output_dir: str,
    layout_algo_name: str,
    patch_size: int,
) -> dict[str, Any]:
    """Convert a single input volume to ``.mla`` using the selected layout strategy.

    Args:
        job: Either the input path as a string, or a dict with the key
            ``"input_path"`` and an optional ``"preview_print"`` flag that
            requests a one-line report of the chosen layout.
        input_dir: Root of the input dataset; used to derive the relative
            output location.
        output_dir: Root under which the ``.mla`` file is written.
        layout_algo_name: Key into ``LAYOUT_ALGOS``.
        patch_size: Isotropic patch edge length applied to every spatial axis.

    Returns:
        A result row. It always contains ``input_path``, ``output_path``,
        ``success``, ``skipped_existing``, ``validated_existing``, ``seconds``
        and ``error``. Successful rows additionally describe the array and
        the computed layout.

    Raises:
        ValueError: If the input path is not located under ``input_dir``; the
            output path is derived before the error-capturing section starts.
            Errors during loading, layout computation or saving are not
            raised but reported through ``success=False`` and ``error``.
    """
    t0 = time.perf_counter()
    if isinstance(job, dict):
        input_path = Path(job["input_path"])
        preview_print = bool(job.get("preview_print", False))
    else:
        input_path = Path(job)
        preview_print = False
    output_path = output_path_for_input(Path(input_dir), Path(output_dir), input_path)

    try:
        med = MedVol(input_path)
        array = med.array
        shape = tuple(int(v) for v in array.shape)
        spatial_axis_mask = infer_spatial_axis_mask(shape)
        spatial_ndims = sum(1 for v in spatial_axis_mask if v)
        patch = tuple(int(patch_size) for _ in range(spatial_ndims))

        layout_func = LAYOUT_ALGOS[layout_algo_name]
        chunk_size, block_size = layout_func(
            image_size=shape,
            patch_size=patch,
            spatial_axis_mask=spatial_axis_mask,
            bytes_per_pixel=array.dtype.itemsize,
        )

        output_path.parent.mkdir(parents=True, exist_ok=True)
        # patch_size=None: the layout is fixed by the explicit chunk/block sizes.
        image = MLArray(
            array=array,
            patch_size=None,
            chunk_size=chunk_size,
            block_size=block_size,
        )
        try:
            image.save(output_path)
        finally:
            # Release the handle even when saving fails, so a failed
            # conversion does not leak resources inside a worker process.
            image.close()

        if preview_print:
            print(
                f"converted image={list(shape)} chunk={list(chunk_size)} block={list(block_size)} "
                f"path={output_path}",
                flush=True,
            )

        elapsed = time.perf_counter() - t0
        return {
            "input_path": str(input_path),
            "output_path": str(output_path),
            "success": True,
            "skipped_existing": False,
            "validated_existing": False,
            "shape": list(shape),
            "dtype": str(array.dtype),
            "spatial_axis_mask": spatial_axis_mask,
            "patch_size": list(patch),
            "chunk_size": [int(v) for v in chunk_size],
            "block_size": [int(v) for v in block_size],
            "seconds": elapsed,
            "error": "",
        }
    except Exception as e:  # noqa: BLE001
        # A single bad file must not abort the whole batch: report it as a row.
        elapsed = time.perf_counter() - t0
        return {
            "input_path": str(input_path),
            "output_path": str(output_path),
            "success": False,
            "skipped_existing": False,
            "validated_existing": False,
            "seconds": elapsed,
            "error": f"{type(e).__name__}: {e}",
        }
231
+
232
+
233
def check_existing_mla(path: Path) -> tuple[bool, str]:
    """Check whether an existing ``.mla`` file can be opened and inspected.

    The file is opened read-only and its ``shape`` and ``dtype`` are accessed.

    Returns:
        ``(True, "")`` when that succeeds, otherwise ``(False, message)``
        where ``message`` is ``"<ExceptionType>: <text>"``.
    """
    handle = None
    try:
        handle = MLArray.open(path, mode="r", mmap_mode="r", dparams={"nthreads": 1})
        # Attribute access only: this probes readability, the values are unused.
        handle.shape
        handle.dtype
    except Exception as exc:  # noqa: BLE001
        verdict = (False, f"{type(exc).__name__}: {exc}")
    else:
        verdict = (True, "")
    finally:
        if handle is not None:
            try:
                handle.close()
            except Exception:
                # Best-effort cleanup: a failing close must not change the verdict.
                pass
    return verdict
248
+
249
+
250
def random_read_slices(
    shape: tuple[int, ...],
    spatial_axis_mask: list[bool],
    spatial_patch_size: int,
    rng: np.random.Generator,
) -> tuple[slice, ...]:
    """Draw one random patch location and return it as a tuple of slices.

    Spatial axes get a window of ``spatial_patch_size`` elements, clipped to
    the axis length, at a uniformly drawn start. Non-spatial axes are read in
    full. ``rng`` is consumed once per spatial axis that is longer than the
    patch, in axis order.
    """
    picked: list[slice] = []
    for extent, spatial in zip(shape, spatial_axis_mask):
        extent = int(extent)
        if not spatial:
            # Read across all modalities / channels.
            picked.append(slice(0, extent))
            continue

        window = min(extent, int(spatial_patch_size))
        slack = extent - window
        # No draw when the patch already covers the whole axis.
        offset = int(rng.integers(0, slack + 1)) if slack > 0 else 0
        picked.append(slice(offset, offset + window))
    return tuple(picked)
268
+
269
+
270
def bench_random_read_one(
    image: MLArray,
    filepath: Path,
    *,
    patch_size: int,
    shape: tuple[int, ...],
    dtype: np.dtype[Any],
    spatial_axis_mask: list[bool],
    rng: np.random.Generator,
    read_idx: int,
) -> dict[str, Any]:
    """Time one random patch read from an already opened array.

    The timed interval covers choosing the patch location, indexing ``image``
    and materializing the result as a NumPy array.

    Returns:
        A result row with the elapsed seconds, the number of bytes read, the
        derived throughput (B/s and GiB/s) and the slice that was read.
        ``success`` is always ``True``; read errors propagate to the caller.
    """
    started = time.perf_counter()
    region = random_read_slices(shape, spatial_axis_mask, patch_size, rng)
    fetched = image[region]
    if hasattr(fetched, "to_numpy"):
        materialized = fetched.to_numpy()
    else:
        materialized = np.asarray(fetched)
    duration = time.perf_counter() - started

    n_bytes = int(np.prod(materialized.shape)) * dtype.itemsize
    # Guard against a zero-length interval on coarse clocks.
    rate_bps = float(n_bytes) / duration if duration > 0 else 0.0

    row: dict[str, Any] = {
        "read_idx": int(read_idx),
        "output_path": str(filepath),
        "success": True,
        "seconds": duration,
        "bytes_per_read": n_bytes,
        "bytes_total": n_bytes,
        "throughput_Bps": rate_bps,
        "throughput_GiBps": rate_bps / (1024**3),
        "shape": list(shape),
        "dtype": str(dtype),
        "spatial_axis_mask": spatial_axis_mask,
        "slice": str(region),
        "error": "",
    }
    return row
305
+
306
+
307
def run_random_read_benchmark(
    output_paths: list[str],
    *,
    patch_size: int,
    random_reads: int,
    seed: Optional[int],
    mmap_mode: Optional[str],
    open_mode: str,
    dparams_nthreads: int,
    disable_progress: bool,
) -> list[dict[str, Any]]:
    """Open every file once, then perform ``random_reads`` random patch reads.

    Each read picks one opened file uniformly at random and reads one random
    patch from it. The first 10% of the reads are flagged as warm-up.

    Args:
        output_paths: Paths of the ``.mla`` files to sample from.
        patch_size: Isotropic spatial patch edge length.
        random_reads: Total number of reads, warm-up included.
        seed: Seed for the random generator; ``None`` gives nondeterministic draws.
        mmap_mode: Passed to ``MLArray.open``.
        open_mode: Passed to ``MLArray.open`` as ``mode``.
        dparams_nthreads: ``nthreads`` entry of the ``dparams`` given to ``MLArray.open``.
        disable_progress: Suppress both progress bars.

    Returns:
        One row per read, each extended with ``is_warmup`` and
        ``in_measurement``. Empty when ``output_paths`` is empty.
    """
    rng = np.random.default_rng(seed)
    rows: list[dict[str, Any]] = []
    opened: list[dict[str, Any]] = []
    # Tracked separately from ``opened`` so a handle is released even if
    # inspecting it right after opening fails.
    handles: list[MLArray] = []

    try:
        for out_path in tqdm(output_paths, desc="Opening files", disable=disable_progress):
            path = Path(out_path)
            image = MLArray.open(
                path,
                mode=open_mode,
                mmap_mode=mmap_mode,
                dparams={"nthreads": dparams_nthreads},
            )
            handles.append(image)
            shape = tuple(int(v) for v in image.shape)
            opened.append(
                {
                    "path": path,
                    "image": image,
                    "shape": shape,
                    "dtype": np.dtype(image.dtype),
                    "spatial_axis_mask": infer_spatial_axis_mask(shape),
                }
            )

        n_files = len(opened)
        if n_files == 0:
            return rows

        warmup_reads = int(int(random_reads) * 0.1)

        for read_idx in tqdm(
            range(int(random_reads)),
            desc="Random read benchmark",
            disable=disable_progress,
        ):
            picked = opened[int(rng.integers(0, n_files))]
            row = bench_random_read_one(
                picked["image"],
                picked["path"],
                patch_size=patch_size,
                shape=picked["shape"],
                dtype=picked["dtype"],
                spatial_axis_mask=picked["spatial_axis_mask"],
                rng=rng,
                read_idx=read_idx,
            )
            row["is_warmup"] = read_idx < warmup_reads
            row["in_measurement"] = not row["is_warmup"]
            rows.append(row)
        return rows
    finally:
        for handle in handles:
            try:
                handle.close()
            except Exception:  # noqa: BLE001
                # Best-effort cleanup: a failing close must neither mask an
                # in-flight error nor discard the collected rows.
                pass
370
+
371
+
372
def summarize_conversion(rows: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate conversion result rows into counts and timing statistics.

    Returns:
        A dict with the total, successful, failed and reused (``skipped_existing``)
        row counts plus the mean and median duration of the successful rows.
        Both statistics are ``0.0`` when no row succeeded.
    """
    succeeded = [row for row in rows if row["success"]]
    reused = [row for row in succeeded if row.get("skipped_existing", False)]
    durations = [float(row["seconds"]) for row in succeeded]

    if durations:
        mean_s = float(statistics.mean(durations))
        median_s = float(statistics.median(durations))
    else:
        mean_s = median_s = 0.0

    summary: dict[str, Any] = {
        "total": len(rows),
        "successful": len(succeeded),
        "failed": len(rows) - len(succeeded),
        "skipped_existing": len(reused),
        "mean_seconds_success": mean_s,
        "median_seconds_success": median_s,
    }
    return summary
386
+
387
+
388
def summarize_random_read(rows: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate random-read rows into throughput statistics.

    Warm-up rows are counted but excluded from all throughput figures. Rows
    without an ``in_measurement`` key count as measured.

    Returns:
        Read counts, the overall throughput (total bytes over total seconds
        of the successful measured reads) and the mean/median/min/max of the
        per-read throughput, all in GiB/s. Figures are ``0.0`` when there is
        nothing to average.
    """
    gib = 1024**3
    n_warmup = len([r for r in rows if bool(r.get("is_warmup", False))])
    measured = [r for r in rows if bool(r.get("in_measurement", True))]
    good = [r for r in measured if r["success"]]
    bad = [r for r in measured if not r["success"]]

    per_read = [float(r["throughput_GiBps"]) for r in good]
    byte_sum = int(sum(int(r["bytes_total"]) for r in good))
    second_sum = float(sum(float(r["seconds"]) for r in good))

    if second_sum > 0:
        overall = (float(byte_sum) / second_sum) / gib
    else:
        overall = 0.0

    if per_read:
        mean_v = float(statistics.mean(per_read))
        median_v = float(statistics.median(per_read))
        min_v = float(min(per_read))
        max_v = float(max(per_read))
    else:
        mean_v = median_v = min_v = max_v = 0.0

    return {
        "total_reads": len(rows),
        "warmup_reads": n_warmup,
        "measured_reads": len(measured),
        "successful": len(good),
        "failed": len(bad),
        "overall_GiBps": overall,
        "mean_GiBps": mean_v,
        "median_GiBps": median_v,
        "min_GiBps": min_v,
        "max_GiBps": max_v,
        "total_bytes": byte_sum,
        "total_seconds": second_sum,
    }
418
+
419
+
420
def print_summary(
    conversion_summary: dict[str, Any],
    read_summary: dict[str, Any],
) -> None:
    """Print the conversion and random-read summaries to stdout.

    Each section is a banner followed by two ``key=value`` lines; durations
    and throughputs are shown with three decimals.
    """
    conv = conversion_summary
    reads = read_summary

    print("\n=== Conversion Summary ===")
    conv_counts = (
        f"total={conv['total']} "
        f"success={conv['successful']} "
        f"failed={conv['failed']} "
        f"skipped_existing={conv['skipped_existing']}"
    )
    print(conv_counts)
    conv_timing = (
        f"mean_s={conv['mean_seconds_success']:.3f} "
        f"median_s={conv['median_seconds_success']:.3f}"
    )
    print(conv_timing)

    print("\n=== Random Read Summary ===")
    read_counts = (
        f"total={reads['total_reads']} "
        f"warmup={reads['warmup_reads']} "
        f"measured={reads['measured_reads']} "
        f"success={reads['successful']} "
        f"failed={reads['failed']}"
    )
    print(read_counts)
    read_rates = (
        f"overall_GiB/s={reads['overall_GiBps']:.3f} "
        f"mean_GiB/s={reads['mean_GiBps']:.3f} "
        f"median_GiB/s={reads['median_GiBps']:.3f} "
        f"min_GiB/s={reads['min_GiBps']:.3f} "
        f"max_GiB/s={reads['max_GiBps']:.3f}"
    )
    print(read_rates)
451
+
452
+
453
def main() -> None:
    """Run the full benchmark: resumable conversion, random reads, JSON report.

    Steps:
        1. Parse and validate the command-line arguments.
        2. Collect all ``*.nii.gz`` files below the input directory.
        3. Reuse existing ``.mla`` outputs that can still be opened (unless
           ``--overwrite`` is given) and convert everything else via ``tqdmp``.
        4. Run the random-read benchmark over all successfully converted or
           reused files.
        5. Print both summaries and write the complete result set to
           ``bench_convert_nii_to_mla_random_read__<layout_algo>.json`` in the
           output directory.

    Raises:
        ValueError: If a numeric argument is out of range.
        RuntimeError: If the input directory is invalid, contains no
            ``.nii.gz`` files, or no file is available for the read benchmark.
    """
    args = parse_args()
    if args.convert_processes < 0:
        raise ValueError("Process counts must be >= 0")
    if args.patch_size <= 0:
        raise ValueError("patch_size must be > 0")
    if args.random_reads <= 0:
        raise ValueError("random_reads must be > 0")

    input_dir = args.input_dir.resolve()
    output_dir = args.output_dir.resolve()

    # Validate the input side first so that an invalid invocation does not
    # leave an empty output directory behind.
    if not input_dir.is_dir():
        raise RuntimeError(f"Input dir does not exist or is not a directory: {input_dir}")

    nii_paths = sorted(input_dir.rglob("*.nii.gz"))
    if not nii_paths:
        raise RuntimeError(f"No .nii.gz files found in {input_dir}")

    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Found {len(nii_paths)} .nii.gz files")
    print(f"Layout algorithm: {args.layout_algo}")
    print("Scanning existing outputs for resumable conversion...")

    conversion_rows: list[dict[str, Any]] = []
    pending_inputs: list[str] = []
    reused_existing = 0
    corrupted_existing = 0

    for input_path in nii_paths:
        output_path = output_path_for_input(input_dir, output_dir, input_path)
        if not args.overwrite and output_path.exists():
            ok, _ = check_existing_mla(output_path)
            if ok:
                reused_existing += 1
                conversion_rows.append(
                    {
                        "input_path": str(input_path),
                        "output_path": str(output_path),
                        "success": True,
                        "skipped_existing": True,
                        "validated_existing": True,
                        "seconds": 0.0,
                        "error": "",
                    }
                )
                continue
            # The existing output cannot be opened: convert it again.
            corrupted_existing += 1
        pending_inputs.append(str(input_path))

    print(
        f"Resume scan: reuse_valid={reused_existing} "
        f"reconvert={len(pending_inputs)} "
        f"invalid_existing={corrupted_existing} "
        f"overwrite={bool(args.overwrite)}"
    )

    if pending_inputs:
        # Only the first ten jobs print their chosen layout, as a sanity preview.
        pending_jobs = [
            {"input_path": p, "preview_print": i < 10}
            for i, p in enumerate(pending_inputs)
        ]
        converted_rows = tqdmp(
            convert_one,
            pending_jobs,
            num_processes=args.convert_processes,
            desc="Convert NIfTI -> MLArray",
            disable=args.disable_progress,
            input_dir=str(input_dir),
            output_dir=str(output_dir),
            layout_algo_name=args.layout_algo,
            patch_size=int(args.patch_size),
        )
        conversion_rows.extend(converted_rows)

    read_paths = [row["output_path"] for row in conversion_rows if row["success"]]
    if not read_paths:
        raise RuntimeError("No successful conversions available for random-read benchmark.")

    # The CLI spells "no memory mapping" as the string "none".
    mmap_mode = None if args.mmap_mode == "none" else args.mmap_mode
    read_rows = run_random_read_benchmark(
        read_paths,
        patch_size=int(args.patch_size),
        random_reads=int(args.random_reads),
        seed=args.seed,
        mmap_mode=mmap_mode,
        open_mode=args.open_mode,
        dparams_nthreads=int(args.dparams_nthreads),
        disable_progress=bool(args.disable_progress),
    )

    conversion_summary = summarize_conversion(conversion_rows)
    read_summary = summarize_random_read(read_rows)
    print_summary(conversion_summary, read_summary)

    out_json = output_dir / f"bench_convert_nii_to_mla_random_read__{args.layout_algo}.json"
    payload = {
        "config": {
            "layout_algo": args.layout_algo,
            "input_dir": str(input_dir),
            "output_dir": str(output_dir),
            "patch_size": int(args.patch_size),
            "convert_processes": int(args.convert_processes),
            "random_reads": int(args.random_reads),
            "seed": args.seed,
            "overwrite": bool(args.overwrite),
            "open_mode": args.open_mode,
            "mmap_mode": mmap_mode,
            "dparams_nthreads": int(args.dparams_nthreads),
            "random_read_strategy": (
                "serial; each read picks one random file and one random patch"
            ),
            "random_read_warmup_fraction": 0.1,
        },
        "conversion_summary": conversion_summary,
        "random_read_summary": read_summary,
        "conversion_rows": conversion_rows,
        "random_read_rows": read_rows,
    }
    with out_json.open("w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)

    print(f"\nWrote benchmark results: {out_json}")


# Run the benchmark only when executed as a script, not on import.
if __name__ == "__main__":
    main()