eubi-bridge 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Euro-BioImaging
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,20 @@
1
+ Metadata-Version: 2.2
2
+ Name: eubi_bridge
3
+ Version: 0.0.1
4
+ Summary: A package for converting datasets to OME-Zarr format.
5
+ Home-page: https://github.com/Euro-BioImaging/EuBI-Bridge
6
+ Author: Bugra Özdemir
7
+ Author-email: bugraa.ozdemir@gmail.com
8
+ License: MIT
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Dynamic: author
12
+ Dynamic: author-email
13
+ Dynamic: description
14
+ Dynamic: description-content-type
15
+ Dynamic: home-page
16
+ Dynamic: license
17
+ Dynamic: summary
18
+
19
+ A package to convert collections of image data from diverse formats
20
+ to the OME-Zarr format in a distributed manner.
@@ -0,0 +1,2 @@
1
+ A package to convert collections of image data from diverse formats
2
+ to the OME-Zarr format in a distributed manner.
@@ -0,0 +1,16 @@
1
+ from eubi_bridge import ebridge
2
+ from eubi_bridge import fileset_io
3
+ from eubi_bridge.base import scale, writers
4
+ from eubi_bridge.ngff import defaults, multiscales
5
+ from eubi_bridge.utils import convenience, dask_client_plugins
6
+
7
# Names exported by `from eubi_bridge import *`. Every entry has to match a
# module imported at the top of this file; the first entry used to read
# 'ebridge_base', which is not among those imports.
__all__ = [
    'ebridge',
    'fileset_io',
    'scale',
    'writers',
    'defaults',
    'multiscales',
    'convenience',
    'dask_client_plugins',
]
File without changes
@@ -0,0 +1,137 @@
1
+ import zarr, dataclasses
2
+ from pathlib import Path
3
+ import numpy as np, zarr
4
+ import dask.array as da
5
+ from typing import Callable
6
+
7
+
8
def simple_downscale(
        darr,
        scale_factor: (tuple, list, np.ndarray) = None,
        backend = 'numpy'  # placeholder
        ):
    """
    Downscale an array by strided subsampling along each dimension.

    Parameters:
        darr (dask.array): The input n-dimensional Dask array.
        scale_factor (tuple, list or np.ndarray): One downsampling factor per
            dimension. Each factor is converted with ``int()`` and used as the
            slicing step for its axis.
        backend (str): Placeholder; not used by this function.

    Returns:
        dask.array: The downscaled array (every k-th element along each axis).

    Raises:
        TypeError: If ``scale_factor`` is not given.
        ValueError: If ``scale_factor`` does not have one entry per dimension.
    """
    if scale_factor is None:
        # Without this guard, len(None) below fails with an opaque TypeError.
        raise TypeError("scale_factor must be provided (one factor per array dimension)")
    if len(scale_factor) != darr.ndim:
        raise ValueError("scale_factors must have the same length as the array's number of dimensions")
    slices = tuple(slice(None, None, int(scale)) for scale in scale_factor)
    return darr[slices]
28
+
29
def mean_downscale(arr: da.Array,
                   scale_factor: (tuple, list, np.ndarray) = None
                   ):
    """
    Downscale a Dask array by averaging over non-overlapping blocks.

    Parameters:
        arr (dask.array): The input n-dimensional Dask array.
        scale_factor (tuple, list or np.ndarray): Block size per dimension.

    Returns:
        dask.array: The coarsened array, cast back to the dtype of ``arr``.
        Trailing elements that do not fill a whole block are trimmed.

    Raises:
        ValueError: If ``scale_factor`` does not have one entry per dimension.
    """
    if len(scale_factor) != arr.ndim:
        raise ValueError("scale_factors must have the same length as the array's number of dimensions")
    # da.coarsen expects a mapping {axis index: block size}.
    block_sizes = dict(enumerate(scale_factor))
    averaged = da.coarsen(da.mean, arr, axes=block_sizes, trim_excess=True)
    return averaged.astype(arr.dtype)
38
+
39
def median_downscale(arr: da.Array,
                     scale_factor: (tuple, list, np.ndarray) = None
                     ):
    """
    Downscale a Dask array by taking the median of non-overlapping blocks.

    Parameters:
        arr (dask.array): The input n-dimensional Dask array.
        scale_factor (tuple, list or np.ndarray): Block size per dimension.

    Returns:
        dask.array: The coarsened array, cast back to the dtype of ``arr``.
        Trailing elements that do not fill a whole block are trimmed.

    Raises:
        ValueError: If ``scale_factor`` does not have one entry per dimension.
    """
    n_factors = len(scale_factor)
    if n_factors != arr.ndim:
        raise ValueError("scale_factors must have the same length as the array's number of dimensions")
    # Map every axis index to its block size, as required by da.coarsen.
    per_axis = {}
    for axis, factor in enumerate(scale_factor):
        per_axis[axis] = factor
    coarse = da.coarsen(da.median, arr, axes=per_axis, trim_excess=True)
    return coarse.astype(arr.dtype)
48
+
49
@dataclasses.dataclass
class DownscaleManager:
    """
    Derive per-layer shapes and scale factors for a multi-resolution pyramid.

    Attributes:
        base_shape: Shape of the full-resolution array.
        scale_factor: Per-dimension downscaling factor applied between
            consecutive layers; must have one entry per dimension.
        n_layers: Number of layers, including the full-resolution layer 0.
        scale: Per-dimension scale of the full-resolution layer. Only needed
            by the ``scales`` property.
    """
    base_shape: (list, tuple)
    scale_factor: (list, tuple)
    # An integer layer count: it is passed straight to np.arange below.
    n_layers: int
    scale: (list, tuple) = None

    def __post_init__(self):
        ndim = len(self.base_shape)
        assert len(self.scale_factor) == ndim, \
            "scale_factor must have one entry per dimension of base_shape"

    @property
    def _scale_ids(self):
        """Layer indices as a column vector of shape (n_layers, 1)."""
        return np.arange(self.n_layers).reshape(-1, 1)

    @property
    def _theoretical_scale_factors(self):
        """``scale_factor ** layer`` for every layer, shape (n_layers, ndim)."""
        return np.power(self.scale_factor, self._scale_ids)

    @property
    def output_shapes(self):
        """Shape of every layer: floor division of the base shape, clamped to at least 1."""
        shapes = np.floor_divide(self.base_shape, self._theoretical_scale_factors)
        # A dimension may shrink to 0 under floor division; keep it at 1.
        shapes[shapes == 0] = 1
        return shapes

    @property
    def scale_factors(self):
        """Effective factor per layer: base layer shape divided by each layer's shape."""
        return np.true_divide(self.output_shapes[0], self.output_shapes)

    @property
    def scales(self):
        """Scale of every layer: ``scale`` multiplied by the effective scale factors."""
        return np.multiply(self.scale, self.scale_factors)
81
+
82
+
83
@dataclasses.dataclass
class Downscaler:
    """
    Build a pyramid of downscaled versions of a Dask array.

    The pyramid is computed on construction and again on every call to
    ``update``. Results are exposed as ``downscaled_arrays``, a dict keyed by
    the layer index as a string; layer ``'0'`` is the input array itself.

    Attributes:
        array: The full-resolution Dask array.
        scale_factor: Per-dimension factor between consecutive layers.
        n_layers: Number of layers, including layer 0.
        scale: Per-dimension scale of the full-resolution layer (passed on to
            ``DownscaleManager``).
        output_chunks: One chunk shape per layer; defaults to the chunk size
            of ``array`` for every layer.
        backend: Placeholder; not used by this class.
        downscale_method: One of 'simple', 'mean' or 'median'.
    """
    array: da.Array
    scale_factor: (list, tuple)
    n_layers: int
    scale: (list, tuple) = None
    output_chunks: (list, tuple) = None
    backend: str = 'numpy'
    downscale_method: str = 'simple'

    def __post_init__(self):
        # Names accepted by update(); anything else is ignored with a warning.
        self.param_names = ['array', 'scale_factor', 'n_layers', 'scale', 'output_chunks', 'backend', 'downscale_method']
        self.update()

    def get_method(self):
        """Return the downscaling function selected by ``downscale_method``.

        Raises:
            NotImplementedError: If the method name is not supported.
        """
        if self.downscale_method == 'simple':
            method = simple_downscale
        elif self.downscale_method == "mean":
            method = mean_downscale
        elif self.downscale_method == "median":
            # Previously dispatched to mean_downscale by mistake.
            method = median_downscale
        else:
            raise NotImplementedError("Currently, only 'simple', 'mean' and 'median' methods are implemented.")
        return method

    def run(self):
        """Compute all layers and store them in ``self.downscaled_arrays``.

        Every layer is derived directly from the full-resolution array using
        the integer-truncated effective scale factor of that layer.

        Returns:
            Downscaler: ``self``, to allow chaining.
        """
        self.method = self.get_method()
        assert isinstance(self.array, da.Array)
        self.dm = DownscaleManager(self.array.shape,
                                   self.scale_factor,
                                   self.n_layers,
                                   self.scale
                                   )
        if self.output_chunks is None:
            self.output_chunks = [self.array.chunksize] * self.n_layers

        downscaled = []
        # The zip limits the number of layers to len(output_chunks); the chunk
        # shapes themselves are not applied here.
        for idx, (scale_factor, _chunks) in enumerate(zip(self.dm.scale_factors, self.output_chunks)):
            if idx == 0:
                downscaled.append(self.array)
            else:
                factor = tuple(int(x) for x in scale_factor)
                downscaled.append(self.method(self.array, scale_factor = factor))
        self.downscaled_arrays = {str(i): arr for i, arr in enumerate(downscaled)}
        return self

    def update(self, **kwargs):
        """Set the given parameters and recompute the pyramid.

        Keyword arguments not listed in ``param_names`` are ignored with a
        warning.

        Returns:
            Downscaler: ``self``, to allow chaining.
        """
        # Imported locally: this module does not import `warnings` at the top.
        import warnings

        for key, value in kwargs.items():
            if key in self.param_names:
                self.__setattr__(key, value)
            else:
                warnings.warn(f"The given parameter name '{key}' is not valid, ignoring it..")
        self.run()
        return self
@@ -0,0 +1,369 @@
1
+ import os, itertools, tempfile, shutil, threading
2
+ import zarr, dask, numcodecs
3
+ from dask import delayed
4
+ import rechunker
5
+ from rechunker import rechunk, Rechunked
6
+ import dask.array as da
7
+ # from dask.diagnostics import ProgressBar
8
+ import numpy as np
9
+ # import tensorstore as ts
10
+ from pathlib import Path
11
+ from typing import List, Tuple, Dict, Union, Any, Tuple
12
+ ### internal imports
13
+ from eubi_bridge.ngff.multiscales import Multimeta
14
+ from eubi_bridge.utils.convenience import get_chunksize_from_array, is_zarr_group
15
+
16
+ import logging, warnings
17
+
18
+ logging.getLogger('distributed.diskutils').setLevel(logging.CRITICAL)
19
+
20
def create_zarr_array(directory: Union[Path, str, zarr.Group],
                      array_name: str,
                      shape: Tuple[int, ...],
                      chunks: Tuple[int, ...],
                      dtype: Any,
                      overwrite: bool = False) -> zarr.Array:
    """
    Create an empty zarr array named ``array_name`` under ``directory``.

    Parameters:
        directory: A zarr group to create the array in, or a filesystem path
            that is joined with ``array_name`` and used as the store.
        array_name: Name of the array.
        shape: Shape of the array.
        chunks: Requested chunk shape; clipped so that no chunk dimension
            exceeds the corresponding array dimension.
        dtype: Data type of the array.
        overwrite: Passed through to zarr.

    Returns:
        zarr.Array: The newly created array ('/' is used as dimension separator).
    """
    clipped_chunks = tuple(np.minimum(shape, chunks))

    if isinstance(directory, zarr.Group):
        return directory.create(name=array_name,
                                shape=shape,
                                chunks=clipped_chunks,
                                dtype=dtype,
                                dimension_separator='/',
                                overwrite=overwrite)

    store_path = os.path.join(directory, array_name)
    return zarr.create(shape=shape,
                       chunks=clipped_chunks,
                       dtype=dtype,
                       store=store_path,
                       dimension_separator='/',
                       overwrite=overwrite)
44
+
45
+
46
def get_regions(array_shape: Tuple[int, ...],
                region_shape: Tuple[int, ...],
                as_slices: bool = False) -> list:
    """
    Tile an array shape into regions of at most ``region_shape``.

    Parameters:
        array_shape: Shape of the array to tile.
        region_shape: Maximum region size per dimension; the last region along
            a dimension is shorter when the size is not a multiple of it.
        as_slices: If True, regions are tuples of ``slice`` objects; otherwise
            tuples of ``(start, stop)`` pairs.

    Returns:
        list: One entry per region (cartesian product over all dimensions).
        Empty if any dimension of ``array_shape`` has length 0.

    Raises:
        ValueError: If any region size is not positive.
    """
    assert len(array_shape) == len(region_shape)
    steps = []
    for size, inc in zip(array_shape, region_shape):
        if inc <= 0:
            raise ValueError(f"Region sizes must be positive, got {inc}.")
        seq = np.arange(0, size, inc)
        # arange stops before `size`, so append it as the final boundary. An
        # empty seq (zero-length dimension) has no last element to compare to
        # and simply yields no regions.
        if seq.size and size > seq[-1]:
            seq = np.append(seq, size)
        increments = tuple(zip(seq[:-1], seq[1:]))
        if as_slices:
            steps.append(tuple(slice(*item) for item in increments))
        else:
            steps.append(increments)
    return list(itertools.product(*steps))
61
+
62
def get_compressor(name, **params):
    """
    Instantiate a numcodecs compressor from a case-insensitive name.

    Parameters:
        name (str): One of 'blosc', 'bz2', 'gzip', 'lzma', 'lz4', 'pcodec',
            'zfpy', 'zlib' or 'zstd' (any letter case).
        **params: Keyword arguments forwarded to the codec constructor.

    Returns:
        The codec instance.

    Raises:
        KeyError: If ``name`` is not one of the supported names.
    """
    # Lower-case alias -> class name inside the numcodecs module.
    codec_class_names = {
        "blosc": "Blosc",
        "bz2": "BZ2",
        "gzip": "GZip",
        "lzma": "LZMA",
        "lz4": "LZ4",
        "pcodec": "PCodec",
        "zfpy": "ZFPY",
        "zlib": "Zlib",
        "zstd": "Zstd"
    }
    codec_class = getattr(numcodecs, codec_class_names[name.lower()])
    return codec_class(**params)
80
+
81
def get_default_fill_value(dtype):
    """
    Return the default fill value for a dtype.

    Integer dtypes give ``0``, floating dtypes ``0.0`` and boolean dtypes
    ``False``. Any other dtype gives ``None``.
    """
    # Checked in order; the first matching numpy type family wins.
    defaults_by_family = (
        (np.integer, 0),
        (np.floating, 0.0),
        (np.bool_, False),
    )
    for family, default in defaults_by_family:
        if np.issubdtype(dtype, family):
            return default
    return None
89
+
90
def write_chunk_with_zarrpy(chunk: np.ndarray, zarr_array: zarr.Array, block_info: Dict) -> None:
    """Write one block into ``zarr_array`` at the position given by ``block_info``.

    ``block_info[0]["array-location"]`` supplies the bounds per dimension;
    each entry is unpacked into a ``slice``.
    """
    bounds_per_dim = block_info[0]["array-location"]
    target_region = tuple(slice(*bounds) for bounds in bounds_per_dim)
    zarr_array[target_region] = chunk
92
+
93
def write_chunk_with_tensorstore(chunk: np.ndarray, ts_store, block_info: Dict) -> None:
    """Write one block into ``ts_store`` at the position given by ``block_info``.

    ``block_info[0]["array-location"]`` supplies the bounds per dimension;
    each entry is unpacked into a ``slice``.
    """
    location = block_info[0]["array-location"]
    region = tuple(slice(*extent) for extent in location)
    ts_store[region] = chunk
95
+
96
def write_with_rechunker(arr: da.Array,
                         chunks: Tuple[int, ...],
                         location: Union[str, Path],
                         overwrite: bool = True,
                         **kwargs) -> Rechunked:
    """
    Prepare a rechunker plan that writes ``arr`` to a zarr store at ``location``.

    Parameters:
        arr: The source Dask array.
        chunks: Target chunk shape.
        location: Directory of the target zarr array.
        overwrite: If True, ``location`` is removed before writing.
        **kwargs: Recognised keys:
            temp_dir (required): directory for rechunker's intermediate
                store, or 'auto' to create a temporary directory.
            rechunkers_max_mem: ``max_mem`` for rechunker; 'auto' (default)
                derives it via ``get_chunksize_from_array``.
            compressor / compressor_params: see ``get_compressor``.
            dtype: only used to choose the default fill value.
            fill_value: fill value set on the target array.

    Returns:
        Rechunked: The object returned by ``rechunk``.

    Raises:
        ValueError: If no ``temp_dir`` is given.
    """
    temp_dir = kwargs.get('temp_dir')
    if not temp_dir:
        raise ValueError("A temp_dir must be specified.")

    temp_dir_is_auto = (temp_dir == 'auto')
    if temp_dir_is_auto:
        temp_dir = tempfile.TemporaryDirectory()

    max_mem = kwargs.get('rechunkers_max_mem', "auto")
    if max_mem == "auto":
        max_mem = get_chunksize_from_array(arr)

    if overwrite:
        shutil.rmtree(location, ignore_errors=True)

    target_store = zarr.DirectoryStore(location, dimension_separator='/')
    # A TemporaryDirectory object carries its path in `.name`; a plain path is used as is.
    if isinstance(temp_dir, tempfile.TemporaryDirectory):
        temp_path = temp_dir.name
    else:
        temp_path = temp_dir
    temp_store = zarr.DirectoryStore(temp_path, dimension_separator='/')

    compressor = get_compressor(kwargs.get('compressor', 'blosc'),
                                **kwargs.get('compressor_params', {}))

    dtype = kwargs.get('dtype', arr.dtype)
    if dtype == 'auto':
        dtype = arr.dtype

    fill_value = kwargs.get('fill_value', get_default_fill_value(dtype))

    # The fill value is not passed to rechunker; it is set on the target array below.
    target_options = {'overwrite': True,
                      'compressor': compressor,
                      'write_empty_chunks': True}
    rechunked = rechunk(source=arr,
                        target_chunks=chunks,
                        target_store=target_store,
                        temp_store=temp_store,
                        max_mem=max_mem,
                        executor='dask',
                        target_options=target_options)

    # Reopen the target array and set its fill value.
    target_array = zarr.open_array(target_store, mode="a")
    target_array.fill_value = fill_value

    # NOTE(review): an auto-generated temp dir is cleaned up here, before the
    # returned object has been executed by the caller; confirm this is intended.
    if temp_dir_is_auto:
        temp_dir.cleanup()

    return rechunked
150
+
151
def write_with_zarrpy(arr: da.Array,
                      chunks: Tuple[int, ...],
                      location: Union[str, Path],
                      overwrite: bool = True,
                      **kwargs) -> da.Array:
    """
    Build a lazy Dask graph that writes ``arr`` block by block with zarr-python.

    Parameters:
        arr: The source Dask array; rechunked to ``chunks`` if necessary.
        chunks: Chunk shape of the target zarr array.
        location: Directory of the target zarr array.
        overwrite: Passed to ``zarr.create``.
        **kwargs: Recognised keys: rechunk_method (default 'tasks'),
            compressor, compressor_params, dtype ('auto' means ``arr.dtype``)
            and fill_value.

    Returns:
        da.Array: The result of ``map_blocks``; computing it performs the writes.
    """
    rechunk_method = kwargs.get('rechunk_method', 'tasks')

    # Resolve the dtype up front: it is needed after the try/except regardless
    # of which branch produced the zarr array.
    dtype = kwargs.get('dtype', arr.dtype)
    if dtype == 'auto':
        dtype = arr.dtype

    if not np.equal(arr.chunksize, chunks).all():
        arr = arr.rechunk(chunks, method=rechunk_method)

    store = zarr.DirectoryStore(location, dimension_separator='/')
    try:
        # NOTE(review): opening with mode='w' and no shape presumably always
        # fails, which would make the fallback below the effective path; confirm.
        zarr_array = zarr.open_array(location, mode='w')
    except Exception:
        compressor_name = kwargs.get('compressor', 'blosc')
        compressor_params = kwargs.get('compressor_params', {})
        compressor = get_compressor(compressor_name, **compressor_params)
        fill_value = kwargs.get('fill_value', get_default_fill_value(dtype))

        zarr_array = zarr.create(shape=arr.shape, chunks=chunks, dtype=dtype, compressor=compressor,
                                 store=store, overwrite=overwrite, fill_value=fill_value)

    return arr.map_blocks(write_chunk_with_zarrpy, zarr_array=zarr_array, dtype=dtype)
178
+
179
+
180
def write_with_tensorstore(arr: da.Array,
                           chunks: Tuple[int, ...],
                           location: Union[str, Path],
                           overwrite: bool = True,
                           **kwargs) -> da.Array:
    """
    Build a lazy Dask graph that writes ``arr`` block by block with tensorstore.

    Parameters:
        arr: The source Dask array; rechunked to ``chunks`` if necessary.
        chunks: Chunk shape of the target zarr array.
        location: Directory of the target zarr array.
        overwrite: If True, an existing store at ``location`` is deleted.
        **kwargs: Recognised keys: rechunk_method (default 'tasks'),
            compressor (used as the codec ``id``), compressor_params,
            dtype ('auto' means ``arr.dtype``) and fill_value.

    Returns:
        da.Array: The result of ``map_blocks``; computing it performs the writes.

    Raises:
        ModuleNotFoundError: If tensorstore is not installed.
    """
    try:
        import tensorstore as ts
    except ImportError as err:
        raise ModuleNotFoundError("The module tensorstore has not been found. Try 'conda install -c conda-forge tensorstore'") from err
    rechunk_method = kwargs.get('rechunk_method', 'tasks')

    compressor_name = kwargs.get('compressor', 'blosc')
    compressor_params = kwargs.get('compressor_params', {})
    compressor = dict(id = compressor_name, **compressor_params)

    # Resolve 'auto' BEFORE deriving the default fill value; the helper cannot
    # interpret the string 'auto' as a dtype.
    dtype = kwargs.get('dtype', arr.dtype)
    if dtype == 'auto':
        dtype = arr.dtype
    # Accept dtype names and numpy scalar types as well as np.dtype instances.
    dtype = np.dtype(dtype)
    fill_value = kwargs.get('fill_value', get_default_fill_value(dtype))

    zarr_spec = {
        "driver": "zarr",
        "kvstore": {
            "driver": "file",
            "path": str(location),  # the spec needs a plain string, not a Path
        },
        "metadata": {
            "dtype": dtype.str,
            "shape": arr.shape,
            # Plain Python ints: `chunks` may arrive as a numpy array.
            "chunks": [int(c) for c in chunks],
            "compressor": compressor,
            "dimension_separator": "/",
            "fill_value": fill_value
        },
    }

    if not np.equal(arr.chunksize, chunks).all():
        arr = arr.rechunk(chunks, method=rechunk_method)

    ts_store = ts.open(zarr_spec, create=True, delete_existing=overwrite).result()
    return arr.map_blocks(write_chunk_with_tensorstore, ts_store=ts_store, dtype=arr.dtype)
222
+
223
def write_with_dask(arr: da.Array,
                    chunks: Tuple[int, ...],
                    location: Union[str, Path],
                    overwrite: bool = True,
                    **kwargs: Any
                    ) -> List[da.Array]:
    """
    Build lazy region-wise writes of ``arr`` into a zarr array via ``da.to_zarr``.

    Parameters:
        arr: The source Dask array.
        chunks: Chunk shape of the target zarr array.
        location: Directory of the target zarr array.
        overwrite: Passed to ``zarr.create`` when a new array is created.
        **kwargs: Recognised keys: rechunk_method (default 'tasks'),
            region_shape (default ``chunks``), compressor, compressor_params,
            dtype ('auto' means ``arr.dtype``) and fill_value.

    Returns:
        List[da.Array]: One ``da.to_zarr(..., compute=False)`` result per
        region; computing them performs the writes.
    """
    rechunk_method: str = kwargs.get('rechunk_method', 'tasks')

    # NOTE(review): the rechunked array only supplies the shape of a newly
    # created target; the regions below are cut from `arr` itself, as before.
    if not np.equal(arr.chunksize, chunks).all():
        rechunked: da.Array = arr.rechunk(chunks, method=rechunk_method)
    else:
        rechunked = arr

    store: zarr.DirectoryStore = zarr.DirectoryStore(location, dimension_separator='/')
    try:
        # 'r+' opens an existing array for reading AND writing. The former
        # mode 'r' returned a read-only array that the region writes below
        # could not store into.
        zarr_array: zarr.Array = zarr.open_array(location, mode='r+')
    except Exception:
        compressor_name = kwargs.get('compressor', 'blosc')
        compressor_params = kwargs.get('compressor_params', {})
        compressor = get_compressor(compressor_name, **compressor_params)
        dtype = kwargs.get('dtype', arr.dtype)
        if dtype == 'auto':
            dtype = arr.dtype
        fill_value = kwargs.get('fill_value', get_default_fill_value(dtype))
        zarr_array = zarr.create(shape=rechunked.shape, chunks=chunks, dtype=dtype, compressor=compressor,
                                 store=store, overwrite=overwrite, fill_value=fill_value)

    region_shape: Tuple[int, ...] = kwargs.get('region_shape', chunks)
    regions: List[Tuple[slice, ...]] = get_regions(arr.shape, region_shape, as_slices=True)
    result: List[da.Array] = []
    for slc in regions:
        written = da.to_zarr(
            arr=arr[slc],
            region=slc,
            url=zarr_array,
            compute=False,
        )
        result.append(written)

    return result
263
+
264
@delayed
def count_threads():
    """Return ``threading.active_count()``; wrapped in ``dask.delayed``, so the call is deferred."""
    n_active = threading.active_count()
    return n_active
267
+
268
def store_arrays(arrays: Dict[str, Dict[str, da.Array]],
                 output_path: Union[Path, str],
                 scales: Dict[str, Dict[str, Tuple[float, ...]]],
                 units: list,
                 output_chunks: Tuple[int, ...] = None,
                 compute: bool = False,
                 overwrite: bool = False,
                 **kwargs) -> Dict[str, da.Array]:
    """
    Write a collection of (multiscale) arrays into zarr groups under ``output_path``.

    Each array is written to ``<output_path>/<name>.zarr/<level>`` ('.zarr' is
    appended unless the name already ends with 'zarr'). Multiscales metadata
    is updated on each group through ``Multimeta``.

    Parameters:
        arrays: Mapping name -> array, or name -> {level: array}. A bare array
            is treated as level '0'.
        output_path: Root directory of the output.
        scales: Mapping name -> {level: scale}, using the same names and
            levels as ``arrays``.
        units: Unit list handed to ``Multimeta.parse_axes`` (axis order 'tczyx')
            when the group has no axes yet.
        output_chunks: Target chunk shape; defaults to each array's chunk size.
            Always clipped to the array shape.
        compute: If True, the writes are executed before returning.
        overwrite: Passed to group creation and to the writer function.
        **kwargs: Forwarded to the writer function. Keys read here:
            rechunk_method ('tasks' by default; 'rechunker', None or 'auto'),
            use_tensorstore, verbose, region_shape, temp_dir.

    Returns:
        Dict[str, da.Array]: Mapping array path -> object returned by the
        writer function. If an error occurs while writing, a warning is
        emitted and the results collected so far are returned.

    Raises:
        NotImplementedError: For unsupported combinations of an explicitly
            requested writer (rechunker / region-based writing) with tensorstore.
    """
    rechunk_method = kwargs.get('rechunk_method', 'tasks')
    use_tensorstore = kwargs.get('use_tensorstore', False)
    verbose = kwargs.get('verbose', False)

    def _level_path(key, level):
        # <output_path>/<key>[.zarr]/<level>
        group_name = key if key.endswith('zarr') else f"{key}.zarr"
        return os.path.join(output_path, group_name, str(level))

    arrays = {k: {'0': v} if not isinstance(v, dict) else v for k, v in arrays.items()}
    flatarrays = {_level_path(key, level): arr
                  for key, subarrays in arrays.items()
                  for level, arr in subarrays.items()}
    flatscales = {_level_path(key, level): scale
                  for key, subscales in scales.items()
                  for level, scale in subscales.items()}

    if rechunk_method == 'rechunker':
        writer_func = write_with_rechunker
        if use_tensorstore:
            raise NotImplementedError("The rechunker method cannot be used with tensorstore.")
        if 'region_shape' in kwargs:
            raise NotImplementedError("The rechunker method is not compatible with region-based writing.")
    elif 'region_shape' in kwargs:
        writer_func = write_with_dask
        if use_tensorstore:
            raise NotImplementedError("Region-based writing is not possible with tensorstore.")
    else:
        writer_func = write_with_tensorstore if use_tensorstore else write_with_zarrpy

    # Bound before the try block so the final return can never hit an
    # unassigned name, even if the very first statement inside it fails.
    results = {}
    try:
        zarr.group(output_path, overwrite=overwrite)
        for key, arr in flatarrays.items():
            flatscale = flatscales[key]
            # `output_chunks or ...` would raise for numpy arrays, so test explicitly.
            if output_chunks is None or len(output_chunks) == 0:
                requested_chunks = arr.chunksize
            else:
                requested_chunks = output_chunks
            # Make sure chunk size is not larger than array shape in any dimension.
            # Plain ints keep the value usable by every writer backend.
            chunks = tuple(int(c) for c in np.minimum(requested_chunks, arr.shape))

            if rechunk_method in (None, 'auto'):
                if np.all(np.less_equal(chunks, arr.chunksize)):
                    rechunk_method = 'rechunker'
                    kwargs['rechunk_method'] = rechunk_method
                    writer_func = write_with_rechunker
                    if use_tensorstore:
                        raise NotImplementedError("The rechunker method cannot be used with tensorstore.")
                    if 'region_shape' in kwargs:
                        raise NotImplementedError("The rechunker method is not compatible with region-based writing.")
                else:
                    kwargs['rechunk_method'] = 'tasks'

            if rechunk_method != 'rechunker':
                # temp_dir is only meaningful for the rechunker writer.
                kwargs.pop('temp_dir', None)

            dirpath = os.path.dirname(key)
            arrpath = os.path.basename(key)

            gr = zarr.open_group(dirpath, mode='a') if is_zarr_group(dirpath) else zarr.group(dirpath, overwrite=overwrite)

            meta = Multimeta()
            try:
                meta.from_ngff(gr)
            except Exception:
                # Best effort: a group without readable metadata starts from scratch.
                pass
            if not meta.has_axes:
                meta.parse_axes(axis_order='tczyx', unit_list=units)

            meta.add_dataset(path=arrpath, scale=flatscale, overwrite=True)
            meta.retag(os.path.basename(dirpath))
            meta.to_ngff(gr)

            if verbose:
                print(f"Writer function: {writer_func}")
                print(f"Rechunk method: {rechunk_method}")
            results[key] = writer_func(arr=arr,
                                       chunks=chunks,
                                       location=key,
                                       overwrite=overwrite,
                                       **kwargs
                                       )

        if compute:
            if rechunk_method == 'rechunker':
                for result in results.values():
                    result.execute()
            else:
                dask.compute(list(results.values()))
    except Exception as e:
        # Stay best-effort (partial results are still returned), but do not
        # hide the failure: a silently skipped write looks like success.
        warnings.warn(f"store_arrays stopped after an error; returning {len(results)} result(s) "
                      f"collected so far: {e!r}")
    return results
365
+
366
+
367
+
368
+
369
+
@@ -0,0 +1,6 @@
1
+ import fire
2
+ from eubi_bridge.ebridge import EuBIBridge
3
+
4
def eubibridge_cmd():
    """Command-line entry point: hand the ``EuBIBridge`` class to ``fire.Fire``."""
    fire.Fire(EuBIBridge)