zarrify 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zarrify-0.0.1/LICENSE.txt +28 -0
- zarrify-0.0.1/PKG-INFO +23 -0
- zarrify-0.0.1/README.md +0 -0
- zarrify-0.0.1/pyproject.toml +26 -0
- zarrify-0.0.1/src/zarrify/__about__.py +3 -0
- zarrify-0.0.1/src/zarrify/__init__.py +8 -0
- zarrify-0.0.1/src/zarrify/formats/mrc.py +73 -0
- zarrify-0.0.1/src/zarrify/formats/n5.py +44 -0
- zarrify-0.0.1/src/zarrify/formats/tiff.py +70 -0
- zarrify-0.0.1/src/zarrify/formats/tiff_stack.py +82 -0
- zarrify-0.0.1/src/zarrify/to_zarr.py +115 -0
- zarrify-0.0.1/src/zarrify/utils/dask_utils.py +39 -0
- zarrify-0.0.1/src/zarrify/utils/volume.py +54 -0
zarrify-0.0.1/LICENSE.txt
ADDED
@@ -0,0 +1,28 @@
BSD 3-Clause License

Copyright (c) 2024, Howard Hughes Medical Institute

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
zarrify-0.0.1/PKG-INFO
ADDED
@@ -0,0 +1,23 @@
Metadata-Version: 2.3
Name: zarrify
Version: 0.0.1
Summary:
Author: Yurii Zubov
Author-email: zubov452@gmail.com
Requires-Python: >=3.11
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Requires-Dist: click (>=8.1.8,<9.0.0)
Requires-Dist: colorama (>=0.4.6,<0.5.0)
Requires-Dist: dask (>=2024.12.1,<2025.0.0)
Requires-Dist: dask-jobqueue (==0.8.2)
Requires-Dist: imagecodecs (>=2024.12.30,<2025.0.0)
Requires-Dist: mrcfile (>=1.5.3,<2.0.0)
Requires-Dist: natsort (>=8.4.0,<9.0.0)
Requires-Dist: tifffile (>=2025.1.10,<2026.0.0)
Requires-Dist: zarr (==2.16.1)
Description-Content-Type: text/markdown
zarrify-0.0.1/README.md
ADDED
File without changes
zarrify-0.0.1/pyproject.toml
ADDED
@@ -0,0 +1,26 @@
[project]
name = "zarrify"
version = "0.0.1"
description = ""
authors = [
    {name = "Yurii Zubov", email = "zubov452@gmail.com"}
]
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "zarr (==2.16.1)",
    "mrcfile (>=1.5.3,<2.0.0)",
    "dask-jobqueue (==0.8.2)",
    "colorama (>=0.4.6,<0.5.0)",
    "imagecodecs (>=2024.12.30,<2025.0.0)",
    "dask (>=2024.12.1,<2025.0.0)",
    "tifffile (>=2025.1.10,<2026.0.0)",
    "natsort (>=8.4.0,<9.0.0)",
    "click (>=8.1.8,<9.0.0)"
]

[project.scripts]
zarrify = "zarrify.to_zarr:cli"

[build-system]
requires = ["poetry-core>=2.0.0,<3.0.0"]
build-backend = "poetry.core.masonry.api"
zarrify-0.0.1/src/zarrify/formats/mrc.py
ADDED
@@ -0,0 +1,73 @@
import zarr
import mrcfile
import os
from typing import Tuple
from dask.array.core import slices_from_chunks, normalize_chunks
from dask.distributed import Client, wait
from toolz import partition_all
import time
from zarrify.utils.volume import Volume


class Mrc3D(Volume):

    def __init__(
        self,
        src_path: str,
        axes: list[str],
        scale: list[float],
        translation: list[float],
        units: list[str],
    ):
        """Construct all the necessary attributes for the proper conversion of an MRC file to OME-NGFF Zarr.

        Args:
            src_path (str): path to the source MRC file.
        """
        super().__init__(src_path, axes, scale, translation, units)

        self.memmap = mrcfile.mmap(self.src_path, mode="r")
        self.ndim = self.memmap.data.ndim
        self.shape = self.memmap.shape
        self.dtype = self.memmap.data.dtype

    def save_chunk(self, z_arr: zarr.Array, chunk_slice: Tuple[slice, ...]):
        """Copies data from a particular part of the input mrc array into a specific chunk of the output zarr array.

        Args:
            z_arr (zarr.core.Array): output zarr array object
            chunk_slice (Tuple[slice, ...]): slice of the mrc array to copy.
        """
        mrc_file = mrcfile.mmap(self.src_path, mode="r")

        if not (mrc_file.data[chunk_slice] == 0).all():
            z_arr[chunk_slice] = mrc_file.data[chunk_slice]

    def write_to_zarr(
        self,
        z_arr: zarr.Array,
        client: Client,
    ):
        """Use mrcfile memmap to access small parts of the mrc file and write them into zarr chunks.

        Args:
            z_arr (zarr.Array): output zarr array to write the data into.
            client (Client): instance of a dask client
        """

        out_slices = slices_from_chunks(
            normalize_chunks(z_arr.chunks, shape=z_arr.shape)
        )
        out_slices_partitioned = tuple(partition_all(100000, out_slices))

        for idx, part in enumerate(out_slices_partitioned):

            print(f"{idx + 1} / {len(out_slices_partitioned)}")
            start = time.time()
            fut = client.map(lambda v: self.save_chunk(z_arr, v), part)
            print(
                f"Submitted {len(part)} tasks to the scheduler in {time.time() - start}s"
            )
            # wait for all the futures to complete
            result = wait(fut)
            print(f"Completed {len(part)} tasks in {time.time() - start}s")
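
For orientation, the chunk-aligned regions that save_chunk receives come straight from dask's slices_from_chunks; a standalone sketch with a toy shape (not part of the package):

# Toy illustration of the region enumeration used by Mrc3D.write_to_zarr.
from dask.array.core import normalize_chunks, slices_from_chunks

regions = slices_from_chunks(normalize_chunks((2, 2), shape=(4, 3)))
# -> one tuple of slices per 2x2 chunk of a 4x3 array:
#    (slice(0, 2, None), slice(0, 2, None)), (slice(0, 2, None), slice(2, 3, None)),
#    (slice(2, 4, None), slice(0, 2, None)), (slice(2, 4, None), slice(2, 3, None))
print(regions)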
zarrify-0.0.1/src/zarrify/formats/n5.py
ADDED
@@ -0,0 +1,44 @@
import zarr
import os
from zarrify.utils.volume import Volume

class N53D(Volume):
    def __init__(
        self,
        src_path: str,
        axes: list[str],
        scale: list[float],
        translation: list[float],
        units: list[str],
    ):
        """Construct all the necessary attributes for the proper conversion of an N5 dataset to OME-NGFF Zarr.

        Args:
            src_path (str): path to the source N5 array.
        """
        super().__init__(src_path, axes, scale, translation, units)
        self.store_path, self.arr_path = self.separate_store_path(src_path, '')
        self.n5_store = zarr.N5Store(self.store_path)
        self.n5_arr = zarr.open(store=self.n5_store, path=self.arr_path, mode='r')

        self.shape = self.n5_arr.shape
        self.dtype = self.n5_arr.dtype
        self.chunks = self.n5_arr.chunks

    def separate_store_path(self, store, path):
        """
        Sometimes a full OS path is passed for a node, which leads to an
        empty ('') node.path attribute. The correct way is to separate the
        path to the container (.n5, .zarr) from the path to the array
        within the container.

        Args:
            store (string): path to the store
            path (string): path to the array/group within the container

        Returns:
            (string, string): regularized store path and group/array path
        """
        new_store, path_prefix = os.path.split(store)
        if ".n5" in path_prefix:
            return store, path
        return self.separate_store_path(new_store, os.path.join(path_prefix, path))
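
A quick path example may help; the paths below are hypothetical and only illustrate how separate_store_path splits a full path into the container path and the within-container array path:

# Hypothetical paths, for illustration only.
#   separate_store_path("/data/volume.n5/em/raw", "")
#     accumulates "raw/" -> "em/raw/" while walking up, until the ".n5"
#     component is found, then returns
#     ("/data/volume.n5", "em/raw/")   # os.path.join leaves a trailing separator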
zarrify-0.0.1/src/zarrify/formats/tiff.py
ADDED
@@ -0,0 +1,70 @@
from tifffile import imread
import numpy as np
import zarr
import os
from dask.distributed import Client, wait
import time
import dask.array as da
import copy
from zarrify.utils.volume import Volume


class Tiff3D(Volume):

    def __init__(
        self,
        src_path: str,
        axes: list[str],
        scale: list[float],
        translation: list[float],
        units: list[str],
    ):
        """Construct all the necessary attributes for the proper conversion of tiff to OME-NGFF Zarr.

        Args:
            src_path (str): path to the source tiff file.
        """
        super().__init__(src_path, axes, scale, translation, units)

        self.zarr_store = imread(os.path.join(src_path), aszarr=True)
        self.zarr_arr = zarr.open(self.zarr_store)

        self.shape = self.zarr_arr.shape
        self.dtype = self.zarr_arr.dtype

    def write_to_zarr(self, zarray: zarr.Array, client: Client):
        chunks_list = np.arange(0, zarray.shape[0], zarray.chunks[0])

        src_path = copy.copy(self.src_path)

        start = time.time()
        fut = client.map(
            lambda v: write_volume_slab_to_zarr(v, zarray, src_path), chunks_list
        )
        print(
            f"Submitted {len(chunks_list)} tasks to the scheduler in {time.time() - start}s"
        )

        # wait for all the futures to complete
        result = wait(fut)
        print(f"Completed {len(chunks_list)} tasks in {time.time() - start}s")

        return 0


def write_volume_slab_to_zarr(chunk_num: int, zarray: zarr.Array, src_path: str):

    # check if the slab is at the array boundary or not
    if chunk_num + zarray.chunks[0] > zarray.shape[0]:
        slab_thickness = zarray.shape[0] - chunk_num
    else:
        slab_thickness = zarray.chunks[0]

    slab_shape = [slab_thickness] + list(zarray.shape[-2:])
    np_slab = np.empty(slab_shape, zarray.dtype)

    tiff_slab = imread(src_path, key=range(chunk_num, chunk_num + slab_thickness, 1))
    np_slab[0 : zarray.chunks[0], :, :] = tiff_slab

    # write a tiff stack slab into zarr array
    zarray[chunk_num : chunk_num + zarray.chunks[0], :, :] = np_slab
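
The slab decomposition is easiest to see with concrete numbers (illustrative values only, not taken from the package):

# Illustrative numbers only: a 300-slice tiff written with 128-slice z-chunks.
import numpy as np

shape_z, chunk_z = 300, 128
chunks_list = np.arange(0, shape_z, chunk_z)  # array([  0, 128, 256])
# One write_volume_slab_to_zarr task is submitted per offset; the last slab
# (offset 256) is truncated to 300 - 256 = 44 slices by the boundary check above.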
zarrify-0.0.1/src/zarrify/formats/tiff_stack.py
ADDED
@@ -0,0 +1,82 @@
from tifffile import imread
import numpy as np
import zarr
import os
from dask.distributed import Client, wait
import time
import dask.array as da
from natsort import natsorted
from glob import glob
from zarrify.utils.volume import Volume


class TiffStack(Volume):

    def __init__(
        self,
        src_path: str,
        axes: list[str],
        scale: list[float],
        translation: list[float],
        units: list[str],
    ):
        """Construct all the necessary attributes for the proper conversion of a tiff stack to OME-NGFF Zarr.

        Args:
            src_path (str): path to the directory containing the source tiff stack.
        """
        super().__init__(src_path, axes, scale, translation, units)

        self.stack_list = natsorted(glob(os.path.join(src_path, "*.tif*")))
        probe_image_store = imread(self.stack_list[0], aszarr=True)
        probe_image_arr = da.from_zarr(probe_image_store)

        self.dtype = probe_image_arr.dtype
        self.shape = [len(self.stack_list)] + list(probe_image_arr.shape)

    def write_tile_slab_to_zarr(
        self, chunk_num: int, zarray: zarr.Array, src_volume: list
    ):

        # check if the slab is at the array boundary or not
        if chunk_num + zarray.chunks[0] > zarray.shape[0]:
            slab_thickness = zarray.shape[0] - chunk_num
        else:
            slab_thickness = zarray.chunks[0]

        slab_shape = [slab_thickness] + list(zarray.shape[-2:])
        np_slab = np.empty(slab_shape, zarray.dtype)

        # combine tiles into a slab with thickness equal to the chunk size in z direction
        for slab_index in np.arange(chunk_num, chunk_num + slab_thickness, 1):
            try:
                image_tile = imread(src_volume[slab_index])
            except:
                print(
                    f"Tiff tile with index {slab_index} is not present in tiff stack."
                )
                continue
            np_slab[slab_index - chunk_num, :, :] = image_tile

        # write a tiff stack slab into a zarr array
        zarray[chunk_num : chunk_num + zarray.chunks[0], :, :] = np_slab

    # parallel writing of tiff stack into zarr array
    def write_to_zarr(self, zarray: zarr.Array, client: Client):
        chunks_list = np.arange(0, zarray.shape[0], zarray.chunks[0])

        start = time.time()
        fut = client.map(
            lambda v: self.write_tile_slab_to_zarr(v, zarray, self.stack_list),
            chunks_list,
        )
        print(
            f"Submitted {len(chunks_list)} tasks to the scheduler in {time.time() - start}s"
        )

        # wait for all the futures to complete
        result = wait(fut)
        print(f"Completed {len(chunks_list)} tasks in {time.time() - start}s")

        return 0
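
TiffStack depends on natural sorting of the globbed file names; the difference from plain lexicographic sorting matters for numbered slices (hypothetical file names):

# Hypothetical file names: natsorted keeps numbered slices in acquisition order.
from natsort import natsorted

files = ["slice_2.tif", "slice_10.tif", "slice_1.tif"]
print(sorted(files))     # ['slice_1.tif', 'slice_10.tif', 'slice_2.tif']  (lexicographic)
print(natsorted(files))  # ['slice_1.tif', 'slice_2.tif', 'slice_10.tif']  (numeric)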
zarrify-0.0.1/src/zarrify/to_zarr.py
ADDED
@@ -0,0 +1,115 @@
import zarr
from numcodecs import Zstd
import os
import click
import sys
from dask.distributed import Client
import time
from zarrify.formats.tiff_stack import TiffStack
from zarrify.formats.tiff import Tiff3D
from zarrify.formats.mrc import Mrc3D
from zarrify.formats.n5 import N53D
from zarrify.utils.dask_utils import initialize_dask_client


# @click.command("zarrify")
# @click.option(
#     "--src",
#     "-s",
#     type=click.Path(exists=True),
#     help="Input file/directory location",
# )
# @click.option("--dest", "-s", type=click.STRING, help="Output .zarr file path.")
# @click.option(
#     "--num_workers", "-w", default=100, type=click.INT, help="Number of dask workers"
# )
# @click.option(
#     "--cluster",
#     "-c",
#     default="",
#     type=click.STRING,
#     help="Which instance of dask client to use. Local client - 'local', cluster 'lsf'",
# )
# @click.option(
#     "--zarr_chunks",
#     "-zc",
#     nargs=3,
#     default=(64, 128, 128),
#     type=click.INT,
#     help="Chunk size for (z, y, x) axis order. z-axis is normal to the tiff stack plane. Default (64, 128, 128)",
# )
# @click.option(
#     "--axes",
#     "-a",
#     nargs=3,
#     default=("z", "y", "x"),
#     type=str,
#     help="Metadata axis names. Order matters. \n Example: -a z y x",
# )
# @click.option(
#     "--translation",
#     "-t",
#     nargs=3,
#     default=(0.0, 0.0, 0.0),
#     type=float,
#     help="Metadata translation(offset) value. Order matters. \n Example: -t 1.0 2.0 3.0",
# )
# @click.option(
#     "--scale",
#     "-s",
#     nargs=3,
#     default=(1.0, 1.0, 1.0),
#     type=float,
#     help="Metadata scale value. Order matters. \n Example: -s 1.0 2.0 3.0",
# )
# @click.option(
#     "--units",
#     "-u",
#     nargs=3,
#     default=("nanometer", "nanometer", "nanometer"),
#     type=str,
#     help="Metadata unit names. Order matters. \n Example: -t nanometer nanometer nanometer",
# )
# def cli(src, dest, num_workers, cluster, zarr_chunks, axes, translation, scale, units):
#     # create a dask client to submit tasks
#     client = initialize_dask_client(cluster)


def to_zarr(src: str,
            dest: str,
            client: Client,
            num_workers: int = 20,
            zarr_chunks: list[int] = [128] * 3,
            axes: list[str] = ('z', 'y', 'x'),
            scale: list[float] = [1.0] * 3,
            translation: list[float] = [0.0] * 3,
            units: list[str] = ['nanometer'] * 3):
    # pick the input format; the checks are mutually exclusive so that a .n5
    # container (which is also a directory) is not treated as a tiff stack
    if '.n5' in src:
        dataset = N53D(src, axes, scale, translation, units)
    elif src.endswith(".mrc"):
        dataset = Mrc3D(src, axes, scale, translation, units)
    elif src.endswith(".tif") or src.endswith(".tiff"):
        dataset = Tiff3D(src, axes, scale, translation, units)
    elif os.path.isdir(src):
        dataset = TiffStack(src, axes, scale, translation, units)

    z_store = zarr.NestedDirectoryStore(dest)
    z_root = zarr.open(store=z_store, mode="a")
    z_arr = z_root.require_dataset(
        name="s0",
        shape=dataset.shape,
        dtype=dataset.dtype,
        chunks=zarr_chunks,
        compressor=Zstd(level=6),
    )

    # write in parallel to zarr using dask
    client.cluster.scale(num_workers)
    dataset.write_to_zarr(z_arr, client)
    client.cluster.scale(0)
    # populate zarr metadata
    dataset.add_ome_metadata(z_root)


# if __name__ == "__main__":
#     cli()
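
Note that the [project.scripts] entry point in pyproject.toml references zarrify.to_zarr:cli, which is commented out in this release, so in 0.0.1 a conversion would have to be driven through to_zarr directly. A minimal usage sketch (paths and parameter values are hypothetical):

# Minimal usage sketch (hypothetical paths; assumes a local dask cluster).
from dask.distributed import Client, LocalCluster
from zarrify.to_zarr import to_zarr

client = Client(LocalCluster())
to_zarr(
    src="/data/acquisition.mrc",    # .mrc, .tif/.tiff, .n5, or a directory of tiffs
    dest="/data/acquisition.zarr",  # output OME-NGFF (zarr v2) container
    client=client,
    num_workers=4,
    zarr_chunks=[64, 128, 128],
    scale=[8.0, 8.0, 8.0],
    units=["nanometer"] * 3,
)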
zarrify-0.0.1/src/zarrify/utils/dask_utils.py
ADDED
@@ -0,0 +1,39 @@
from dask_jobqueue import LSFCluster
from dask.distributed import Client, LocalCluster
import os
import sys


def initialize_dask_client(cluster_type: str) -> Client:
    """Initialize dask client.

    Args:
        cluster_type (str): type of the cluster, either local or lsf

    Returns:
        (Client): instance of a dask client
    """
    if cluster_type == "":
        print("Did not specify which instance of the dask client to use!")
        sys.exit(0)
    elif cluster_type == "lsf":
        num_cores = 1
        cluster = LSFCluster(
            cores=num_cores,
            processes=num_cores,
            memory=f"{15 * num_cores}GB",
            ncpus=num_cores,
            mem=15 * num_cores,
            walltime="48:00",
            local_directory="/scratch/$USER/",
        )
    elif cluster_type == "local":
        cluster = LocalCluster()

    client = Client(cluster)
    with open(
        os.path.join(os.getcwd(), "dask_dashboard_link" + ".txt"), "w"
    ) as text_file:
        text_file.write(str(client.dashboard_link))
    print(client.dashboard_link)
    return client
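
For completeness, a quick sketch of how the helper is meant to be called; the "lsf" branch assumes an LSF scheduler and a /scratch/$USER directory on the submitting host, so a local client is the safer choice for a quick test:

# Sketch: local client for testing; "lsf" would instead request LSFCluster
# workers (1 core, 15 GB each) via dask-jobqueue. Either way the dashboard
# link is printed and written to dask_dashboard_link.txt in the working directory.
from zarrify.utils.dask_utils import initialize_dask_client

client = initialize_dask_client("local")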
zarrify-0.0.1/src/zarrify/utils/volume.py
ADDED
@@ -0,0 +1,54 @@
import zarr


class Volume:

    def __init__(
        self,
        src_path: str,
        axes: list[str],
        scale: list[float],
        translation: list[float],
        units: list[str],
    ):
        self.src_path = src_path
        self.metadata = {
            "axes": axes,
            "translation": translation,
            "scale": scale,
            "units": units,
        }

    def add_ome_metadata(self, root: zarr.Group):
        """Add selected tiff metadata to zarr attributes file (.zattrs).

        Args:
            root (zarr.Group): root group of the output zarr array
        """
        # json template for a multiscale structure
        z_attrs: dict = {"multiscales": [{}]}
        z_attrs["multiscales"][0]["axes"] = [
            {"name": axis, "type": "space", "unit": unit}
            for axis, unit in zip(self.metadata["axes"], self.metadata["units"])
        ]
        z_attrs["multiscales"][0]["coordinateTransformations"] = [
            {"scale": [1.0, 1.0, 1.0], "type": "scale"}
        ]
        z_attrs["multiscales"][0]["datasets"] = [
            {
                "coordinateTransformations": [
                    {"scale": self.metadata["scale"], "type": "scale"},
                    {
                        "translation": self.metadata["translation"],
                        "type": "translation",
                    },
                ],
                "path": list(root.array_keys())[0],
            }
        ]

        z_attrs["multiscales"][0]["name"] = "/" if root.path == "" else root.path
        z_attrs["multiscales"][0]["version"] = "0.4"

        # add multiscale template to .attrs
        root.attrs["multiscales"] = z_attrs["multiscales"]
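
With the to_zarr defaults (z/y/x axes, nanometer units, unit scale, zero translation) and a single "s0" array in the root group, the multiscales attribute written by add_ome_metadata would look roughly like this (sketch, not captured output):

# Sketch of root.attrs["multiscales"] after add_ome_metadata (default metadata).
[
    {
        "axes": [
            {"name": "z", "type": "space", "unit": "nanometer"},
            {"name": "y", "type": "space", "unit": "nanometer"},
            {"name": "x", "type": "space", "unit": "nanometer"},
        ],
        "coordinateTransformations": [{"scale": [1.0, 1.0, 1.0], "type": "scale"}],
        "datasets": [
            {
                "coordinateTransformations": [
                    {"scale": [1.0, 1.0, 1.0], "type": "scale"},
                    {"translation": [0.0, 0.0, 0.0], "type": "translation"},
                ],
                "path": "s0",
            }
        ],
        "name": "/",
        "version": "0.4",
    }
]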