volsegtools 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- volsegtools/__init__.py +0 -0
- volsegtools/_cli/__init__.py +1 -0
- volsegtools/_cli/molstar_preprocessor.py +79 -0
- volsegtools/abc/__init__.py +6 -0
- volsegtools/abc/converter.py +69 -0
- volsegtools/abc/data_handle.py +24 -0
- volsegtools/abc/downsampler.py +26 -0
- volsegtools/abc/kernel.py +8 -0
- volsegtools/abc/preprocessor.py +38 -0
- volsegtools/abc/serializer.py +12 -0
- volsegtools/converter/__init__.py +1 -0
- volsegtools/converter/map_converter.py +148 -0
- volsegtools/core/__init__.py +5 -0
- volsegtools/core/bounds.py +12 -0
- volsegtools/core/downsampling_parameters.py +28 -0
- volsegtools/core/gaussian_kernel_3D.py +16 -0
- volsegtools/core/lattice_kind.py +8 -0
- volsegtools/core/vector.py +9 -0
- volsegtools/downsampler/__init__.py +2 -0
- volsegtools/downsampler/base_downsampler.py +20 -0
- volsegtools/downsampler/hierarchy_downsampler.py +253 -0
- volsegtools/model/__init__.py +13 -0
- volsegtools/model/chunking_mode.py +16 -0
- volsegtools/model/metadata.py +50 -0
- volsegtools/model/opaque_data_handle.py +112 -0
- volsegtools/model/storing_parameters.py +52 -0
- volsegtools/model/working_store.py +142 -0
- volsegtools/preprocessor/__init__.py +2 -0
- volsegtools/preprocessor/preprocessor.py +75 -0
- volsegtools/preprocessor/preprocessor_builder.py +110 -0
- volsegtools/serialization/__init__.py +1 -0
- volsegtools/serialization/bcif_serializer.py +318 -0
- volsegtools/typing.py +12 -0
- volsegtools-0.0.0.dist-info/METADATA +22 -0
- volsegtools-0.0.0.dist-info/RECORD +38 -0
- volsegtools-0.0.0.dist-info/WHEEL +5 -0
- volsegtools-0.0.0.dist-info/entry_points.txt +2 -0
- volsegtools-0.0.0.dist-info/top_level.txt +1 -0
volsegtools/downsampler/hierarchy_downsampler.py
@@ -0,0 +1,253 @@
import asyncio
import collections
import logging
import math
from typing import List, Tuple

import dask
import dask.array as da
import dask_image.ndfilters as dask_filter
import numpy as np

from volsegtools.core import (
    Bounds,
    DownsamplingParameters,
    LatticeKind,
    Vector3,
    to_bytes,
)
from volsegtools.downsampler import BaseDownsampler
from volsegtools.model import (
    ChannelMetadata,
    DescriptiveStatistics,
    FlatChannelIterator,
    OpaqueDataHandle,
    StoringParameters,
    TimeFrameMetadata,
)
from volsegtools.model.working_store import WorkingStore

MIN_GRID_SIZE = 100**1


class HierarchyDownsampler(BaseDownsampler):
    """Downsampler that builds a hierarchy of progressively coarser lattices
    by convolving each level with the configured kernel and keeping every
    second voxel along each axis.
    """

    # This value was used in the previous version of the preprocessor.
    KERNEL_PARAMETERS: Tuple[int, int, int] = (1, 4, 6)

    def __init__(self):
        _parameters = DownsamplingParameters()
        super().__init__(_parameters)

    """
    def __init__(self, params = DownsamplingParameters()):
        super().__init__(params)
    """

    async def downsample_lattice(
        self, data: OpaqueDataHandle
    ) -> List[OpaqueDataHandle]:
        # We have to get the actual data from the zarr store.
        store = WorkingStore.instance
        channel_iter = FlatChannelIterator(
            store.get_data_group(data.metadata.kind).require_group(
                data.metadata.lattice_id
            )
        )

        ret_value = []

        # Each time frame can have multiple channels; this flat channel
        # iterator iterates over each channel in each resolution for this
        # particular data lattice.
        for channel_info in channel_iter:
            if 1 in channel_info.data.shape:
                # TODO: Add some kind of message that downsampling does not
                # make sense if there is only a single dimension.
                continue

            # This is always true, because we are reading from the zarr store.
            dask_arr = da.from_zarr(
                url=channel_info.data,
                chunks=channel_info.data.chunks,
            )

            current_level_data = dask_arr

            downsampling_steps = self._calculate_downsampling_steps_count(
                dask_arr,
            )
            logging.info(f"Downsampling steps {downsampling_steps}")

            downsampling_levels = self._calculate_downsampling_levels(
                dask_arr,
                downsampling_steps=downsampling_steps,
                # TODO: this should be changeable with some
                # "additional_downsampler_config" which would be just an
                # arbitrary dictionary.
                factor=8,
            )
            logging.info(f"Downsampling levels {downsampling_levels}")

            for step in range(downsampling_steps):
                # TODO: use step to compute the voxel size
                current_ratio = 2 ** (step + 1)
                logging.info(
                    f"Currently downsampling r{current_ratio}, {channel_info.time} and ch{channel_info.channel}"
                )
                downsampled_data = dask_filter.convolve(
                    current_level_data,
                    self.parameters.kernel.as_ndarray(),
                    mode="mirror",
                    cval=0.0,
                )
                # Keep every second voxel along each axis, halving the grid.
                downsampled_data = downsampled_data[::2, ::2, ::2]

                if current_ratio not in downsampling_levels:
                    continue

                if self.parameters.acceptance_threshold is not None:
                    logging.info("Using the acceptance threshold")
                    # Zero out voxels that fall below the acceptance threshold.
                    downsampled_data = da.where(
                        downsampled_data >= self.parameters.acceptance_threshold,
                        downsampled_data,
                        0,
                    )

                if self.parameters.is_mask:
                    # TODO: this is an inefficient point of usage, merge this
                    # with the acceptance threshold check
                    logging.info("Converting to Mask")
                    downsampled_data = da.where(
                        downsampled_data > self.parameters.acceptance_threshold, 1, 0
                    )

                stats = dask.compute(
                    da.mean(downsampled_data),
                    da.std(downsampled_data),
                    downsampled_data.max(),
                    downsampled_data.min(),
                )
                stats = DescriptiveStatistics(*stats)

                data_ref = OpaqueDataHandle(downsampled_data)
                data_ref.metadata = data.metadata

                # Only change things that are really different.
                data_ref.metadata.id = int(channel_info.time.split("_")[-1])
                data_ref.metadata.resolution = current_ratio
                data_ref.metadata.lattice_dimensions = Vector3(
                    downsampled_data.shape[0],
                    downsampled_data.shape[1],
                    downsampled_data.shape[2],
                )
                data_ref.metadata.channels.append(
                    ChannelMetadata(int(channel_info.channel), stats)
                )

                ret_value.append(data_ref)

                params = StoringParameters(
                    resolution_level=current_ratio,
                    time_frame=int(channel_info.time.split("_")[-1]),
                    channel=int(channel_info.channel),
                    storage_dtype=np.byte
                    if self.parameters.is_mask
                    else downsampled_data.dtype,
                    lattice_kind=data.metadata.kind,
                )
                WorkingStore.instance.store_lattice_time_frame(
                    params,
                    downsampled_data,
                    data.metadata.lattice_id,
                )
                current_level_data = downsampled_data
        return ret_value

    def _calculate_downsampling_steps_count(
        self,
        data: da.Array,
        downsampling_factor: int = 8,
    ) -> int:
        """Calculates the number of steps that are going to be taken during
        the downsampling of the input data.

        Parameters
        ----------
        data: da.Array
            The input data that shall be downsampled.
        downsampling_factor: int
            The factor of downsampling.

        Returns
        -------
        int:
            The number of downsampling steps.
        """

        steps_count: int = 0

        # Steps are calculated from bounds provided as downsampling
        # parameters, if any. In that case the maximal bound has priority over
        # the minimal bound, as it is the user's decision.
        # Otherwise we have to compute them manually.
        if self.parameters.downsampling_level_bounds:
            level_bounds: Bounds = self.parameters.downsampling_level_bounds
            if level_bounds.min:
                steps_count = int(math.log2(level_bounds.min))
            if level_bounds.max:
                steps_count = int(math.log2(level_bounds.max))
        else:
            input_grid_size: float = math.prod(data.shape)

            if input_grid_size <= MIN_GRID_SIZE:
                return 1

            file_size_in_bytes = data.dtype.itemsize * input_grid_size
            size_per_downsampling = file_size_in_bytes / to_bytes(
                self.parameters.size_per_level_bounds_in_mb.min
            )
            steps_count = int(math.log(size_per_downsampling, downsampling_factor))

        return steps_count

    def _calculate_downsampling_levels(
        self,
        data: da.Array,
        factor: int,
        downsampling_steps: int,
    ) -> List[int]:
        levels: List[int] = [2**x for x in range(1, downsampling_steps + 1)]

        if self.parameters.downsampling_level_bounds:
            level_bounds: Bounds = self.parameters.downsampling_level_bounds
            if level_bounds.max:
                predicate = lambda x: x <= level_bounds.max
                levels = [x for x in levels if predicate(x)]
            if level_bounds.min:
                predicate = lambda x: x >= level_bounds.min
                levels = [x for x in levels if predicate(x)]

        size_per_level: int = self.parameters.size_per_level_bounds_in_mb.max
        if size_per_level:
            # TODO: make this a parameter
            input_grid_size: float = math.prod(data.shape)
            file_size_in_bytes = data.dtype.itemsize * input_grid_size
            # TODO: this needs a better name
            n = math.ceil(
                math.log(
                    file_size_in_bytes
                    / (
                        # TODO: Make this a function or something
                        size_per_level * 1024**2
                    ),
                    factor,
                )
            )
            levels = [x for x in levels if x >= 2**n]

        if len(levels) == 0:
            raise RuntimeError(
                "No downsamplings could be saved because the max size per "
                f"channel ({size_per_level}) is too low"
            )
        return levels
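A minimal usage sketch of the downsampler above, assuming a working store rooted at ./work_store; the store path and the input handle are placeholders, not part of the distributed wheel:

import asyncio
from pathlib import Path

from volsegtools.downsampler.hierarchy_downsampler import HierarchyDownsampler
from volsegtools.model import WorkingStore

# Initialize the singleton zarr-backed working store (path is an assumption).
WorkingStore(Path("./work_store"))

downsampler = HierarchyDownsampler()

# `handle` would normally be the OpaqueDataHandle produced by the converter /
# preprocessor stage for the original-resolution lattice.
# downsampled_handles = asyncio.run(downsampler.downsample_lattice(handle))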
volsegtools/model/__init__.py
@@ -0,0 +1,13 @@
from .chunking_mode import ChunkingMode

# from .downsampling_data import Data, ChannelInfo, FlatChannelIterator
from .metadata import (
    ChannelMetadata,
    DescriptiveStatistics,
    Metadata,
    OriginalTimeFrameMetadata,
    TimeFrameMetadata,
)
from .opaque_data_handle import ChannelInfo, FlatChannelIterator, OpaqueDataHandle
from .storing_parameters import StoringParameters
from .working_store import WorkingStore
volsegtools/model/metadata.py
@@ -0,0 +1,50 @@
import dataclasses
from typing import List

from volsegtools.core import LatticeKind, Vector3


@dataclasses.dataclass
class DescriptiveStatistics:
    """Represents statistics that should be collected for some data set for
    it to be representable in CIF.
    """

    mean: float
    std: float
    max: float
    min: float


@dataclasses.dataclass
class ChannelMetadata:
    id: int
    statistics: DescriptiveStatistics


@dataclasses.dataclass
class TimeFrameMetadata:
    # TODO: rename to 'name'
    axis_order: Vector3 = dataclasses.field(default_factory=Vector3)
    lattice_id: str = "unknown"
    kind: LatticeKind = dataclasses.field(default=LatticeKind.VOLUME)
    id: int = -1
    resolution: int = -1
    origin: Vector3 = dataclasses.field(default_factory=Vector3)
    lattice_dimensions: Vector3 = dataclasses.field(default_factory=Vector3)
    voxel_size: Vector3 = dataclasses.field(default_factory=Vector3)
    channels: List[ChannelMetadata] = dataclasses.field(default_factory=list)


@dataclasses.dataclass
class OriginalTimeFrameMetadata(TimeFrameMetadata):
    axis_order: Vector3 = dataclasses.field(default_factory=Vector3)


@dataclasses.dataclass
class Metadata:
    original_time_frame: OriginalTimeFrameMetadata = dataclasses.field(
        default_factory=OriginalTimeFrameMetadata
    )
    time_frames: List[TimeFrameMetadata] = dataclasses.field(default_factory=list)
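For illustration, per-frame metadata might be assembled like this; the identifiers and numbers are placeholders:

from volsegtools.core import LatticeKind, Vector3
from volsegtools.model.metadata import (
    ChannelMetadata,
    DescriptiveStatistics,
    TimeFrameMetadata,
)

frame = TimeFrameMetadata(
    lattice_id="lattice_0",  # placeholder id
    kind=LatticeKind.VOLUME,
    id=0,
    resolution=2,
    lattice_dimensions=Vector3(64, 64, 64),
)
frame.channels.append(
    ChannelMetadata(0, DescriptiveStatistics(mean=0.1, std=0.02, max=1.0, min=0.0))
)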
volsegtools/model/opaque_data_handle.py
@@ -0,0 +1,112 @@
import dataclasses

import dask.array as da
import numpy as np
from numpy.typing import ArrayLike
from zarr.core.array import Array as ZarrArray

from volsegtools.abc.data_handle import DataHandle
from volsegtools.model.metadata import TimeFrameMetadata

# cuPy is an optional import to the volseg-tools (as CUDA may not be available
# everywhere)
try:
    import cupy as cp
except ImportError:
    # Mark the cupy module as unavailable.
    cp = None


class TimeFrameIterator: ...


class ResolutionIterator: ...


class ChannelIterator: ...


@dataclasses.dataclass
class ChannelInfo:
    resolution: str
    time: str
    channel: str
    data: ZarrArray


class FlatChannelIterator:
    def __init__(self, group):
        self.group = group
        self._iter = self._group_iter()

    def _group_iter(self):
        for resolution, resolution_group in self.group.groups():
            for time, time_group in resolution_group.groups():
                for channel, channel_arr in time_group.arrays():
                    yield ChannelInfo(resolution, time, channel, channel_arr)

    def __iter__(self):
        return self

    def __next__(self) -> ChannelInfo:
        return next(self._iter)


class OpaqueDataHandle(DataHandle):
    """Wrapper around the basic volseg-tools data model.

    The data held by the data model can potentially be very large and has to
    be stored in files in the file system (usually by leveraging a Zarr store).

    The handle is passed between the different pipeline stages and holds a
    reference to the data stored in the file system. When unwrapped, the data
    is loaded directly into memory as an array.
    """

    def __init__(self, reference):
        if type(reference) is np.ndarray:
            if reference.base is None:
                raise RuntimeError("Data reference can be made only from a view")
        self._metadata = TimeFrameMetadata()
        self._internal_repr = reference

    @property
    def metadata(self) -> TimeFrameMetadata:
        return self._metadata

    @metadata.setter
    def metadata(self, new_metadata) -> None:
        self._metadata = new_metadata

    def access(self) -> ArrayLike:
        return self._internal_repr

    def unwrap(self, target="numpy") -> ArrayLike:
        """
        Supported unwrapping targets are: numpy, dask, and optionally cupy.
        """
        match target:
            case "numpy":
                return self._repr_to_numpy_arr()
            case "dask":
                return self._repr_to_dask_arr()
            case "cupy":
                return self._repr_to_cupy_arr()
            case _:
                raise RuntimeError("Unknown unwrapping kind")

    def _repr_to_numpy_arr(self):
        if type(self._internal_repr) is np.ndarray:
            return self._internal_repr.copy()
        elif type(self._internal_repr) is ZarrArray:
            return self._internal_repr[:]

        raise RuntimeError(
            f"Unknown internal representation {type(self._internal_repr)}"
        )

    def _repr_to_dask_arr(self):
        # Both numpy views and zarr arrays can be wrapped lazily by dask.
        return da.from_array(self._internal_repr)

    def _repr_to_cupy_arr(self):
        if cp is None:
            raise RuntimeError("cupy is not available")
        # Materialize to numpy first, then move the data to the GPU.
        return cp.asarray(self._repr_to_numpy_arr())
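A short sketch of wrapping and unwrapping a handle; the array here is illustrative, in the pipeline the handle usually wraps a zarr array from the working store:

import numpy as np

from volsegtools.model.opaque_data_handle import OpaqueDataHandle

data = np.zeros((8, 8, 8), dtype=np.float32)
handle = OpaqueDataHandle(data[:])  # a numpy view; bare ndarrays are rejected
volume = handle.unwrap("numpy")     # copies the wrapped data back out as numpy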
volsegtools/model/storing_parameters.py
@@ -0,0 +1,52 @@
import numpy as np
import numpy.typing
import pydantic
from zarr.abc.codec import BytesBytesCodec
from zarr.codecs import BloscCodec

from volsegtools.core import LatticeKind
from volsegtools.model.chunking_mode import ChunkingMode


class StoringParameters(pydantic.BaseModel):
    """Parameters used for storing a volume or a segmentation.

    Attributes
    ----------
    is_compression_enabled: bool, default: False
        Whether compression is enabled.
    chunking_mode: ChunkingMode, default: ChunkingMode.AUTO
        Which chunking mode is used for this particular entry.
    storage_dtype: numpy.typing.DTypeLike, default: np.float64
        The dtype of the data stored in this entry.
    resolution_level: pydantic.NonNegativeInt, default: 0
        The resolution level of this entry.
    time_frame: pydantic.NonNegativeInt, default: 0
        The time frame this entry belongs to.
    channel: pydantic.NonNegativeInt, default: 0
        The channel this entry belongs to.
    compressor: BytesBytesCodec, default: BloscCodec()
        Which compression codec is going to be used.
    lattice_kind: LatticeKind, default: LatticeKind.VOLUME
        Whether this entry stores volume or segmentation data.
    """

    is_compression_enabled: bool = False
    chunking_mode: ChunkingMode = ChunkingMode.AUTO
    storage_dtype: numpy.typing.DTypeLike = np.float64
    resolution_level: pydantic.NonNegativeInt = 0
    time_frame: pydantic.NonNegativeInt = 0
    channel: pydantic.NonNegativeInt = 0
    compressor: BytesBytesCodec = BloscCodec()
    lattice_kind: LatticeKind = LatticeKind.VOLUME

    def __str__(self) -> str:
        return f"""Storing Parameters:
is_compression_enabled {self.is_compression_enabled}
chunking_mode {self.chunking_mode}
storage_dtype {self.storage_dtype}
resolution_level {self.resolution_level}
time_frame {self.time_frame}
channel {self.channel}
compressor {self.compressor}"""

    class Config:
        arbitrary_types_allowed = True
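For example, parameters for a compressed, byte-typed segmentation lattice at resolution level 2 could look like this; the values are illustrative:

import numpy as np

from volsegtools.core import LatticeKind
from volsegtools.model.storing_parameters import StoringParameters

params = StoringParameters(
    is_compression_enabled=True,
    storage_dtype=np.byte,
    resolution_level=2,
    time_frame=0,
    channel=0,
    lattice_kind=LatticeKind.SEGMENTATION,
)
print(params)  # uses the __str__ defined above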
volsegtools/model/working_store.py
@@ -0,0 +1,142 @@
import dataclasses
from pathlib import Path
from typing import Tuple

import dask.array as da
import numpy as np
import zarr
import zarr.storage

from volsegtools.core import LatticeKind
from volsegtools.model.chunking_mode import ChunkingMode
from volsegtools.model.metadata import Metadata
from volsegtools.model.opaque_data_handle import OpaqueDataHandle
from volsegtools.model.storing_parameters import StoringParameters


class Singleton(type):
    _instances = {}

    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            instance = super().__call__(*args, **kwargs)
            cls._instances[cls] = instance
        return cls._instances[cls]

    @property
    def instance(cls):
        return cls._instances[cls]


class WorkingStore(metaclass=Singleton):
    def __init__(self, store_path: Path):
        self.data_store = zarr.storage.LocalStore(root=store_path)
        self.root_group = zarr.create_group(store=self.data_store)

        self._metadata = Metadata()

        self.volume_dtype = np.float64
        self.is_volume_dtype_set = False

        self.segmentation_dtype = np.float64
        self.is_segmentation_dtype_set = False

        self._volume_data_group = self.root_group.require_group("volume_data")
        self._segmentation_data_group = self.root_group.require_group(
            "segmentation_data"
        )

    @property
    def metadata(self):
        return self._metadata

    @metadata.setter
    def metadata(self, value):
        self._metadata = value
        # TODO: it should return a dictionary
        # self.root_group.attrs.put(dataclasses.asdict(self._metadata))

    @property
    def volume_data_group(self):
        return self._volume_data_group

    @property
    def segmentation_data_group(self):
        return self._segmentation_data_group

    def get_data_array(
        self, lattice_id, resolution, time_frame, channel, kind=LatticeKind.VOLUME
    ):
        kind_group = self.get_data_group(kind)
        lattice_group = kind_group.require_group(lattice_id)
        resolution_group: zarr.Group = lattice_group.require_group(
            f"resolution_{resolution}"
        )
        time_frame_group: zarr.Group = resolution_group.require_group(
            f"time_frame_{time_frame}"
        )
        # FIX: this is unsafe, there should be some check!
        return list(time_frame_group.arrays())[channel][1][:]

    @staticmethod
    def _compute_chunk_size_based_on_data(
        data_shape: Tuple[int, ...],
    ) -> Tuple[int, ...]:
        chunks = tuple([int(i / 4) if i > 4 else i for i in data_shape])
        return chunks

    @staticmethod
    def _resolve_chunking_method(mode: ChunkingMode, data_shape: Tuple[int, ...]):
        match mode:
            case ChunkingMode.AUTO:
                return "auto"
            case ChunkingMode.NONE:
                return (0, 0)
            case ChunkingMode.CUSTOM:
                return WorkingStore._compute_chunk_size_based_on_data(data_shape)
            case _:
                raise RuntimeError("Unsupported chunking method!")

    def get_data_group(self, lattice_kind: LatticeKind):
        match lattice_kind:
            case LatticeKind.VOLUME:
                return self.volume_data_group
            case LatticeKind.SEGMENTATION:
                return self.segmentation_data_group
            case _:
                raise RuntimeError("Unknown lattice kind encountered.")

    def store_lattice_time_frame(
        self,
        params: StoringParameters,
        data: da.Array,
        lattice_id: str,
    ) -> OpaqueDataHandle:
        kind_group = self.get_data_group(params.lattice_kind)
        lattice_group = kind_group.require_group(lattice_id)
        resolution_group: zarr.Group = lattice_group.require_group(
            f"resolution_{params.resolution_level}"
        )
        time_frame_group: zarr.Group = resolution_group.require_group(
            f"time_frame_{params.time_frame}"
        )

        used_compressor = None
        if params.is_compression_enabled:
            used_compressor = params.compressor

        zarr_repr: zarr.Array = time_frame_group.create_array(
            name=str(params.channel),
            chunks=WorkingStore._resolve_chunking_method(
                params.chunking_mode, data.shape
            ),
            dtype=params.storage_dtype,
            compressors=[used_compressor] if used_compressor is not None else None,
            shape=data.shape,
            overwrite=True,
        )

        da.to_zarr(arr=data, url=zarr_repr, overwrite=True, compute=True)
        ref = OpaqueDataHandle(zarr_repr)
        ref.metadata.lattice_id = lattice_id
        return ref
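A minimal sketch of storing and reading back one time frame through the working store; the store path, lattice id, and array contents are placeholders:

from pathlib import Path

import dask.array as da

from volsegtools.model import StoringParameters, WorkingStore

store = WorkingStore(Path("./work_store"))      # first call creates the singleton

data = da.zeros((16, 16, 16), dtype="float64")  # illustrative lattice data
params = StoringParameters(resolution_level=1, time_frame=0, channel=0)

# Writes the dask array into the zarr hierarchy and returns an OpaqueDataHandle.
handle = store.store_lattice_time_frame(params, data, lattice_id="lattice_0")

# Reads the same channel back as an in-memory array.
same = store.get_data_array("lattice_0", resolution=1, time_frame=0, channel=0)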