annbatch-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



annbatch/sparse.py ADDED
@@ -0,0 +1,160 @@
+ from __future__ import annotations
+
+ import asyncio
+ from importlib.util import find_spec
+ from itertools import accumulate, chain, pairwise
+ from typing import NamedTuple, cast
+
+ import anndata as ad
+ import numpy as np
+ import zarr
+ import zarr.core.sync as zsync
+
+ if find_spec("torch"):
+     from torch.utils.data import IterableDataset as _IterableDataset
+ else:
+
+     class _IterableDataset:
+         pass
+
+
+ from annbatch.abc import AbstractIterableDataset, _assign_methods_to_ensure_unique_docstrings
+ from annbatch.utils import (
+     CSRContainer,
+     MultiBasicIndexer,
+     add_anndata_docstring,
+     add_anndatas_docstring,
+     add_dataset_docstring,
+     add_datasets_docstring,
+ )
+
+
+ class CSRDatasetElems(NamedTuple):
+     """Container for cached objects that will be indexed into to generate CSR matrices"""
+
+     indptr: np.ndarray
+     indices: zarr.AsyncArray
+     data: zarr.AsyncArray
+
+
+ class ZarrSparseDataset(  # noqa: D101
+     AbstractIterableDataset[ad.abc.CSRDataset, CSRContainer], _IterableDataset
+ ):
+     _dataset_elem_cache: dict[int, CSRDatasetElems] = {}
+
+     def _cache_update_callback(self):
+         """Callback for when datasets are added to ensure the cache is updated."""
+         return zsync.sync(self._ensure_cache())
+
+     def _validate(self, datasets: list[ad.abc.CSRDataset]):
+         if not all(isinstance(d, ad.abc.CSRDataset) for d in datasets):
+             raise TypeError("Cannot create sparse dataset using non-CSRDataset data")
+         if not all(cast("ad.abc.CSRDataset", d).backend == "zarr" for d in datasets):
+             raise TypeError(
+                 "Cannot use CSRDataset backed by h5ad at the moment: see https://github.com/zarr-developers/VirtualiZarr/pull/790"
+             )
+
+     async def _create_sparse_elems(self, idx: int) -> CSRDatasetElems:
+         """Fetch the in-memory indptr, and backed indices and data for a given dataset index.
+
+         Parameters
+         ----------
+         idx
+             The index
+
+         Returns
+         -------
+         The constituent elems of the CSR dataset.
+         """
+         indptr = await self._dataset_manager.train_datasets[idx].group._async_group.getitem("indptr")
+         return CSRDatasetElems(
+             *(
+                 await asyncio.gather(
+                     indptr.getitem(Ellipsis),
+                     self._dataset_manager.train_datasets[idx].group._async_group.getitem("indices"),
+                     self._dataset_manager.train_datasets[idx].group._async_group.getitem("data"),
+                 )
+             )
+         )
+
+     async def _ensure_cache(self):
+         """Build up the cache of datasets i.e., in-memory indptr, and backed indices and data."""
+         arr_idxs = [
+             idx for idx in range(len(self._dataset_manager.train_datasets)) if idx not in self._dataset_elem_cache
+         ]
+         all_elems = await asyncio.gather(
+             *(
+                 self._create_sparse_elems(idx)
+                 for idx in range(len(self._dataset_manager.train_datasets))
+                 if idx not in self._dataset_elem_cache
+             )
+         )
+         for idx, elems in zip(arr_idxs, all_elems, strict=True):
+             self._dataset_elem_cache[idx] = elems
+
+     async def _get_sparse_elems(self, dataset_idx: int) -> CSRDatasetElems:
+         """Return the arrays (zarr or otherwise) needed to represent on-disk data at a given index.
+
+         Parameters
+         ----------
+         dataset_idx
+             The index of the dataset whose arrays are sought.
+
+         Returns
+         -------
+         The arrays representing the sparse data.
+         """
+         if dataset_idx not in self._dataset_elem_cache:
+             await self._ensure_cache()
+         return self._dataset_elem_cache[dataset_idx]
+
+     async def _fetch_data(
+         self,
+         slices: list[slice],
+         dataset_idx: int,
+     ) -> CSRContainer:
+         # See https://github.com/scverse/anndata/blob/361325fc621887bf4f381e9412b150fcff599ff7/src/anndata/_core/sparse_dataset.py#L272-L295
+         # for the inspiration of this function.
+         indptr, indices, data = await self._get_sparse_elems(dataset_idx)
+         indptr_indices = [indptr[slice(s.start, s.stop + 1)] for s in slices]
+         indptr_limits = [slice(i[0], i[-1]) for i in indptr_indices]
+         indexer = MultiBasicIndexer(
+             [
+                 zarr.core.indexing.BasicIndexer((l,), shape=data.metadata.shape, chunk_grid=data.metadata.chunk_grid)
+                 for l in indptr_limits
+             ]
+         )
+         data_np, indices_np = await asyncio.gather(
+             data._get_selection(indexer, prototype=zarr.core.buffer.default_buffer_prototype()),
+             indices._get_selection(indexer, prototype=zarr.core.buffer.default_buffer_prototype()),
+         )
+         gaps = (s1.start - s0.stop for s0, s1 in pairwise(indptr_limits))
+         offsets = accumulate(chain([indptr_limits[0].start], gaps))
+         start_indptr = indptr_indices[0] - next(offsets)
+         if len(slices) < 2:  # there is only one slice so no need to concatenate
+             return CSRContainer(
+                 elems=(data_np, indices_np, start_indptr),
+                 shape=(start_indptr.shape[0] - 1, self._dataset_manager.n_var),
+                 dtype=data_np.dtype,
+             )
+         end_indptr = np.concatenate([s[1:] - o for s, o in zip(indptr_indices[1:], offsets, strict=True)])
+         indptr_np = np.concatenate([start_indptr, end_indptr])
+         return CSRContainer(
+             elems=(data_np, indices_np, indptr_np),
+             shape=(indptr_np.shape[0] - 1, self._dataset_manager.n_var),
+             dtype=data_np.dtype,
+         )
+
+
+ _assign_methods_to_ensure_unique_docstrings(ZarrSparseDataset)
+
+ ZarrSparseDataset.__doc__ = AbstractIterableDataset.__init__.__doc__.format(
+     array_type="sparse", child_class="ZarrSparseDataset"
+ )
+ ZarrSparseDataset.add_datasets.__doc__ = add_datasets_docstring.format(on_disk_array_type="anndata.abc.CSRDataset")
+ ZarrSparseDataset.add_dataset.__doc__ = add_dataset_docstring.format(on_disk_array_type="anndata.abc.CSRDataset")
+ ZarrSparseDataset.add_anndatas.__doc__ = add_anndatas_docstring.format(on_disk_array_type="anndata.abc.CSRDataset")
+ ZarrSparseDataset.add_anndata.__doc__ = add_anndata_docstring.format(on_disk_array_type="anndata.abc.CSRDataset")
+ ZarrSparseDataset.__iter__.__doc__ = AbstractIterableDataset.__iter__.__doc__.format(
+     gpu_array="cupyx.scipy.sparse.spmatrix", cpu_array="scipy.sparse.csr_matrix"
+ )
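
For orientation, `_fetch_data` stitches several requested row ranges of one on-disk CSR matrix into a single in-memory CSR: it slices the cached `indptr` in memory, fetches only the matching ranges of `indices` and `data` through zarr, and then shifts each slice's indptr so the pieces line up. The following standalone NumPy/SciPy sketch mirrors the same indptr arithmetic on an in-memory matrix; it is an illustration only, not part of the package, and the helper name `take_row_slices` is hypothetical (in-memory slicing stands in for the zarr reads).

from itertools import accumulate, chain, pairwise

import numpy as np
import scipy.sparse as sp


def take_row_slices(x: sp.csr_matrix, slices: list[slice]) -> sp.csr_matrix:
    """Concatenate several row slices of a CSR matrix using only indptr arithmetic."""
    indptr_indices = [x.indptr[s.start : s.stop + 1] for s in slices]
    # nnz range in data/indices covered by each row slice
    limits = [slice(int(i[0]), int(i[-1])) for i in indptr_indices]
    data = np.concatenate([x.data[l] for l in limits])
    indices = np.concatenate([x.indices[l] for l in limits])
    # shift each slice's indptr so the pieces line up after concatenation
    gaps = (s1.start - s0.stop for s0, s1 in pairwise(limits))
    offsets = accumulate(chain([limits[0].start], gaps))
    start_indptr = indptr_indices[0] - next(offsets)
    if len(slices) < 2:
        indptr = start_indptr
    else:
        end_indptr = np.concatenate([i[1:] - o for i, o in zip(indptr_indices[1:], offsets, strict=True)])
        indptr = np.concatenate([start_indptr, end_indptr])
    n_rows = sum(s.stop - s.start for s in slices)
    return sp.csr_matrix((data, indices, indptr), shape=(n_rows, x.shape[1]))


# take_row_slices(m, [slice(0, 2), slice(5, 8)]) selects rows 0, 1, 5, 6, 7 of m.
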
annbatch/types.py ADDED
@@ -0,0 +1,25 @@
+ from __future__ import annotations
+
+ from types import NoneType
+ from typing import TypeVar
+
+ import anndata as ad
+ import numpy as np
+ import zarr
+ from scipy import sparse as sp
+
+ from annbatch.utils import CSRContainer
+
+ try:
+     from cupy import ndarray as CupyArray
+     from cupyx.scipy.sparse import csr_matrix as CupyCSRMatrix  # pragma: no cover
+ except ImportError:
+     CupyCSRMatrix = NoneType
+     CupyArray = NoneType
+
+ OutputInMemoryArray = sp.csr_matrix | np.ndarray | CupyCSRMatrix | CupyArray
+
+ OnDiskArray = TypeVar("OnDiskArray", ad.abc.CSRDataset, zarr.Array)
+
+
+ InputInMemoryArray = TypeVar("InputInMemoryArray", CSRContainer, np.ndarray)
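
The `NoneType` fallback above keeps both the `OutputInMemoryArray` union and any downstream `isinstance` checks valid when cupy is not installed, since `NoneType` is a real class while `None` is not. A minimal standalone illustration of the pattern (the function `is_cupy_array` is hypothetical, not part of the package):

from types import NoneType

try:
    from cupy import ndarray as CupyArray  # only available when cupy is installed
except ImportError:
    # Binding the name to NoneType keeps isinstance(x, CupyArray) legal;
    # it then matches nothing except None itself.
    CupyArray = NoneType


def is_cupy_array(x: object) -> bool:
    """Return True only for cupy ndarrays; always False when cupy is absent."""
    return x is not None and isinstance(x, CupyArray)
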
annbatch/utils.py ADDED
@@ -0,0 +1,319 @@
+ from __future__ import annotations
+
+ import asyncio
+ import warnings
+ from dataclasses import dataclass
+ from functools import cached_property
+ from importlib.util import find_spec
+ from itertools import islice
+ from types import NoneType
+ from typing import TYPE_CHECKING, Protocol
+
+ import numpy as np
+ import scipy as sp
+ import zarr
+
+ try:
+     from cupy import ndarray as CupyArray
+     from cupyx.scipy.sparse import csr_matrix as CupyCSRMatrix  # pragma: no cover
+ except ImportError:
+     # NoneType (rather than None) keeps the isinstance checks in `to_torch` valid when cupy is absent.
+     CupyArray = NoneType
+     CupyCSRMatrix = NoneType
+
+ if TYPE_CHECKING:
+     from collections import OrderedDict
+     from collections.abc import Awaitable, Callable, Generator, Iterable
+
+     from torch import Tensor
+
+     from annbatch.types import InputInMemoryArray, OutputInMemoryArray
+
+
+ def split_given_size(a: np.ndarray, size: int) -> list[np.ndarray]:
+     """Wrapper around `np.split` to split an array into chunks of length `size` (the last chunk may be shorter)."""
+     return np.split(a, np.arange(size, len(a), size))
+
+
+ @dataclass
+ class CSRContainer:
+     """A low-cost container for moving around the buffers of a CSR object"""
+
+     elems: tuple[np.ndarray, np.ndarray, np.ndarray]
+     shape: tuple[int, int]
+     dtype: np.dtype
+
+
+ def _batched[T](iterable: Iterable[T], n: int) -> Generator[list[T], None, None]:
+     if n < 1:
+         raise ValueError("n must be >= 1")
+     it = iter(iterable)
+     while batch := list(islice(it, n)):
+         yield batch
+
+
+ async def index_datasets(
+     dataset_index_to_slices: OrderedDict[int, list[slice]],
+     fetch_data: Callable[[list[slice], int], Awaitable[CSRContainer | np.ndarray]],
+ ) -> list[InputInMemoryArray]:
+     """Helper function meant to encapsulate asynchronous calls so that we can use the same event loop as zarr.
+
+     Parameters
+     ----------
+     dataset_index_to_slices
+         A lookup from the list-placement index of a dataset to the requested slices.
+     fetch_data
+         The function to do the fetching for a given slice-dataset index pair.
+     """
+     tasks = []
+     for dataset_idx in dataset_index_to_slices.keys():
+         tasks.append(
+             fetch_data(
+                 dataset_index_to_slices[dataset_idx],
+                 dataset_idx,
+             )
+         )
+     return await asyncio.gather(*tasks)
+
+
+ add_datasets_docstring = """\
+ Append datasets to this dataset.
+
+ Parameters
+ ----------
+ datasets
+     List of :class:`{on_disk_array_type}` objects, generally from :attr:`anndata.AnnData.X`.
+ obs
+     List of :class:`numpy.ndarray` labels, generally from :attr:`anndata.AnnData.obs`.
+ """
+
+ add_dataset_docstring = """\
+ Append a dataset to this dataset.
+
+ Parameters
+ ----------
+ dataset
+     :class:`{on_disk_array_type}` object, generally from :attr:`anndata.AnnData.X`.
+ obs
+     :class:`numpy.ndarray` labels for the anndata, generally from :attr:`anndata.AnnData.obs`.
+ """
+
+
+ add_anndatas_docstring = """\
+ Append anndatas to this dataset.
+
+ Parameters
+ ----------
+ anndatas
+     List of :class:`anndata.AnnData` objects, with :class:`{on_disk_array_type}` as the data matrix.
+ obs_keys
+     List of :attr:`anndata.AnnData.obs` column labels.
+ layer_keys
+     List of :attr:`anndata.AnnData.layers` keys; if None, :attr:`anndata.AnnData.X` will be used.
+ """
+
+ add_anndata_docstring = """\
+ Append an anndata to this dataset.
+
+ Parameters
+ ----------
+ anndata
+     :class:`anndata.AnnData` object, with :class:`{on_disk_array_type}` as the data matrix.
+ obs_key
+     :attr:`anndata.AnnData.obs` column label.
+ layer_key
+     :attr:`anndata.AnnData.layers` key; if None, :attr:`anndata.AnnData.X` will be used.
+ """
+
+
+ # TODO: make this part of the public zarr or zarrs-python API.
+ # We can do chunk coalescing in zarrs based on integer arrays, so I think
+ # that would make sense here with ezclump or similar.
+ # Another "solution" would be for zarrs to support integer indexing properly, if that pipeline works,
+ # or to make this an "experimental setting" and use integer indexing for the zarr-python pipeline.
+ # See: https://github.com/zarr-developers/zarr-python/issues/3175 for why this is better than simpler alternatives.
+ class MultiBasicIndexer(zarr.core.indexing.Indexer):
+     """Custom indexer to enable joint fetching of disparate slices"""
+
+     def __init__(self, indexers: list[zarr.core.indexing.Indexer]):
+         self.shape = (sum(i.shape[0] for i in indexers), *indexers[0].shape[1:])
+         self.drop_axes = indexers[0].drop_axes  # maybe?
+         self.indexers = indexers
+
+     def __iter__(self):
+         total = 0
+         for i in self.indexers:
+             for c in i:
+                 out_selection = c[2]
+                 gap = out_selection[0].stop - out_selection[0].start
+                 yield type(c)(c[0], c[1], (slice(total, total + gap), *out_selection[1:]), c[3])
+                 total += gap
+
+
+ def sample_rows(
+     x_list: list[np.ndarray],
+     obs_list: list[np.ndarray] | None,
+     indices: list[np.ndarray] | None = None,
+     *,
+     shuffle: bool = True,
+ ) -> Generator[tuple[np.ndarray, np.ndarray | None], None, None]:
+     """Sample rows from multiple arrays and their corresponding observation arrays.
+
+     Parameters
+     ----------
+     x_list
+         A list of numpy arrays containing the data to sample from.
+     obs_list
+         A list of numpy arrays containing the corresponding observations.
+     indices
+         The list of indices for each element in `x_list`.
+     shuffle
+         Whether to shuffle the rows before sampling.
+
+     Yields
+     ------
+     tuple
+         A tuple containing a row from `x_list` and the corresponding row from `obs_list`.
+     """
+     lengths = np.fromiter((x.shape[0] for x in x_list), dtype=int)
+     cum = np.concatenate(([0], np.cumsum(lengths)))
+     total = cum[-1]
+     idxs = np.arange(total)
+     if shuffle:
+         np.random.default_rng().shuffle(idxs)
+     arr_idxs = np.searchsorted(cum, idxs, side="right") - 1
+     row_idxs = idxs - cum[arr_idxs]
+     for ai, ri in zip(arr_idxs, row_idxs, strict=True):
+         res = [
+             x_list[ai][ri],
+             obs_list[ai][ri] if obs_list is not None else None,
+         ]
+         if indices is not None:
+             yield (*res, indices[ai][ri])
+         else:
+             yield tuple(res)
+
+
+ class WorkerHandle:  # noqa: D101
+     @cached_property
+     def _worker_info(self):
+         if find_spec("torch"):
+             from torch.utils.data import get_worker_info
+
+             return get_worker_info()
+         return None
+
+     @cached_property
+     def _rng(self):
+         if self._worker_info is None:
+             return np.random.default_rng()
+         else:
+             # This is used for the _get_chunks function.
+             # Use the same seed for all workers so that the resulting splits are the same across workers.
+             # The torch default seed is `base_seed + worker_id`; hence, subtract worker_id to get the base seed.
+             return np.random.default_rng(self._worker_info.seed - self._worker_info.id)
+
+     def shuffle(self, obj: np.typing.ArrayLike) -> None:
+         """Perform in-place shuffle.
+
+         Parameters
+         ----------
+         obj
+             The object to be shuffled
+         """
+         self._rng.shuffle(obj)
+
+     def get_part_for_worker(self, obj: np.ndarray) -> np.ndarray:
+         """Get a chunk of an incoming array according to the current worker id.
+
+         Parameters
+         ----------
+         obj
+             Incoming array
+
+         Returns
+         -------
+         An evenly split part of the array corresponding to how many workers there are.
+         """
+         if self._worker_info is None:
+             return obj
+         num_workers, worker_id = self._worker_info.num_workers, self._worker_info.id
+         chunks_split = np.array_split(obj, num_workers)
+         return chunks_split[worker_id]
+
+
+ def check_lt_1(vals: list[int], labels: list[str]) -> None:
+     """Raise a ValueError if any of the values are less than one.
+
+     The format of the error is "{labels[i]} must be greater than or equal to 1, got {vals[i]}"
+     and is raised for the first value found to be less than one.
+
+     Parameters
+     ----------
+     vals
+         The values to check (each must be at least 1).
+     labels
+         The label for the value in the error if the value is less than one.
+
+     Raises
+     ------
+     ValueError
+         If any of the values is less than one.
+     """
+     if any(is_lt_1 := [v < 1 for v in vals]):
+         label, value = next(
+             (label, value)
+             for label, value, check in zip(
+                 labels,
+                 vals,
+                 is_lt_1,
+                 strict=True,
+             )
+             if check
+         )
+         raise ValueError(f"{label} must be greater than or equal to 1, got {value}")
+
+
+ class SupportsShape(Protocol):  # noqa: D101
+     @property
+     def shape(self) -> tuple[int, int] | list[int]: ...  # noqa: D102
+
+
+ def check_var_shapes(objs: list[SupportsShape]) -> None:
+     """Small utility function to check that all objects have the same shape along the second axis"""
+     if not all(objs[0].shape[1] == d.shape[1] for d in objs):
+         raise ValueError("All datasets must have the same shape along the var axis.")
+
+
+ def to_torch(input: OutputInMemoryArray, preload_to_gpu: bool) -> Tensor:
+     """Send the input data to a torch.Tensor"""
+     import torch
+
+     if isinstance(input, torch.Tensor):
+         return input
+     if isinstance(input, sp.sparse.csr_matrix):
+         with warnings.catch_warnings():
+             warnings.filterwarnings("ignore", "Sparse CSR tensor support is in beta state", UserWarning)
+             tensor = torch.sparse_csr_tensor(
+                 torch.from_numpy(input.indptr),
+                 torch.from_numpy(input.indices),
+                 torch.from_numpy(input.data),
+                 input.shape,
+             )
+         if preload_to_gpu:
+             return tensor.cuda(non_blocking=True)
+         return tensor
+     if isinstance(input, np.ndarray):
+         tensor = torch.from_numpy(input)
+         if preload_to_gpu:
+             return tensor.cuda(non_blocking=True)
+         return tensor
+     if isinstance(input, CupyArray):
+         return torch.from_dlpack(input)
+     if isinstance(input, CupyCSRMatrix):
+         with warnings.catch_warnings():
+             warnings.filterwarnings("ignore", "Sparse CSR tensor support is in beta state", UserWarning)
+             return torch.sparse_csr_tensor(
+                 torch.from_dlpack(input.indptr),
+                 torch.from_dlpack(input.indices),
+                 torch.from_dlpack(input.data),
+                 input.shape,
+             )
+     raise TypeError(f"Cannot convert {type(input)} to torch.Tensor")
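
Assuming torch (and optionally cupy) is installed, a brief usage sketch of the helpers defined in utils.py; the example data is illustrative, and only the function names and signatures come from the code above.

import numpy as np
import scipy.sparse as sp

from annbatch.utils import sample_rows, split_given_size, to_torch

# split_given_size: chunks of length 4; the last chunk holds the remainder.
chunks = split_given_size(np.arange(10), size=4)  # lengths 4, 4, 2

# sample_rows: yield (row, label) pairs drawn across several arrays, shuffled.
x_list = [np.random.rand(3, 5), np.random.rand(2, 5)]
obs_list = [np.array(["a", "b", "c"]), np.array(["d", "e"])]
rows = list(sample_rows(x_list, obs_list, shuffle=True))  # 5 pairs in total

# to_torch: dense ndarray -> torch.Tensor, scipy CSR -> torch sparse CSR tensor.
dense = to_torch(np.random.rand(4, 5).astype(np.float32), preload_to_gpu=False)
sparse = to_torch(sp.random(4, 5, density=0.2, format="csr", dtype=np.float32), preload_to_gpu=False)
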