anndata 0.12.0rc3__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
anndata/_core/aligned_df.py CHANGED
@@ -59,7 +59,7 @@ def _gen_dataframe_mapping(
     df = pd.DataFrame(
         anno,
         index=None if length is None else mk_index(length),
-        columns=None if len(anno) else [],
+        columns=None if anno else [],
     )
 
     if length is None:
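The new spelling leans on mapping truthiness: an empty dict is falsy, so `None if anno else []` selects the explicit empty column list exactly when `len(anno)` is zero. A minimal sketch of the branch, assuming `anno` is a plain dict as in the mapping case this function handles:

    import pandas as pd

    for anno in ({"cell_type": ["a", "b"]}, {}):
        # Non-empty mapping -> let pandas infer columns; empty mapping -> explicit empty columns.
        df = pd.DataFrame(anno, columns=None if anno else [])
        print(list(df.columns))  # ['cell_type'], then []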
anndata/_core/aligned_mapping.py CHANGED
@@ -79,7 +79,7 @@ class AlignedMappingBase(MutableMapping[str, Value], ABC):
         elif isinstance(val, np.ndarray | CupyArray) and len(val.shape) == 1:
             val = val.reshape((val.shape[0], 1))
         elif isinstance(val, XDataset):
-            val = Dataset2D(data_vars=val.data_vars, coords=val.coords, attrs=val.attrs)
+            val = Dataset2D(val)
         for i, axis in enumerate(self.axes):
             if self.parent.shape[axis] == axis_len(val, i):
                 continue
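Together with the `a.ds` accesses and `Dataset2D(...)` calls in the merge.py hunks further down, this change indicates that `Dataset2D` now wraps an `xarray.Dataset` by composition and its constructor accepts the dataset whole, instead of being rebuilt from `data_vars`/`coords`/`attrs`. A minimal sketch of that wrapper pattern, using a hypothetical class name:

    import xarray as xr

    class Wrapped2D:
        # Hypothetical stand-in for Dataset2D: composition instead of reconstruction.
        def __init__(self, ds: xr.Dataset):
            self.ds = ds  # the wrapped dataset, exposed like `a.ds` in the hunks below

        @property
        def shape(self) -> tuple[int, int]:
            (n_rows,) = set(self.ds.sizes.values())  # assumes a single shared dim
            return n_rows, len(self.ds.data_vars)

    w = Wrapped2D(xr.Dataset({"x": ("obs", [1, 2, 3])}))
    print(w.shape)  # (3, 1)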
anndata/_core/anndata.py CHANGED
@@ -62,7 +62,7 @@ if TYPE_CHECKING:
     from .index import Index
 
 
-class AnnData(metaclass=utils.DeprecationMixinMeta):
+class AnnData(metaclass=utils.DeprecationMixinMeta):  # noqa: PLW1641
     """\
     An annotated data matrix.
 
anndata/_core/index.py CHANGED
@@ -198,7 +198,8 @@ def _subset_sparse(a: CSMatrix | CSArray, subset_idx: Index):
 
 
 @_subset.register(pd.DataFrame)
-def _subset_df(df: pd.DataFrame, subset_idx: Index):
+@_subset.register(Dataset2D)
+def _subset_df(df: pd.DataFrame | Dataset2D, subset_idx: Index):
     return df.iloc[subset_idx]
 
 
@@ -209,15 +210,6 @@ def _subset_awkarray(a: AwkArray, subset_idx: Index):
     return a[subset_idx]
 
 
-@_subset.register(Dataset2D)
-def _(a: Dataset2D, subset_idx: Index):
-    key = a.index_dim
-    # xarray seems to have some code looking for a second entry in tuples
-    if isinstance(subset_idx, tuple) and len(subset_idx) == 1:
-        subset_idx = subset_idx[0]
-    return a.isel(**{key: subset_idx})
-
-
 # Registration for SparseDataset occurs in sparse_dataset.py
 @_subset.register(h5py.Dataset)
 def _subset_dataset(d, subset_idx):
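`_subset` is a `functools.singledispatch` function, so the stacked `@_subset.register(Dataset2D)` in the hunk above routes `Dataset2D` through the same positional `.iloc` path as `pd.DataFrame`, which is what makes the bespoke `isel`-based handler deleted here redundant (it presumes `Dataset2D` now exposes a DataFrame-style `.iloc`). A runnable sketch of stacked registration using only pandas types:

    from functools import singledispatch

    import pandas as pd

    @singledispatch
    def subset(obj, idx):
        raise NotImplementedError(type(obj))

    @subset.register(pd.DataFrame)
    @subset.register(pd.Series)  # stacked: one handler serves both registered types
    def _(obj, idx):
        return obj.iloc[idx]

    df = pd.DataFrame({"x": [1, 2, 3]})
    print(subset(df, [0, 2])["x"].tolist())  # [1, 3]
    print(subset(df["x"], [0, 2]).tolist())  # [1, 3], same handler for a Series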
anndata/_core/merge.py CHANGED
@@ -17,7 +17,6 @@ import pandas as pd
 import scipy
 from natsort import natsorted
 from packaging.version import Version
-from pandas.api.types import is_extension_array_dtype
 from scipy import sparse
 
 from anndata._core.file_backing import to_memory
@@ -46,7 +45,7 @@ if TYPE_CHECKING:
 
     from anndata._types import Join_T
 
-    from ..compat import XDataArray
+    from ..compat import XDataArray, XDataset
 
 T = TypeVar("T")
 
@@ -130,6 +129,8 @@ def equal(a, b) -> bool:
 
 
 @equal.register(pd.DataFrame)
+@equal.register(Dataset2D)
+@equal.register(pd.Series)
 def equal_dataframe(a, b) -> bool:
     return a.equals(b)
 
@@ -170,11 +171,6 @@ def equal_cupyarray(a, b) -> bool:
     return bool(cp.array_equal(a, b, equal_nan=True))
 
 
-@equal.register(pd.Series)
-def equal_series(a, b) -> bool:
-    return a.equals(b)
-
-
 @equal.register(CSMatrix)
 @equal.register(CSArray)
 @equal.register(CupySparseMatrix)
@@ -189,6 +185,15 @@ def equal_sparse(a, b) -> bool:
         # Comparison broken for CSC matrices
         # https://github.com/cupy/cupy/issues/7757
         a, b = CupyCSRMatrix(a), CupyCSRMatrix(b)
+    if Version(scipy.__version__) >= Version("1.16.0rc1"):
+        # TODO: https://github.com/scipy/scipy/issues/23068
+        return bool(
+            a.format == b.format
+            and (a.shape == b.shape)
+            and np.all(a.indptr == b.indptr)
+            and np.all(a.indices == b.indices)
+            and np.all((a.data == b.data) | (np.isnan(a.data) & np.isnan(b.data)))
+        )
     comp = a != b
     if isinstance(comp, bool):
         return not comp
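The new branch for scipy >= 1.16 compares the raw compressed buffers instead of evaluating `a != b`, whose behavior changed (see the scipy issue referenced in the TODO). The last conjunct is the subtle one: `a.data == b.data` alone reports False wherever both sides hold NaN, so it is OR-ed with a both-NaN mask. A runnable sketch on plain scipy arrays, assuming both operands store their entries in the same canonical order:

    import numpy as np
    from scipy import sparse

    a = sparse.csr_array(np.array([[1.0, 0.0], [np.nan, 2.0]]))
    b = sparse.csr_array(np.array([[1.0, 0.0], [np.nan, 2.0]]))

    equal = bool(
        a.format == b.format
        and a.shape == b.shape
        and np.all(a.indptr == b.indptr)
        and np.all(a.indices == b.indices)
        # elementwise equal OR both NaN (NaN != NaN under IEEE 754)
        and np.all((a.data == b.data) | (np.isnan(a.data) & np.isnan(b.data)))
    )
    print(equal)  # True despite the NaNs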
@@ -215,11 +220,6 @@ def equal_awkward(a, b) -> bool:
     return ak.almost_equal(a, b)
 
 
-@equal.register(Dataset2D)
-def equal_dataset2d(a, b) -> bool:
-    return a.equals(b)
-
-
 def as_sparse(x, *, use_sparse_array: bool = False) -> CSMatrix | CSArray:
     if not isinstance(x, CSMatrix | CSArray):
         in_memory_array_class = (
@@ -584,8 +584,8 @@ class Reindexer:
         """
         if self.no_change and (axis_len(el, axis) == len(self.old_idx)):
             return el
-        if isinstance(el, pd.DataFrame):
-            return self._apply_to_df(el, axis=axis, fill_value=fill_value)
+        if isinstance(el, pd.DataFrame | Dataset2D):
+            return self._apply_to_df_like(el, axis=axis, fill_value=fill_value)
         elif isinstance(el, CSMatrix | CSArray | CupySparseMatrix):
             return self._apply_to_sparse(el, axis=axis, fill_value=fill_value)
         elif isinstance(el, AwkArray):
@@ -594,12 +594,10 @@ class Reindexer:
             return self._apply_to_dask_array(el, axis=axis, fill_value=fill_value)
         elif isinstance(el, CupyArray):
             return self._apply_to_cupy_array(el, axis=axis, fill_value=fill_value)
-        elif isinstance(el, Dataset2D):
-            return self._apply_to_dataset2d(el, axis=axis, fill_value=fill_value)
         else:
             return self._apply_to_array(el, axis=axis, fill_value=fill_value)
 
-    def _apply_to_df(self, el: pd.DataFrame, *, axis, fill_value=None):
+    def _apply_to_df_like(self, el: pd.DataFrame | Dataset2D, *, axis, fill_value=None):
         if fill_value is None:
             fill_value = np.nan
         return el.reindex(self.new_idx, axis=axis, fill_value=fill_value)
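Folding `_apply_to_df` and the `_apply_to_dataset2d` method deleted in the next hunk into a single `_apply_to_df_like` works because `Dataset2D` evidently now implements the pandas-style `reindex(index, axis=..., fill_value=...)` signature itself, extension-array columns included. For reference, the pandas behavior the shared path relies on:

    import numpy as np
    import pandas as pd

    el = pd.DataFrame({"x": [1.0, 2.0]}, index=["a", "b"])
    new_idx = pd.Index(["b", "c"])  # "c" does not exist in the old index

    # Present labels are carried over; missing ones are filled with fill_value.
    print(el.reindex(new_idx, axis=0, fill_value=np.nan))
    #      x
    # b  2.0
    # c  NaN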
@@ -758,31 +756,6 @@ class Reindexer:
             el = ak.pad_none(el, 1, axis=axis)  # axis == 0
         return el[self.idx]
 
-    def _apply_to_dataset2d(self, el: Dataset2D, *, axis, fill_value=None):
-        if fill_value is None:
-            fill_value = np.nan
-        index_dim = el.index_dim
-        if axis == 0:
-            # Dataset.reindex() can't handle ExtensionArrays
-            extension_arrays = {
-                col: arr for col, arr in el.items() if is_extension_array_dtype(arr)
-            }
-            el = el.drop_vars(extension_arrays.keys())
-            el = el.reindex(
-                {index_dim: self.new_idx}, method=None, fill_value=fill_value
-            )
-            for col, arr in extension_arrays.items():
-                el[col] = (
-                    index_dim,
-                    pd.Series(arr, index=self.old_idx).reindex(
-                        self.new_idx, fill_value=fill_value
-                    ),
-                )
-            return el
-        else:
-            msg = "This should be unreachable, please open an issue."
-            raise Exception(msg)
-
     @property
     def idx(self):
         return self.old_idx.get_indexer(self.new_idx)
@@ -931,12 +904,6 @@ def concat_arrays(  # noqa: PLR0911, PLR0912
             ],
             format="csr",
         )
-        scipy_version = Version(scipy.__version__)
-        # Bug where xstack produces a matrix not an array in 1.11.*
-        if use_sparse_array and (scipy_version.major, scipy_version.minor) == (1, 11):
-            if mat.format == "csc":
-                return sparse.csc_array(mat)
-            return sparse.csr_array(mat)
         return mat
     else:
         return np.concatenate(
@@ -1294,7 +1261,7 @@ def make_dask_col_from_extension_dtype(
 
 def make_xarray_extension_dtypes_dask(
     annotations: Iterable[Dataset2D], *, use_only_object_dtype: bool = False
-) -> Generator[Dataset2D, None, None]:
+) -> Generator[XDataset, None, None]:
     """
     Creates a generator of Dataset2D objects with dask arrays in place of :class:`pandas.api.extensions.ExtensionArray` dtype columns.
 
@@ -1323,7 +1290,7 @@ def make_xarray_extension_dtypes_dask(
                 if name in extension_cols
                 else col
             )
-            for name, col in a.items()
+            for name, col in a._items()
         }
     )
 
@@ -1336,30 +1303,26 @@ def concat_dataset2d_on_annot_axis(
     join: Join_T,
     *,
     force_lazy: bool,
-    label: str | None = None,
-    label_col: pd.Categorical | None = None,
+    concat_indices: pd.Index | None = None,
 ) -> Dataset2D:
-    """Create a concatenate dataset from a list of :class:`~anndata._core.xarray.Dataset2D` objects.
+    """Create a concatenate dataset from a list of :class:`~anndata.experimental.backed.Dataset2D` objects.
     The goal of this function is to mimic `pd.concat(..., ignore_index=True)` so has some complicated logic
     for handling the "index" to ensure (a) nothing is loaded into memory and (b) the true index is always tracked.
 
     Parameters
     ----------
     annotations
-        The :class:`~anndata._core.xarray.Dataset2D` objects to be concatenated.
+        The :class:`~anndata.experimental.backed.Dataset2D` objects to be concatenated.
     join
         Type of join operation
     force_lazy
         Whether to lazily concatenate elements using dask even when eager concatenation is possible.
-    label
-        Column in axis annotation (i.e. `.obs` or `.var`) to place batch information in.
-        If it's None, no column is added.
-    label_col
-        The bath information annotation.
+    concat_indices
+        Already calculated indices to be used as the index on the concatenated object.
 
     Returns
     -------
-    Concatenated :class:`~anndata._core.xarray.Dataset2D`
+    Concatenated :class:`~anndata.experimental.backed.Dataset2D`
     """
     from anndata._core.xarray import Dataset2D
     from anndata._io.specs.lazy_methods import DUMMY_RANGE_INDEX_KEY
@@ -1375,49 +1338,65 @@ def concat_dataset2d_on_annot_axis(
         old_key = a.index_dim
         is_fake_index = old_key != a.true_index_dim
         # First create a dummy index
-        a.coords[DS_CONCAT_DUMMY_INDEX_NAME] = (
+        a.ds.coords[DS_CONCAT_DUMMY_INDEX_NAME] = (
             old_key,
             pd.RangeIndex(a.shape[0]),
         )
         # Set all the dimensions to this new dummy index
-        a = a.swap_dims({old_key: DS_CONCAT_DUMMY_INDEX_NAME})
+        ds_swapped = a.ds.swap_dims({old_key: DS_CONCAT_DUMMY_INDEX_NAME})
         # Move the old coordinate into a variable
-        old_coord = a.coords[old_key]
-        del a.coords[old_key]
-        a[old_key] = old_coord
+        old_coord = ds_swapped.coords[old_key]
+        del ds_swapped.coords[old_key]
+        ds_swapped[old_key] = old_coord
+        a = Dataset2D(ds_swapped)
         if not is_fake_index:
             a.true_index_dim = old_key
         annotations_re_indexed.append(a)
     # Concat along the dummy index
-    ds = Dataset2D(
-        xr.concat(annotations_re_indexed, join=join, dim=DS_CONCAT_DUMMY_INDEX_NAME),
-    )
-    ds.is_backed = have_backed
-    ds.coords[DS_CONCAT_DUMMY_INDEX_NAME] = pd.RangeIndex(
-        ds.coords[DS_CONCAT_DUMMY_INDEX_NAME].shape[0]
+    ds_concat = xr.concat(
+        [a.ds for a in annotations_re_indexed],
+        join=join,
+        dim=DS_CONCAT_DUMMY_INDEX_NAME,
     )
+    ds_concat.attrs.pop("indexing_key", None)
+    # Wrapping allows us to use the Dataset2D methods
+    # directly for setting certain attrs/coords without duplicating here.
+    ds_concat_2d = Dataset2D(ds_concat)
+    ds_concat_2d.is_backed = have_backed
+    if concat_indices is not None:
+        concat_indices.name = DS_CONCAT_DUMMY_INDEX_NAME
+        ds_concat_2d.index = concat_indices
+        ds_concat = ds_concat_2d.ds
+    else:
+        ds_concat.coords[DS_CONCAT_DUMMY_INDEX_NAME] = pd.RangeIndex(
+            ds_concat.coords[DS_CONCAT_DUMMY_INDEX_NAME].shape[0]
+        )
     # Drop any lingering dimensions (swap doesn't delete)
-    ds = ds.drop_dims(d for d in ds.dims if d != DS_CONCAT_DUMMY_INDEX_NAME)
+    ds_concat = ds_concat.drop_dims(
+        d for d in ds_concat.dims if d != DS_CONCAT_DUMMY_INDEX_NAME
+    )
     # Create a new true index and then delete the columns resulting from the concatenation for each index.
     # This includes the dummy column (which is neither a dimension nor a true indexing column)
-    index = xr.concat(
-        [a.true_xr_index for a in annotations_re_indexed],
-        dim=DS_CONCAT_DUMMY_INDEX_NAME,
-    )
-    # prevent duplicate values
-    index.coords[DS_CONCAT_DUMMY_INDEX_NAME] = ds.coords[DS_CONCAT_DUMMY_INDEX_NAME]
-    ds.coords[DS_CONCAT_DUMMY_INDEX_NAME] = index
+    if concat_indices is None:
+        index = xr.concat(
+            [a.true_xr_index for a in annotations_re_indexed],
+            dim=DS_CONCAT_DUMMY_INDEX_NAME,
+        )
+        # prevent duplicate values
+        index.coords[DS_CONCAT_DUMMY_INDEX_NAME] = ds_concat.coords[
+            DS_CONCAT_DUMMY_INDEX_NAME
+        ]
+        ds_concat.coords[DS_CONCAT_DUMMY_INDEX_NAME] = index
     for key in {
         true_index
         for a in annotations_re_indexed
         if (true_index := a.true_index_dim) != a.index_dim
     }:
-        del ds[key]
-    if DUMMY_RANGE_INDEX_KEY in ds:
-        del ds[DUMMY_RANGE_INDEX_KEY]
-    if label is not None and label_col is not None:
-        ds[label] = (DS_CONCAT_DUMMY_INDEX_NAME, label_col)
-    return ds
+        del ds_concat[key]
+    if DUMMY_RANGE_INDEX_KEY in ds_concat:
+        del ds_concat[DUMMY_RANGE_INDEX_KEY]
+    ds_concat_2d = Dataset2D(ds_concat)
+    return ds_concat_2d
 
 
 def concat(  # noqa: PLR0912, PLR0913, PLR0915
  def concat( # noqa: PLR0912, PLR0913, PLR0915
@@ -1691,14 +1670,15 @@ def concat( # noqa: PLR0912, PLR0913, PLR0915
1691
1670
  ignore_index=True,
1692
1671
  )
1693
1672
  concat_annot.index = concat_indices
1694
- if label is not None:
1695
- concat_annot[label] = label_col
1696
1673
  else:
1697
1674
  concat_annot = concat_dataset2d_on_annot_axis(
1698
- annotations, join, force_lazy=force_lazy, label=label, label_col=label_col
1675
+ annotations,
1676
+ join,
1677
+ force_lazy=force_lazy,
1678
+ concat_indices=concat_indices,
1699
1679
  )
1700
- concat_indices.name = DS_CONCAT_DUMMY_INDEX_NAME
1701
- concat_annot.index = concat_indices
1680
+ if label is not None:
1681
+ concat_annot[label] = label_col
1702
1682
 
1703
1683
  # Annotation for other axis
1704
1684
  alt_annotations = [getattr(a, alt_axis_name) for a in adatas]
@@ -1720,7 +1700,7 @@ def concat(  # noqa: PLR0912, PLR0913, PLR0915
             )
         )
         annotations_with_only_dask = [
-            a.rename({a.true_index_dim: "merge_index"})
+            a.ds.rename({a.true_index_dim: "merge_index"})
            for a in annotations_with_only_dask
         ]
         alt_annot = Dataset2D(
anndata/_core/sparse_dataset.py CHANGED
@@ -165,7 +165,11 @@ class BackedSparseMatrix(_cs_matrix):
     def _get_contiguous_compressed_slice(
         self, s: slice
     ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
-        new_indptr = self.indptr[s.start : s.stop + 1].copy()
+        new_indptr = self.indptr[s.start : s.stop + 1]
+        # If indptr is cached, we need to make a copy of the subset
+        # so as not to alter the underlying cached data.
+        if isinstance(self.indptr, np.ndarray):
+            new_indptr = new_indptr.copy()
 
         start = new_indptr[0]
         stop = new_indptr[-1]
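Context for the conditional copy: with the opt-out added below, `self.indptr` may now be the on-disk h5py/zarr dataset rather than a cached `numpy.ndarray`. Slicing a backed dataset already materializes a fresh array, while slicing a cached ndarray yields a view into it, and later code adjusts `new_indptr` in place (hence the defensive copy). A minimal sketch of the difference, assuming h5py and a hypothetical file name:

    import h5py
    import numpy as np

    cached = np.arange(10)
    view = cached[2:5]          # NumPy slicing returns a view
    view -= view[0]             # this writes through into `cached`, hence .copy()

    with h5py.File("demo.h5", "w") as f:
        f["indptr"] = np.arange(10)
        chunk = f["indptr"][2:5]  # h5py slicing reads into a brand-new ndarray
        chunk -= chunk[0]         # safe: the stored data is untouched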
@@ -395,10 +399,12 @@ def validate_indices(
 
 class BaseCompressedSparseDataset(abc._AbstractCSDataset, ABC):
     _group: GroupStorageType
+    _should_cache_indptr: bool
 
-    def __init__(self, group: GroupStorageType):
+    def __init__(self, group: GroupStorageType, *, should_cache_indptr: bool = True):
         type(self)._check_group_format(group)
         self._group = group
+        self._should_cache_indptr = should_cache_indptr
 
     @property
     def group(self) -> GroupStorageType:
@@ -616,8 +622,9 @@ class BaseCompressedSparseDataset(abc._AbstractCSDataset, ABC):
 
         It should therefore fit into memory, so we cache it for faster access.
         """
-        arr = self.group["indptr"][...]
-        return arr
+        if self._should_cache_indptr:
+            return self.group["indptr"][...]
+        return self.group["indptr"]
 
     @cached_property
     def _indices(self) -> H5Array | ZarrArray:
@@ -660,13 +667,23 @@ class _CSCDataset(BaseCompressedSparseDataset, abc.CSCDataset):
     """Internal concrete version of :class:`anndata.abc.CSRDataset`."""
 
 
-def sparse_dataset(group: GroupStorageType) -> abc.CSRDataset | abc.CSCDataset:
+def sparse_dataset(
+    group: GroupStorageType,
+    *,
+    should_cache_indptr: bool = True,
+) -> abc.CSRDataset | abc.CSCDataset:
     """Generates a backed mode-compatible sparse dataset class.
 
     Parameters
     ----------
     group
         The backing group store.
+    should_cache_indptr
+        Whether or not to cache the indptr for repeated reuse as a :class:`numpy.ndarray`.
+        The default is `True` but one might set it to false if the dataset is repeatedly reopened
+        using this command, and then only a subset is read in before closing again.
+        See https://github.com/scverse/anndata/blob/3c489b979086c39c59d3eb5dad90ebacce3b9a80/src/anndata/_io/specs/lazy_methods.py#L85-L95
+        for the target use-case.
 
     Returns
     -------
@@ -713,9 +730,9 @@ def sparse_dataset(group: GroupStorageType) -> abc.CSRDataset | abc.CSCDataset:
     """
     encoding_type = _get_group_format(group)
     if encoding_type == "csr":
-        return _CSRDataset(group)
+        return _CSRDataset(group, should_cache_indptr=should_cache_indptr)
     elif encoding_type == "csc":
-        return _CSCDataset(group)
+        return _CSCDataset(group, should_cache_indptr=should_cache_indptr)
     msg = f"Unknown encoding type {encoding_type}"
     raise ValueError(msg)
 
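A short usage sketch of the new keyword, assuming `write_elem` and `sparse_dataset` are exposed under the `anndata.io` namespace (as in current releases) and using a hypothetical file name:

    import h5py
    from scipy import sparse

    import anndata as ad

    with h5py.File("demo_sparse.h5", "w") as f:
        ad.io.write_elem(f, "X", sparse.random(50, 20, format="csr"))
        # Opt out of caching indptr as an ndarray: sensible when the file is
        # opened, sliced once, and closed again, as in the lazy-read path
        # linked from the docstring above.
        ds = ad.io.sparse_dataset(f["X"], should_cache_indptr=False)
        print(ds[:5].shape)  # (5, 20)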
anndata/_core/storage.py CHANGED
@@ -37,8 +37,8 @@ def coerce_array(
         return value
     # If value is one of the allowed types, return it
     array_data_structure_types = get_args(ArrayDataStructureTypes)
-    if isinstance(value, XDataset) and not isinstance(value, Dataset2D):
-        value = Dataset2D(value.data_vars, value.coords, value.attrs)
+    if isinstance(value, XDataset):
+        value = Dataset2D(value)
     if isinstance(value, (*array_data_structure_types, Dataset2D)):
         if isinstance(value, np.matrix):
             msg = f"{name} should not be a np.matrix, use np.ndarray instead."