anndata 0.12.0rc3__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
anndata/_core/aligned_df.py CHANGED
@@ -59,7 +59,7 @@ def _gen_dataframe_mapping(
     df = pd.DataFrame(
         anno,
         index=None if length is None else mk_index(length),
-        columns=None if len(anno) else [],
+        columns=None if anno else [],
     )
 
     if length is None:
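The new spelling leans on mapping truthiness: an empty dict is falsy, so `None if anno else []` selects the explicit empty column list exactly when `len(anno)` is zero. A minimal sketch of the branch, assuming `anno` is a plain dict as in the mapping case this function handles:

    import pandas as pd

    for anno in ({"cell_type": ["a", "b"]}, {}):
        # Non-empty mapping -> let pandas infer columns; empty mapping -> explicit empty columns.
        df = pd.DataFrame(anno, columns=None if anno else [])
        print(list(df.columns))  # ['cell_type'], then []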
anndata/_core/aligned_mapping.py CHANGED
@@ -79,7 +79,7 @@ class AlignedMappingBase(MutableMapping[str, Value], ABC):
         elif isinstance(val, np.ndarray | CupyArray) and len(val.shape) == 1:
             val = val.reshape((val.shape[0], 1))
         elif isinstance(val, XDataset):
-            val = Dataset2D(data_vars=val.data_vars, coords=val.coords, attrs=val.attrs)
+            val = Dataset2D(val)
         for i, axis in enumerate(self.axes):
             if self.parent.shape[axis] == axis_len(val, i):
                 continue
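Together with the `a.ds` accesses and `Dataset2D(...)` calls in the merge.py hunks further down, this change indicates that `Dataset2D` now wraps an `xarray.Dataset` by composition and its constructor accepts the dataset whole, instead of being rebuilt from `data_vars`/`coords`/`attrs`. A minimal sketch of that wrapper pattern, using a hypothetical class name:

    import xarray as xr

    class Wrapped2D:
        # Hypothetical stand-in for Dataset2D: composition instead of reconstruction.
        def __init__(self, ds: xr.Dataset):
            self.ds = ds  # the wrapped dataset, exposed like `a.ds` in the hunks below

        @property
        def shape(self) -> tuple[int, int]:
            (n_rows,) = set(self.ds.sizes.values())  # assumes a single shared dim
            return n_rows, len(self.ds.data_vars)

    w = Wrapped2D(xr.Dataset({"x": ("obs", [1, 2, 3])}))
    print(w.shape)  # (3, 1)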
anndata/_core/anndata.py CHANGED
@@ -62,7 +62,7 @@ if TYPE_CHECKING:
     from .index import Index
 
 
-class AnnData(metaclass=utils.DeprecationMixinMeta):
+class AnnData(metaclass=utils.DeprecationMixinMeta):  # noqa: PLW1641
     """\
     An annotated data matrix.
 
anndata/_core/index.py CHANGED
@@ -198,7 +198,8 @@ def _subset_sparse(a: CSMatrix | CSArray, subset_idx: Index):
 
 
 @_subset.register(pd.DataFrame)
-def _subset_df(df: pd.DataFrame, subset_idx: Index):
+@_subset.register(Dataset2D)
+def _subset_df(df: pd.DataFrame | Dataset2D, subset_idx: Index):
     return df.iloc[subset_idx]
 
 
@@ -209,15 +210,6 @@ def _subset_awkarray(a: AwkArray, subset_idx: Index):
     return a[subset_idx]
 
 
-@_subset.register(Dataset2D)
-def _(a: Dataset2D, subset_idx: Index):
-    key = a.index_dim
-    # xarray seems to have some code looking for a second entry in tuples
-    if isinstance(subset_idx, tuple) and len(subset_idx) == 1:
-        subset_idx = subset_idx[0]
-    return a.isel(**{key: subset_idx})
-
-
 # Registration for SparseDataset occurs in sparse_dataset.py
 @_subset.register(h5py.Dataset)
 def _subset_dataset(d, subset_idx):
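`_subset` is a `functools.singledispatch` function, so the stacked `@_subset.register(Dataset2D)` in the hunk above routes `Dataset2D` through the same positional `.iloc` path as `pd.DataFrame`, which is what makes the bespoke `isel`-based handler deleted here redundant (it presumes `Dataset2D` now exposes a DataFrame-style `.iloc`). A runnable sketch of stacked registration using only pandas types:

    from functools import singledispatch

    import pandas as pd

    @singledispatch
    def subset(obj, idx):
        raise NotImplementedError(type(obj))

    @subset.register(pd.DataFrame)
    @subset.register(pd.Series)  # stacked: one handler serves both registered types
    def _(obj, idx):
        return obj.iloc[idx]

    df = pd.DataFrame({"x": [1, 2, 3]})
    print(subset(df, [0, 2])["x"].tolist())  # [1, 3]
    print(subset(df["x"], [0, 2]).tolist())  # [1, 3], same handler for a Series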
anndata/_core/merge.py CHANGED
@@ -17,7 +17,6 @@ import pandas as pd
 import scipy
 from natsort import natsorted
 from packaging.version import Version
-from pandas.api.types import is_extension_array_dtype
 from scipy import sparse
 
 from anndata._core.file_backing import to_memory
@@ -46,7 +45,7 @@ if TYPE_CHECKING:
 
     from anndata._types import Join_T
 
-    from ..compat import XDataArray
+    from ..compat import XDataArray, XDataset
 
 T = TypeVar("T")
 
@@ -130,6 +129,8 @@ def equal(a, b) -> bool:
 
 
 @equal.register(pd.DataFrame)
+@equal.register(Dataset2D)
+@equal.register(pd.Series)
 def equal_dataframe(a, b) -> bool:
     return a.equals(b)
 
@@ -170,11 +171,6 @@ def equal_cupyarray(a, b) -> bool:
     return bool(cp.array_equal(a, b, equal_nan=True))
 
 
-@equal.register(pd.Series)
-def equal_series(a, b) -> bool:
-    return a.equals(b)
-
-
 @equal.register(CSMatrix)
 @equal.register(CSArray)
 @equal.register(CupySparseMatrix)
@@ -189,6 +185,15 @@ def equal_sparse(a, b) -> bool:
         # Comparison broken for CSC matrices
         # https://github.com/cupy/cupy/issues/7757
         a, b = CupyCSRMatrix(a), CupyCSRMatrix(b)
+    if Version(scipy.__version__) >= Version("1.16.0rc1"):
+        # TODO: https://github.com/scipy/scipy/issues/23068
+        return bool(
+            a.format == b.format
+            and (a.shape == b.shape)
+            and np.all(a.indptr == b.indptr)
+            and np.all(a.indices == b.indices)
+            and np.all((a.data == b.data) | (np.isnan(a.data) & np.isnan(b.data)))
+        )
     comp = a != b
     if isinstance(comp, bool):
         return not comp
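The new branch for scipy >= 1.16 compares the raw compressed buffers instead of evaluating `a != b`, whose behavior changed (see the scipy issue referenced in the TODO). The last conjunct is the subtle one: `a.data == b.data` alone reports False wherever both sides hold NaN, so it is OR-ed with a both-NaN mask. A runnable sketch on plain scipy arrays, assuming both operands store their entries in the same canonical order:

    import numpy as np
    from scipy import sparse

    a = sparse.csr_array(np.array([[1.0, 0.0], [np.nan, 2.0]]))
    b = sparse.csr_array(np.array([[1.0, 0.0], [np.nan, 2.0]]))

    equal = bool(
        a.format == b.format
        and a.shape == b.shape
        and np.all(a.indptr == b.indptr)
        and np.all(a.indices == b.indices)
        # elementwise equal OR both NaN (NaN != NaN under IEEE 754)
        and np.all((a.data == b.data) | (np.isnan(a.data) & np.isnan(b.data)))
    )
    print(equal)  # True despite the NaNs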
@@ -215,11 +220,6 @@ def equal_awkward(a, b) -> bool:
     return ak.almost_equal(a, b)
 
 
-@equal.register(Dataset2D)
-def equal_dataset2d(a, b) -> bool:
-    return a.equals(b)
-
-
 def as_sparse(x, *, use_sparse_array: bool = False) -> CSMatrix | CSArray:
     if not isinstance(x, CSMatrix | CSArray):
         in_memory_array_class = (
@@ -584,8 +584,8 @@ class Reindexer:
         """
         if self.no_change and (axis_len(el, axis) == len(self.old_idx)):
             return el
-        if isinstance(el, pd.DataFrame):
-            return self._apply_to_df(el, axis=axis, fill_value=fill_value)
+        if isinstance(el, pd.DataFrame | Dataset2D):
+            return self._apply_to_df_like(el, axis=axis, fill_value=fill_value)
         elif isinstance(el, CSMatrix | CSArray | CupySparseMatrix):
             return self._apply_to_sparse(el, axis=axis, fill_value=fill_value)
         elif isinstance(el, AwkArray):
@@ -594,12 +594,10 @@ class Reindexer:
             return self._apply_to_dask_array(el, axis=axis, fill_value=fill_value)
         elif isinstance(el, CupyArray):
             return self._apply_to_cupy_array(el, axis=axis, fill_value=fill_value)
-        elif isinstance(el, Dataset2D):
-            return self._apply_to_dataset2d(el, axis=axis, fill_value=fill_value)
         else:
             return self._apply_to_array(el, axis=axis, fill_value=fill_value)
 
-    def _apply_to_df(self, el: pd.DataFrame, *, axis, fill_value=None):
+    def _apply_to_df_like(self, el: pd.DataFrame | Dataset2D, *, axis, fill_value=None):
         if fill_value is None:
             fill_value = np.nan
         return el.reindex(self.new_idx, axis=axis, fill_value=fill_value)
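Folding `_apply_to_df` and the `_apply_to_dataset2d` method deleted in the next hunk into a single `_apply_to_df_like` works because `Dataset2D` evidently now implements the pandas-style `reindex(index, axis=..., fill_value=...)` signature itself, extension-array columns included. For reference, the pandas behavior the shared path relies on:

    import numpy as np
    import pandas as pd

    el = pd.DataFrame({"x": [1.0, 2.0]}, index=["a", "b"])
    new_idx = pd.Index(["b", "c"])  # "c" does not exist in the old index

    # Present labels are carried over; missing ones are filled with fill_value.
    print(el.reindex(new_idx, axis=0, fill_value=np.nan))
    #      x
    # b  2.0
    # c  NaN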
@@ -758,31 +756,6 @@ class Reindexer:
             el = ak.pad_none(el, 1, axis=axis)  # axis == 0
         return el[self.idx]
 
-    def _apply_to_dataset2d(self, el: Dataset2D, *, axis, fill_value=None):
-        if fill_value is None:
-            fill_value = np.nan
-        index_dim = el.index_dim
-        if axis == 0:
-            # Dataset.reindex() can't handle ExtensionArrays
-            extension_arrays = {
-                col: arr for col, arr in el.items() if is_extension_array_dtype(arr)
-            }
-            el = el.drop_vars(extension_arrays.keys())
-            el = el.reindex(
-                {index_dim: self.new_idx}, method=None, fill_value=fill_value
-            )
-            for col, arr in extension_arrays.items():
-                el[col] = (
-                    index_dim,
-                    pd.Series(arr, index=self.old_idx).reindex(
-                        self.new_idx, fill_value=fill_value
-                    ),
-                )
-            return el
-        else:
-            msg = "This should be unreachable, please open an issue."
-            raise Exception(msg)
-
     @property
     def idx(self):
         return self.old_idx.get_indexer(self.new_idx)
@@ -931,12 +904,6 @@ def concat_arrays(  # noqa: PLR0911, PLR0912
             ],
             format="csr",
         )
-        scipy_version = Version(scipy.__version__)
-        # Bug where xstack produces a matrix not an array in 1.11.*
-        if use_sparse_array and (scipy_version.major, scipy_version.minor) == (1, 11):
-            if mat.format == "csc":
-                return sparse.csc_array(mat)
-            return sparse.csr_array(mat)
         return mat
     else:
         return np.concatenate(
@@ -1294,7 +1261,7 @@ def make_dask_col_from_extension_dtype(
 
 def make_xarray_extension_dtypes_dask(
     annotations: Iterable[Dataset2D], *, use_only_object_dtype: bool = False
-) -> Generator[Dataset2D, None, None]:
+) -> Generator[XDataset, None, None]:
     """
     Creates a generator of Dataset2D objects with dask arrays in place of :class:`pandas.api.extensions.ExtensionArray` dtype columns.
 
@@ -1323,7 +1290,7 @@ def make_xarray_extension_dtypes_dask(
                 if name in extension_cols
                 else col
             )
-            for name, col in a.items()
+            for name, col in a._items()
         }
     )
 
@@ -1336,30 +1303,26 @@ def concat_dataset2d_on_annot_axis(
     join: Join_T,
     *,
     force_lazy: bool,
-    label: str | None = None,
-    label_col: pd.Categorical | None = None,
+    concat_indices: pd.Index | None = None,
 ) -> Dataset2D:
-    """Create a concatenate dataset from a list of :class:`~anndata._core.xarray.Dataset2D` objects.
+    """Create a concatenate dataset from a list of :class:`~anndata.experimental.backed.Dataset2D` objects.
     The goal of this function is to mimic `pd.concat(..., ignore_index=True)` so has some complicated logic
     for handling the "index" to ensure (a) nothing is loaded into memory and (b) the true index is always tracked.
 
     Parameters
     ----------
     annotations
-        The :class:`~anndata._core.xarray.Dataset2D` objects to be concatenated.
+        The :class:`~anndata.experimental.backed.Dataset2D` objects to be concatenated.
     join
         Type of join operation
     force_lazy
         Whether to lazily concatenate elements using dask even when eager concatenation is possible.
-    label
-        Column in axis annotation (i.e. `.obs` or `.var`) to place batch information in.
-        If it's None, no column is added.
-    label_col
-        The bath information annotation.
+    concat_indices
+        Already calculated indices to be used as the index on the concatenated object.
 
     Returns
     -------
-    Concatenated :class:`~anndata._core.xarray.Dataset2D`
+    Concatenated :class:`~anndata.experimental.backed.Dataset2D`
     """
     from anndata._core.xarray import Dataset2D
     from anndata._io.specs.lazy_methods import DUMMY_RANGE_INDEX_KEY
@@ -1375,49 +1338,65 @@ def concat_dataset2d_on_annot_axis(
         old_key = a.index_dim
         is_fake_index = old_key != a.true_index_dim
         # First create a dummy index
-        a.coords[DS_CONCAT_DUMMY_INDEX_NAME] = (
+        a.ds.coords[DS_CONCAT_DUMMY_INDEX_NAME] = (
             old_key,
             pd.RangeIndex(a.shape[0]),
         )
         # Set all the dimensions to this new dummy index
-        a = a.swap_dims({old_key: DS_CONCAT_DUMMY_INDEX_NAME})
+        ds_swapped = a.ds.swap_dims({old_key: DS_CONCAT_DUMMY_INDEX_NAME})
         # Move the old coordinate into a variable
-        old_coord = a.coords[old_key]
-        del a.coords[old_key]
-        a[old_key] = old_coord
+        old_coord = ds_swapped.coords[old_key]
+        del ds_swapped.coords[old_key]
+        ds_swapped[old_key] = old_coord
+        a = Dataset2D(ds_swapped)
         if not is_fake_index:
             a.true_index_dim = old_key
         annotations_re_indexed.append(a)
     # Concat along the dummy index
-    ds = Dataset2D(
-        xr.concat(annotations_re_indexed, join=join, dim=DS_CONCAT_DUMMY_INDEX_NAME),
-    )
-    ds.is_backed = have_backed
-    ds.coords[DS_CONCAT_DUMMY_INDEX_NAME] = pd.RangeIndex(
-        ds.coords[DS_CONCAT_DUMMY_INDEX_NAME].shape[0]
+    ds_concat = xr.concat(
+        [a.ds for a in annotations_re_indexed],
+        join=join,
+        dim=DS_CONCAT_DUMMY_INDEX_NAME,
     )
+    ds_concat.attrs.pop("indexing_key", None)
+    # Wrapping allows us to use the Dataset2D methods
+    # directly for setting certain attrs/coords without duplicating here.
+    ds_concat_2d = Dataset2D(ds_concat)
+    ds_concat_2d.is_backed = have_backed
+    if concat_indices is not None:
+        concat_indices.name = DS_CONCAT_DUMMY_INDEX_NAME
+        ds_concat_2d.index = concat_indices
+        ds_concat = ds_concat_2d.ds
+    else:
+        ds_concat.coords[DS_CONCAT_DUMMY_INDEX_NAME] = pd.RangeIndex(
+            ds_concat.coords[DS_CONCAT_DUMMY_INDEX_NAME].shape[0]
+        )
     # Drop any lingering dimensions (swap doesn't delete)
-    ds = ds.drop_dims(d for d in ds.dims if d != DS_CONCAT_DUMMY_INDEX_NAME)
+    ds_concat = ds_concat.drop_dims(
+        d for d in ds_concat.dims if d != DS_CONCAT_DUMMY_INDEX_NAME
+    )
     # Create a new true index and then delete the columns resulting from the concatenation for each index.
     # This includes the dummy column (which is neither a dimension nor a true indexing column)
-    index = xr.concat(
-        [a.true_xr_index for a in annotations_re_indexed],
-        dim=DS_CONCAT_DUMMY_INDEX_NAME,
-    )
-    # prevent duplicate values
-    index.coords[DS_CONCAT_DUMMY_INDEX_NAME] = ds.coords[DS_CONCAT_DUMMY_INDEX_NAME]
-    ds.coords[DS_CONCAT_DUMMY_INDEX_NAME] = index
+    if concat_indices is None:
+        index = xr.concat(
+            [a.true_xr_index for a in annotations_re_indexed],
+            dim=DS_CONCAT_DUMMY_INDEX_NAME,
+        )
+        # prevent duplicate values
+        index.coords[DS_CONCAT_DUMMY_INDEX_NAME] = ds_concat.coords[
+            DS_CONCAT_DUMMY_INDEX_NAME
+        ]
+        ds_concat.coords[DS_CONCAT_DUMMY_INDEX_NAME] = index
     for key in {
         true_index
         for a in annotations_re_indexed
         if (true_index := a.true_index_dim) != a.index_dim
     }:
-        del ds[key]
-    if DUMMY_RANGE_INDEX_KEY in ds:
-        del ds[DUMMY_RANGE_INDEX_KEY]
-    if label is not None and label_col is not None:
-        ds[label] = (DS_CONCAT_DUMMY_INDEX_NAME, label_col)
-    return ds
+        del ds_concat[key]
+    if DUMMY_RANGE_INDEX_KEY in ds_concat:
+        del ds_concat[DUMMY_RANGE_INDEX_KEY]
+    ds_concat_2d = Dataset2D(ds_concat)
+    return ds_concat_2d
 
 
 def concat(  # noqa: PLR0912, PLR0913, PLR0915
  def concat( # noqa: PLR0912, PLR0913, PLR0915
@@ -1691,14 +1670,15 @@ def concat( # noqa: PLR0912, PLR0913, PLR0915
1691
1670
  ignore_index=True,
1692
1671
  )
1693
1672
  concat_annot.index = concat_indices
1694
- if label is not None:
1695
- concat_annot[label] = label_col
1696
1673
  else:
1697
1674
  concat_annot = concat_dataset2d_on_annot_axis(
1698
- annotations, join, force_lazy=force_lazy, label=label, label_col=label_col
1675
+ annotations,
1676
+ join,
1677
+ force_lazy=force_lazy,
1678
+ concat_indices=concat_indices,
1699
1679
  )
1700
- concat_indices.name = DS_CONCAT_DUMMY_INDEX_NAME
1701
- concat_annot.index = concat_indices
1680
+ if label is not None:
1681
+ concat_annot[label] = label_col
1702
1682
 
1703
1683
  # Annotation for other axis
1704
1684
  alt_annotations = [getattr(a, alt_axis_name) for a in adatas]
@@ -1720,7 +1700,7 @@ def concat(  # noqa: PLR0912, PLR0913, PLR0915
             )
         )
         annotations_with_only_dask = [
-            a.rename({a.true_index_dim: "merge_index"})
+            a.ds.rename({a.true_index_dim: "merge_index"})
            for a in annotations_with_only_dask
         ]
         alt_annot = Dataset2D(
anndata/_core/sparse_dataset.py CHANGED
@@ -165,7 +165,11 @@ class BackedSparseMatrix(_cs_matrix):
     def _get_contiguous_compressed_slice(
         self, s: slice
     ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
-        new_indptr = self.indptr[s.start : s.stop + 1].copy()
+        new_indptr = self.indptr[s.start : s.stop + 1]
+        # If indptr is cached, we need to make a copy of the subset
+        # so as not to alter the underlying cached data.
+        if isinstance(self.indptr, np.ndarray):
+            new_indptr = new_indptr.copy()
 
         start = new_indptr[0]
         stop = new_indptr[-1]
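Context for the conditional copy: with the opt-out added below, `self.indptr` may now be the on-disk h5py/zarr dataset rather than a cached `numpy.ndarray`. Slicing a backed dataset already materializes a fresh array, while slicing a cached ndarray yields a view into it, and later code adjusts `new_indptr` in place (hence the defensive copy). A minimal sketch of the difference, assuming h5py and a hypothetical file name:

    import h5py
    import numpy as np

    cached = np.arange(10)
    view = cached[2:5]          # NumPy slicing returns a view
    view -= view[0]             # this writes through into `cached`, hence .copy()

    with h5py.File("demo.h5", "w") as f:
        f["indptr"] = np.arange(10)
        chunk = f["indptr"][2:5]  # h5py slicing reads into a brand-new ndarray
        chunk -= chunk[0]         # safe: the stored data is untouched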
@@ -395,10 +399,12 @@ def validate_indices(
 
 class BaseCompressedSparseDataset(abc._AbstractCSDataset, ABC):
     _group: GroupStorageType
+    _should_cache_indptr: bool
 
-    def __init__(self, group: GroupStorageType):
+    def __init__(self, group: GroupStorageType, *, should_cache_indptr: bool = True):
         type(self)._check_group_format(group)
         self._group = group
+        self._should_cache_indptr = should_cache_indptr
 
     @property
     def group(self) -> GroupStorageType:
@@ -616,8 +622,9 @@ class BaseCompressedSparseDataset(abc._AbstractCSDataset, ABC):
 
         It should therefore fit into memory, so we cache it for faster access.
         """
-        arr = self.group["indptr"][...]
-        return arr
+        if self._should_cache_indptr:
+            return self.group["indptr"][...]
+        return self.group["indptr"]
 
     @cached_property
     def _indices(self) -> H5Array | ZarrArray:
@@ -660,13 +667,23 @@ class _CSCDataset(BaseCompressedSparseDataset, abc.CSCDataset):
     """Internal concrete version of :class:`anndata.abc.CSRDataset`."""
 
 
-def sparse_dataset(group: GroupStorageType) -> abc.CSRDataset | abc.CSCDataset:
+def sparse_dataset(
+    group: GroupStorageType,
+    *,
+    should_cache_indptr: bool = True,
+) -> abc.CSRDataset | abc.CSCDataset:
     """Generates a backed mode-compatible sparse dataset class.
 
     Parameters
     ----------
     group
         The backing group store.
+    should_cache_indptr
+        Whether or not to cache the indptr for repeated reuse as a :class:`numpy.ndarray`.
+        The default is `True` but one might set it to false if the dataset is repeatedly reopened
+        using this command, and then only a subset is read in before closing again.
+        See https://github.com/scverse/anndata/blob/3c489b979086c39c59d3eb5dad90ebacce3b9a80/src/anndata/_io/specs/lazy_methods.py#L85-L95
+        for the target use-case.
 
     Returns
     -------
@@ -713,9 +730,9 @@ def sparse_dataset(group: GroupStorageType) -> abc.CSRDataset | abc.CSCDataset:
     """
     encoding_type = _get_group_format(group)
     if encoding_type == "csr":
-        return _CSRDataset(group)
+        return _CSRDataset(group, should_cache_indptr=should_cache_indptr)
     elif encoding_type == "csc":
-        return _CSCDataset(group)
+        return _CSCDataset(group, should_cache_indptr=should_cache_indptr)
     msg = f"Unknown encoding type {encoding_type}"
     raise ValueError(msg)
 
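A short usage sketch of the new keyword, assuming `write_elem` and `sparse_dataset` are exposed under the `anndata.io` namespace (as in current releases) and using a hypothetical file name:

    import h5py
    from scipy import sparse

    import anndata as ad

    with h5py.File("demo_sparse.h5", "w") as f:
        ad.io.write_elem(f, "X", sparse.random(50, 20, format="csr"))
        # Opt out of caching indptr as an ndarray: sensible when the file is
        # opened, sliced once, and closed again, as in the lazy-read path
        # linked from the docstring above.
        ds = ad.io.sparse_dataset(f["X"], should_cache_indptr=False)
        print(ds[:5].shape)  # (5, 20)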
anndata/_core/storage.py CHANGED
@@ -37,8 +37,8 @@ def coerce_array(
         return value
     # If value is one of the allowed types, return it
     array_data_structure_types = get_args(ArrayDataStructureTypes)
-    if isinstance(value, XDataset) and not isinstance(value, Dataset2D):
-        value = Dataset2D(value.data_vars, value.coords, value.attrs)
+    if isinstance(value, XDataset):
+        value = Dataset2D(value)
     if isinstance(value, (*array_data_structure_types, Dataset2D)):
         if isinstance(value, np.matrix):
             msg = f"{name} should not be a np.matrix, use np.ndarray instead."