anndata 0.12.6__py3-none-any.whl → 0.12.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
anndata/_core/aligned_df.py CHANGED
@@ -9,7 +9,7 @@ import pandas as pd
  from pandas.api.types import is_string_dtype
  
  from .._warnings import ImplicitModificationWarning
- from ..compat import XDataset
+ from ..compat import XDataset, pandas_as_str
  from .xarray import Dataset2D
  
  if TYPE_CHECKING:
@@ -59,7 +59,7 @@ def _gen_dataframe_mapping(
      df = pd.DataFrame(
          anno,
          index=None if length is None else mk_index(length),
-         columns=None if anno else [],
+         columns=None if anno else pd.array([], dtype="str"),
      )
  
      if length is None:
@@ -88,12 +88,12 @@ def _gen_dataframe_df(
      if length is not None and length != len(anno):
          raise _mk_df_error(source, attr, length, len(anno))
      anno = anno.copy(deep=False)
-     if not is_string_dtype(anno.index):
+     if not is_string_dtype(anno.index[~anno.index.isna()]):
          msg = "Transforming to str index."
          warnings.warn(msg, ImplicitModificationWarning, stacklevel=2)
-         anno.index = anno.index.astype(str)
+         anno.index = pandas_as_str(anno.index)
      if not len(anno.columns):
-         anno.columns = anno.columns.astype(str)
+         anno.columns = pandas_as_str(anno.columns)
      return anno
  
  
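The recurring change in this release is already visible in this first file: plain `.astype(str)` calls are replaced with the new `anndata.compat.pandas_as_str` helper (defined later in this diff), which picks a string dtype without destroying missing values. A minimal sketch of the difference it papers over, assuming pandas >= 2.1; the index here is illustrative only:

    import pandas as pd

    idx = pd.Index(["a", None, "b"])

    # `.astype(str)` stringifies missing values: None becomes the literal "None"
    print(idx.astype(str).tolist())               # ['a', 'None', 'b']

    # a nullable string dtype keeps the hole as a real missing value
    print(idx.astype(pd.StringDtype()).tolist())  # ['a', <NA>, 'b']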
anndata/_core/anndata.py CHANGED
@@ -26,7 +26,14 @@ from anndata._warnings import ImplicitModificationWarning
  
  from .. import utils
  from .._settings import settings
- from ..compat import CSArray, DaskArray, ZarrArray, _move_adj_mtx, old_positionals
+ from ..compat import (
+     CSArray,
+     DaskArray,
+     ZarrArray,
+     _move_adj_mtx,
+     old_positionals,
+     pandas_as_str,
+ )
  from ..logging import anndata_logger as logger
  from ..utils import (
      axis_len,
@@ -48,7 +55,7 @@ from .xarray import Dataset2D
  if TYPE_CHECKING:
      from collections.abc import Iterable
      from os import PathLike
-     from typing import Any, ClassVar, Literal
+     from typing import Any, ClassVar, Literal, NoReturn
  
      from zarr.storage import StoreLike
  
@@ -383,11 +390,11 @@ class AnnData(metaclass=utils.DeprecationMixinMeta):  # noqa: PLW1641
          if obs is None:
              obs = pd.DataFrame(index=X.index)
          elif not isinstance(X.index, pd.RangeIndex):
-             x_indices.append(("obs", "index", X.index.astype(str)))
+             x_indices.append(("obs", "index", pandas_as_str(X.index)))
          if var is None:
              var = pd.DataFrame(index=X.columns)
          elif not isinstance(X.columns, pd.RangeIndex):
-             x_indices.append(("var", "columns", X.columns.astype(str)))
+             x_indices.append(("var", "columns", pandas_as_str(X.columns)))
          X = ensure_df_homogeneous(X, "X")
  
          # ----------------------------------------------------------------------
@@ -790,7 +797,9 @@ class AnnData(metaclass=utils.DeprecationMixinMeta):  # noqa: PLW1641
              )
              raise ValueError(msg)
          else:
-             value = pd.Index(value)
+             value = (
+                 value if isinstance(value, pd.Index) else pandas_as_str(pd.Index(value))
+             )
          if not isinstance(value.name, str | type(None)):
              value.name = None
          if (
@@ -1058,6 +1067,7 @@ class AnnData(metaclass=utils.DeprecationMixinMeta):  # noqa: PLW1641
              if not isinstance(df_full[k].dtype, pd.CategoricalDtype):
                  continue
              all_categories = df_full[k].cat.categories
+             # TODO: this mode is going away
              with pd.option_context("mode.chained_assignment", None):
                  df_sub[k] = df_sub[k].cat.remove_unused_categories()
              # also correct the colors...
@@ -1627,8 +1637,8 @@ class AnnData(metaclass=utils.DeprecationMixinMeta):  # noqa: PLW1641
          annoA-1  NaN    2.0    1.0    0.0
          annoA-2  NaN    3.0    2.0    0.0
          annoB-2  NaN    2.0    1.0    0.0
-         >>> outer.var_names
-         Index(['a', 'b', 'c', 'd'], dtype='object')
+         >>> outer.var_names.astype("string")
+         Index(['a', 'b', 'c', 'd'], dtype='string')
          >>> outer.X
          array([[ 1.,  2.,  3., nan],
                 [ 4.,  5.,  6., nan],
@@ -1710,8 +1720,8 @@ class AnnData(metaclass=utils.DeprecationMixinMeta):  # noqa: PLW1641
          ...     dict(var_names=['d', 'c', 'b']),
          ... )
          >>> adata = adata1.concatenate(adata2, adata3, join='outer')
-         >>> adata.var_names
-         Index(['a', 'b', 'c', 'd'], dtype='object')
+         >>> adata.var_names.astype("string")
+         Index(['a', 'b', 'c', 'd'], dtype='string')
          >>> adata.X.toarray()
          array([[0., 2., 3., 0.],
                 [0., 5., 6., 0.],
@@ -1779,25 +1789,25 @@ class AnnData(metaclass=utils.DeprecationMixinMeta):  # noqa: PLW1641
  
          return out
  
-     def var_names_make_unique(self, join: str = "-"):
+     def var_names_make_unique(self, join: str = "-") -> None:
          # Important to go through the setter so obsm dataframes are updated too
          self.var_names = utils.make_index_unique(self.var.index, join)
  
      var_names_make_unique.__doc__ = utils.make_index_unique.__doc__
  
-     def obs_names_make_unique(self, join: str = "-"):
+     def obs_names_make_unique(self, join: str = "-") -> None:
          # Important to go through the setter so obsm dataframes are updated too
          self.obs_names = utils.make_index_unique(self.obs.index, join)
  
      obs_names_make_unique.__doc__ = utils.make_index_unique.__doc__
  
-     def _check_uniqueness(self):
-         if not self.obs.index.is_unique:
+     def _check_uniqueness(self) -> None:
+         if self.obs.index[~self.obs.index.isna()].has_duplicates:
              utils.warn_names_duplicates("obs")
-         if not self.var.index.is_unique:
+         if self.var.index[~self.var.index.isna()].has_duplicates:
              utils.warn_names_duplicates("var")
  
-     def __contains__(self, key: Any):
+     def __contains__(self, key: Any) -> NoReturn:
          msg = "AnnData has no attribute __contains__, don’t check `in adata`."
          raise AttributeError(msg)
  
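`_check_uniqueness` now drops missing names before testing for duplicates, so an index containing several NA entries no longer triggers a spurious duplicate-names warning. A minimal sketch of the old versus new predicate, with an illustrative index:

    import pandas as pd

    idx = pd.Index(["cell1", pd.NA, pd.NA], dtype="string")

    # old check: repeated NA values count as duplicates
    print(not idx.is_unique)                # True

    # new check: mask out NA first, then look for duplicates
    print(idx[~idx.isna()].has_duplicates)  # False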
anndata/_core/merge.py CHANGED
@@ -334,7 +334,11 @@ def try_unifying_dtype(  # noqa PLR0911, PLR0912
              if not pd.isnull(dtype) and len(dtype.categories) > 0
          )
          if same_orders:
-             return next(iter(dtypes))
+             return next(
+                 dtype
+                 for dtype in dtypes
+                 if not pd.isnull(dtype) and len(dtype.categories) > 0
+             )
          return object
      # Boolean
      elif all(pd.api.types.is_bool_dtype(dtype) or dtype is None for dtype in col):
@@ -958,8 +962,13 @@ def gen_inner_reindexers(els, new_index, axis: Literal[0, 1] = 0) -> list[Reinde
          msg = "Cannot concatenate an AwkwardArray with other array types."
          raise NotImplementedError(msg)
      common_keys = intersect_keys(el.fields for el in els)
+     # TODO: replace dtype=object once this is fixed: https://github.com/scikit-hep/awkward/issues/3730
      reindexers = [
-         Reindexer(pd.Index(el.fields), pd.Index(list(common_keys))) for el in els
+         Reindexer(
+             pd.Index(el.fields, dtype=object),
+             pd.Index(list(common_keys), dtype=object),
+         )
+         for el in els
      ]
  else:
      min_ind = min(el.shape[alt_axis] for el in els)
@@ -1198,6 +1207,8 @@ def make_dask_col_from_extension_dtype(
      A :class:`dask.Array`: representation of the column.
      """
      import dask.array as da
+     import xarray as xr
+     from xarray.core.indexing import LazilyIndexedArray
  
      from anndata._io.specs.lazy_methods import (
          compute_chunk_layout_for_axis_size,
@@ -1205,7 +1216,6 @@ def make_dask_col_from_extension_dtype(
          maybe_open_h5,
      )
      from anndata.compat import XDataArray
-     from anndata.compat import xarray as xr
      from anndata.experimental import read_elem_lazy
  
      base_path_or_zarr_group = col.attrs.get("base_path_or_zarr_group")
@@ -1228,9 +1238,7 @@ def make_dask_col_from_extension_dtype(
          # reopening is important to get around h5py's unserializable lock in processes
          with maybe_open_h5(base_path_or_zarr_group, elem_name) as f:
              v = read_elem_lazy(f)
-             variable = xr.Variable(
-                 data=xr.core.indexing.LazilyIndexedArray(v), dims=dims
-             )
+             variable = xr.Variable(data=LazilyIndexedArray(v), dims=dims)
              data_array = XDataArray(
                  variable,
                  coords=coords,
@@ -1323,9 +1331,10 @@ def concat_dataset2d_on_annot_axis(
      -------
      Concatenated :class:`~anndata.experimental.backed.Dataset2D`
      """
+     import xarray as xr
+ 
      from anndata._core.xarray import Dataset2D
      from anndata._io.specs.lazy_methods import DUMMY_RANGE_INDEX_KEY
-     from anndata.compat import xarray as xr
  
      annotations_re_indexed = []
      have_backed = any(a.is_backed for a in annotations)
@@ -1525,15 +1534,18 @@ def concat(  # noqa: PLR0912, PLR0913, PLR0915
      >>> inner
      AnnData object with n_obs × n_vars = 4 × 2
          obs: 'group'
-     >>> (inner.obs_names, inner.var_names)  # doctest: +NORMALIZE_WHITESPACE
-     (Index(['s1', 's2', 's3', 's4'], dtype='object'),
-     Index(['var1', 'var2'], dtype='object'))
+     >>> (
+     ...     inner.obs_names.astype("string"),
+     ...     inner.var_names.astype("string"),
+     ... )  # doctest: +NORMALIZE_WHITESPACE
+     (Index(['s1', 's2', 's3', 's4'], dtype='string'),
+     Index(['var1', 'var2'], dtype='string'))
      >>> outer = ad.concat([a, b], join="outer")  # Joining on union of variables
      >>> outer
      AnnData object with n_obs × n_vars = 4 × 3
          obs: 'group', 'measure'
-     >>> outer.var_names
-     Index(['var1', 'var2', 'var3'], dtype='object')
+     >>> outer.var_names.astype("string")
+     Index(['var1', 'var2', 'var3'], dtype='string')
      >>> outer.to_df()  # Sparse arrays are padded with zeroes by default
          var1  var2  var3
      s1     0     1     0
@@ -1638,7 +1650,7 @@ def concat(  # noqa: PLR0912, PLR0913, PLR0915
  
      # Combining indexes
      concat_indices = pd.concat(
-         [pd.Series(axis_indices(a, axis=axis)) for a in adatas], ignore_index=True
+         [axis_indices(a, axis=axis).to_series() for a in adatas], ignore_index=True
      )
      if index_unique is not None:
          concat_indices = concat_indices.str.cat(
anndata/_core/sparse_dataset.py CHANGED
@@ -392,8 +392,17 @@ def is_sparse_indexing_overridden(
  def validate_indices(
      mtx: BackedSparseMatrix, indices: tuple[Index1D, Index1D]
  ) -> tuple[Index1D, Index1D]:
-     res = mtx._validate_indices(indices)
-     return res[0] if SCIPY_1_15 else res
+     if hasattr(mtx, "_validate_indices"):
+         res = mtx._validate_indices(indices)
+         return res[0] if SCIPY_1_15 else res
+     # https://github.com/scipy/scipy/pull/23267
+     elif Version(version("scipy")) >= Version("1.17.0rc0"):
+         from scipy.sparse._index import _validate_indices  # type: ignore
+ 
+         return _validate_indices(indices, mtx.shape, mtx.format)[0]
+     else:  # pragma: no cover
+         msg = "Cannot validate indices"
+         raise RuntimeError(msg)
  
  
  class BaseCompressedSparseDataset(abc._AbstractCSDataset, ABC):
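`validate_indices` keeps working across scipy versions by feature-detecting the private `_validate_indices` method and, once scipy removes it from the sparse classes, falling back to the module-level helper introduced in scipy/scipy#23267. Extracted as a standalone sketch (it mirrors the hunk above and touches private scipy API, so treat it as illustrative):

    from importlib.metadata import version

    from packaging.version import Version


    def validate(mtx, indices):
        if hasattr(mtx, "_validate_indices"):
            # older scipy: the sparse classes still carry the private method
            return mtx._validate_indices(indices)
        if Version(version("scipy")) >= Version("1.17.0rc0"):
            # newer scipy: the logic moved to a module-level function
            from scipy.sparse._index import _validate_indices

            return _validate_indices(indices, mtx.shape, mtx.format)
        raise RuntimeError("Cannot validate indices")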
anndata/_core/views.py CHANGED
@@ -315,6 +315,7 @@ def as_view_df(df, view_args):
      if settings.remove_unused_categories:
          for col in df.columns:
              if isinstance(df[col].dtype, pd.CategoricalDtype):
+                 # TODO: this mode is going away
                  with pd.option_context("mode.chained_assignment", None):
                      df[col] = df[col].cat.remove_unused_categories()
      return DataFrameView(df, view_args=view_args)
anndata/_core/xarray.py CHANGED
@@ -3,23 +3,33 @@ from __future__ import annotations
  import warnings
  from dataclasses import dataclass
  from functools import wraps
- from typing import TYPE_CHECKING, overload
+ from typing import TYPE_CHECKING, TypeVar, overload
  
  import numpy as np
  import pandas as pd
  
- from ..compat import XDataArray, XDataset, XVariable
+ from ..compat import XDataArray, XDataset, XVariable, pandas_as_str
  
  if TYPE_CHECKING:
-     from collections.abc import Hashable, Iterable, Iterator, Mapping
+     from collections.abc import (
+         Callable,
+         Collection,
+         Hashable,
+         Iterable,
+         Iterator,
+         Mapping,
+     )
      from typing import Any, Literal
  
      from .._types import Dataset2DIlocIndexer
  
+     P = TypeVar("P")
+     R = TypeVar("R")
  
- def requires_xarray(func):
+ 
+ def requires_xarray(func: Callable[P, R]) -> Callable[P, R]:
      @wraps(func)
-     def wrapper(*args, **kwargs):
+     def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
          try:
              import xarray  # noqa: F401
          except ImportError as e:
@@ -91,7 +101,7 @@ class Dataset2D:
          return self.ds.attrs.get("is_backed", False)
  
      @is_backed.setter
-     def is_backed(self, isbacked: bool) -> bool:
+     def is_backed(self, isbacked: bool) -> None:
          if not isbacked and "is_backed" in self.ds.attrs:
              del self.ds.attrs["is_backed"]
          else:
@@ -191,18 +201,21 @@ class Dataset2D:
      @overload
      def __getitem__(self, key: Hashable) -> XDataArray: ...
      @overload
-     def __getitem__(self, key: Iterable[Hashable]) -> Dataset2D: ...
+     def __getitem__(self, key: Collection[Hashable]) -> Dataset2D: ...
      def __getitem__(
          self, key: Mapping[Any, Any] | Hashable | Iterable[Hashable]
      ) -> Dataset2D | XDataArray:
          ret = self.ds.__getitem__(key)
-         if len(key) == 0 and not isinstance(key, tuple):  # empty XDataset
+         if is_empty := (len(key) == 0 and not isinstance(key, tuple)):  # empty Dataset
              ret.coords[self.index_dim] = self.xr_index
          if isinstance(ret, XDataset):
              # If we get an xarray Dataset, we return a Dataset2D
              as_2d = Dataset2D(ret)
- 
-             as_2d.true_index_dim = self.true_index_dim
+             if not is_empty and self.true_index_dim not in [
+                 *as_2d.columns,
+                 as_2d.index_dim,
+             ]:
+                 as_2d[self.true_index_dim] = self.true_index
              as_2d.is_backed = self.is_backed
              return as_2d
          return ret
@@ -222,18 +235,21 @@ class Dataset2D:
          -------
          :class:`pandas.DataFrame` with index set accordingly.
          """
+         index_key = self.ds.attrs.get("indexing_key", None)
+         all_columns = {*self.columns, *([] if index_key is None else [index_key])}
          # https://github.com/pydata/xarray/issues/10419
          non_nullable_string_cols = {
              col
-             for col in self.columns
+             for col in all_columns
              if not self[col].attrs.get("is_nullable_string", False)
          }
          df = self.ds.to_dataframe()
-         index_key = self.ds.attrs.get("indexing_key", None)
+         for col in all_columns - non_nullable_string_cols:
+             df[col] = (
+                 pandas_as_str(df[col]) if col == index_key else df[col].astype("string")
+             )
          if df.index.name != index_key and index_key is not None:
              df = df.set_index(index_key)
-         for col in set(self.columns) - non_nullable_string_cols:
-             df[col] = df[col].astype(dtype="string")
          df.index.name = None  # matches old AnnData object
          return df
  
@@ -263,7 +279,7 @@ class Dataset2D:
          For supported setter values see :meth:`xarray.Dataset.__setitem__`.
          """
          if key == self.index_dim:
-             msg = f"Cannot set {self.index_dim} as a variable. Use `index` instead."
+             msg = f"Cannot set the index dimension {self.index_dim} as if it were a variable. Use `ds.index = ...` instead."
              raise KeyError(msg)
          if isinstance(value, tuple):
              if isinstance(value[0], tuple):
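The `requires_xarray` hunk adds type variables so the decorator no longer erases the wrapped function's signature. The fully self-contained form of this pattern uses `typing.ParamSpec` (Python >= 3.10); a minimal sketch independent of anndata:

    from __future__ import annotations

    from functools import wraps
    from typing import Callable, ParamSpec, TypeVar

    P = ParamSpec("P")
    R = TypeVar("R")


    def requires_dependency(func: Callable[P, R]) -> Callable[P, R]:
        """Raise a clear error if an optional dependency is missing."""

        @wraps(func)
        def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
            try:
                import xarray  # noqa: F401
            except ImportError as e:
                raise ImportError("this function requires xarray") from e
            return func(*args, **kwargs)

        return wrapper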
anndata/_io/h5ad.py CHANGED
@@ -41,6 +41,7 @@ if TYPE_CHECKING:
  
      from .._core.file_backing import AnnDataFileManager
      from .._core.raw import Raw
+     from .._types import StorageType
  
      T = TypeVar("T")
  
@@ -261,7 +262,7 @@ def read_h5ad(
  
      with h5py.File(filename, "r") as f:
  
-         def callback(func, elem_name: str, elem, iospec):
+         def callback(read_func, elem_name: str, elem: StorageType, iospec: IOSpec):
              if iospec.encoding_type == "anndata" or elem_name.endswith("/"):
                  return AnnData(**{
                      # This is covering up backwards compat in the anndata initializer
@@ -279,7 +280,7 @@ def read_h5ad(
              elif elem_name in {"/obs", "/var"}:
                  # Backwards compat
                  return read_dataframe(elem)
-             return func(elem)
+             return read_func(elem)
  
          adata = read_dispatched(f, callback=callback)
  
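The retyped `callback` in `read_h5ad` has the same shape as user callbacks for `anndata.experimental.read_dispatched`: it receives the default reader, the element's path, the element itself, and its IO spec, and returns whatever should stand in for that element. A minimal sketch, not a tested recipe — it assumes a modern `.h5ad` file at the hypothetical path `example.h5ad` and skips the `/raw` subtree by returning `None` for it:

    import h5py

    from anndata.experimental import read_dispatched


    def callback(read_func, elem_name, elem, iospec):
        if elem_name.startswith("/raw"):
            return None  # leave raw unset instead of reading it
        return read_func(elem)  # default reading for everything else


    with h5py.File("example.h5ad", "r") as f:  # hypothetical path
        adata = read_dispatched(f, callback=callback)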
anndata/_io/read.py CHANGED
@@ -15,7 +15,7 @@ import pandas as pd
  from scipy import sparse
  
  from .. import AnnData
- from ..compat import old_positionals
+ from ..compat import old_positionals, pandas_as_str
  from .utils import is_float
  
  if TYPE_CHECKING:
@@ -74,8 +74,8 @@ def read_excel(
  
      df = read_excel(fspath(filename), sheet)
      X = df.values[:, 1:]
-     row = dict(row_names=df.iloc[:, 0].values.astype(str))
-     col = dict(col_names=np.array(df.columns[1:], dtype=str))
+     row = dict(row_names=pandas_as_str(df.iloc[:, 0]).array)
+     col = dict(col_names=pandas_as_str(df.columns[1:]).array)
      return AnnData(X, row, col)
  
  
anndata/_io/specs/lazy_methods.py CHANGED
@@ -270,7 +270,10 @@ def _gen_xarray_dict_iterator_from_elems(
                  "base_path_or_zarr_group": v.base_path_or_zarr_group,
                  "elem_name": v.elem_name,
                  "is_nullable_string": isinstance(v, MaskedArray)
-                 and v.dtype == NULLABLE_NUMPY_STRING_TYPE,
+                 and (
+                     v.dtype == NULLABLE_NUMPY_STRING_TYPE
+                     or isinstance(v.dtype, pd.StringDtype | np.dtypes.StringDType)
+                 ),
              },
          )
      elif k == dim_name:
@@ -296,6 +299,10 @@ def read_dataframe(
      use_range_index: bool = False,
      chunks: tuple[int] | None = None,
  ) -> Dataset2D:
+     from xarray.core.indexing import BasicIndexer
+ 
+     from ...experimental.backed._lazy_arrays import MaskedArray
+ 
      elem_dict = {
          k: _reader.read_elem(elem[k], chunks=chunks)
          for k in [*elem.attrs["column-order"], elem.attrs["_index"]]
@@ -305,7 +312,12 @@ def read_dataframe(
      if not use_range_index:
          dim_name = elem.attrs["_index"]
          # no sense in reading this in multiple times since xarray requires an in-memory index
-         index = elem_dict[dim_name].compute()
+         if isinstance(elem_dict[dim_name], DaskArray):
+             index = elem_dict[dim_name].compute()
+         elif isinstance(elem_dict[dim_name], MaskedArray):
+             index = elem_dict[dim_name][BasicIndexer((slice(None),))]
+         else:
+             raise NotImplementedError()
      else:
          dim_name = DUMMY_RANGE_INDEX_KEY
          index = pd.RangeIndex(len(elem_dict[elem.attrs["_index"]])).astype("str")
anndata/_io/specs/methods.py CHANGED
@@ -25,7 +25,6 @@ from anndata._core.sparse_dataset import _CSCDataset, _CSRDataset, sparse_datase
  from anndata._io.utils import check_key, zero_dim_array_as_scalar
  from anndata._warnings import OldFormatWarning
  from anndata.compat import (
-     NULLABLE_NUMPY_STRING_TYPE,
      AwkArray,
      CupyArray,
      CupyCSCMatrix,
@@ -43,7 +42,7 @@ from anndata.compat import (
  )
  
  from ..._settings import settings
- from ...compat import is_zarr_v2
+ from ...compat import NULLABLE_NUMPY_STRING_TYPE, PANDAS_STRING_ARRAY_TYPES, is_zarr_v2
  from .registry import _REGISTRY, IOSpec, read_elem, read_elem_partial
  
  if TYPE_CHECKING:
@@ -1140,27 +1139,24 @@ def read_partial_categorical(elem, *, items=None, indices=(slice(None),)):
  @_REGISTRY.register_write(
      ZarrGroup, pd.arrays.BooleanArray, IOSpec("nullable-boolean", "0.1.0")
  )
- @_REGISTRY.register_write(
-     H5Group, pd.arrays.StringArray, IOSpec("nullable-string-array", "0.1.0")
- )
- @_REGISTRY.register_write(
-     ZarrGroup, pd.arrays.StringArray, IOSpec("nullable-string-array", "0.1.0")
- )
  def write_nullable(
      f: GroupStorageType,
      k: str,
-     v: pd.arrays.IntegerArray | pd.arrays.BooleanArray | pd.arrays.StringArray,
+     v: pd.arrays.IntegerArray
+     | pd.arrays.BooleanArray
+     | pd.arrays.StringArray
+     | pd.arrays.ArrowStringArray,
      *,
      _writer: Writer,
      dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
- ):
+ ) -> None:
      if (
-         isinstance(v, pd.arrays.StringArray)
+         isinstance(v, pd.arrays.StringArray | pd.arrays.ArrowStringArray)
          and not settings.allow_write_nullable_strings
      ):
          msg = (
              "`anndata.settings.allow_write_nullable_strings` is False, "
-             "because writing of `pd.arrays.StringArray` is new "
+             "because writing of `pd.arrays.{StringArray,ArrowStringArray}` is new "
              "and not supported in anndata < 0.11, still use by many people. "
              "Opt-in to writing these arrays by toggling the setting to True."
          )
@@ -1168,13 +1164,19 @@ def write_nullable(
      g = f.require_group(k)
      values = (
          v.to_numpy(na_value="")
-         if isinstance(v, pd.arrays.StringArray)
+         if isinstance(v, pd.arrays.StringArray | pd.arrays.ArrowStringArray)
          else v.to_numpy(na_value=0, dtype=v.dtype.numpy_dtype)
      )
      _writer.write_elem(g, "values", values, dataset_kwargs=dataset_kwargs)
      _writer.write_elem(g, "mask", v.isna(), dataset_kwargs=dataset_kwargs)
  
  
+ for store_type, array_type in product([H5Group, ZarrGroup], PANDAS_STRING_ARRAY_TYPES):
+     _REGISTRY.register_write(
+         store_type, array_type, IOSpec("nullable-string-array", "0.1.0")
+     )(write_nullable)
+ 
+ 
  def _read_nullable(
      elem: GroupStorageType,
      *,
@@ -1190,18 +1192,6 @@ def _read_nullable(
      )
  
  
- def _string_array(
-     values: np.ndarray, mask: np.ndarray
- ) -> pd.api.extensions.ExtensionArray:
-     """Construct a string array from values and mask."""
-     arr = pd.array(
-         values.astype(NULLABLE_NUMPY_STRING_TYPE),
-         dtype=pd.StringDtype(),
-     )
-     arr[mask] = pd.NA
-     return arr
- 
- 
  _REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0"))(
      read_nullable_integer := partial(_read_nullable, array_type=pd.arrays.IntegerArray)
  )
@@ -1216,12 +1206,22 @@ _REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0"))(
      read_nullable_boolean
  )
  
- _REGISTRY.register_read(H5Group, IOSpec("nullable-string-array", "0.1.0"))(
-     read_nullable_string := partial(_read_nullable, array_type=_string_array)
- )
- _REGISTRY.register_read(ZarrGroup, IOSpec("nullable-string-array", "0.1.0"))(
-     read_nullable_string
- )
+ 
+ @_REGISTRY.register_read(H5Group, IOSpec("nullable-string-array", "0.1.0"))
+ @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-string-array", "0.1.0"))
+ def _read_nullable_string(
+     elem: GroupStorageType, *, _reader: Reader
+ ) -> pd.api.extensions.ExtensionArray:
+     values = _reader.read_elem(elem["values"])
+     mask = _reader.read_elem(elem["mask"])
+     dtype = pd.StringDtype()
+ 
+     arr = pd.array(
+         values.astype(NULLABLE_NUMPY_STRING_TYPE),
+         dtype=dtype,
+     )
+     arr[mask] = pd.NA
+     return arr
  
  
  ###########
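The loop at the end of the `write_nullable` hunk replaces what would otherwise be a growing stack of decorators: `_REGISTRY.register_write(...)` returns a decorator, so it can be applied once per (store, array type) pair from `itertools.product`. A standalone sketch of the pattern with a hypothetical registry:

    from itertools import product

    REGISTRY: dict[tuple[type, type], object] = {}


    def register(store_type: type, array_type: type):
        """Hypothetical decorator factory: record (store, array) -> function."""

        def deco(func):
            REGISTRY[store_type, array_type] = func
            return func

        return deco


    def write_nullable(f, k, v):  # stand-in for the real writer
        ...


    # equivalent to stacking one @register(...) line per combination on the def
    for store_type, array_type in product([dict, list], [int, float]):
        register(store_type, array_type)(write_nullable)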
anndata/_settings.py CHANGED
@@ -102,7 +102,7 @@ def check_and_get_environ_var(
      )
  
  
- def check_and_get_bool(option, default_value):
+ def check_and_get_bool(option: str, default_value: bool) -> bool:  # noqa: FBT001
      return check_and_get_environ_var(
          f"ANNDATA_{option.upper()}",
          str(int(default_value)),
@@ -111,7 +111,7 @@ def check_and_get_bool(option, default_value):
      )
  
  
- def check_and_get_int(option, default_value):
+ def check_and_get_int(option: str, default_value: int) -> int:
      return check_and_get_environ_var(
          f"ANNDATA_{option.upper()}",
          str(int(default_value)),
@@ -431,7 +431,7 @@ settings.register(
  settings.register(
      "allow_write_nullable_strings",
      default_value=False,
-     description="Whether or not to allow writing of `pd.arrays.StringArray`.",
+     description="Whether or not to allow writing of `pd.arrays.{StringArray,ArrowStringArray}`.",
      validate=validate_bool,
      get_from_env=check_and_get_bool,
  )
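`check_and_get_bool` and `check_and_get_int` read overrides from `ANNDATA_<OPTION>` environment variables, with booleans serialized as `"0"`/`"1"` via `str(int(default_value))`. A minimal sketch of that round-trip using only the stdlib (the validation step here is an assumption about how out-of-range values would be handled):

    import os

    option, default = "allow_write_nullable_strings", False
    raw = os.environ.get(f"ANNDATA_{option.upper()}", str(int(default)))

    if raw not in {"0", "1"}:
        raise ValueError(f"invalid boolean for {option!r}: {raw!r}")
    value = bool(int(raw))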
anndata/_types.py CHANGED
@@ -130,7 +130,7 @@ class Write(Protocol[RWAble_contra]):
      v
          The element to write out.
      dataset_kwargs
-         Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`.
+         Keyword arguments to be passed to a library-level io function, like `chunks` for :mod:`zarr`.
      """
      ...
  
@@ -194,7 +194,7 @@ class WriteCallback(Protocol[RWAble]):
      iospec
          Internal AnnData encoding specification for the element.
      dataset_kwargs
-         Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`.
+         Keyword arguments to be passed to a library-level io function, like `chunks` for :mod:`zarr`.
      """
      ...
  
anndata/compat/__init__.py CHANGED
@@ -2,11 +2,12 @@ from __future__ import annotations
  
  from codecs import decode
  from collections.abc import Mapping, Sequence
+ from enum import Enum, auto
  from functools import cache, partial, singledispatch
  from importlib.metadata import version
  from importlib.util import find_spec
  from types import EllipsisType
- from typing import TYPE_CHECKING, TypeVar
+ from typing import TYPE_CHECKING, TypeVar, overload
  from warnings import warn
  
  import h5py
@@ -31,8 +32,8 @@ CSMatrix = scipy.sparse.csr_matrix | scipy.sparse.csc_matrix
  CSArray = scipy.sparse.csr_array | scipy.sparse.csc_array
  
  
- class Empty:
-     pass
+ class Empty(Enum):
+     TOKEN = auto()
  
  
  Index1DNorm = slice | NDArray[np.bool_] | NDArray[np.integer]
@@ -76,8 +77,6 @@ H5File = h5py.File
  #############################
  @cache
  def is_zarr_v2() -> bool:
-     from packaging.version import Version
- 
      return Version(version("zarr")) < Version("3.0.0")
  
  
@@ -217,10 +216,79 @@ NULLABLE_NUMPY_STRING_TYPE = (
      else np.dtypes.StringDType(na_object=pd.NA)
  )
  
+ PANDAS_SUPPORTS_NA_VALUE = Version(version("pandas")) >= Version("2.3")
+ 
+ 
+ PANDAS_STRING_ARRAY_TYPES: list[type[pd.api.extensions.ExtensionArray]] = [
+     pd.arrays.StringArray,
+     pd.arrays.ArrowStringArray,
+ ]
+ # these are removed in favor of the above classes: https://github.com/pandas-dev/pandas/pull/62149
+ try:
+     from pandas.core.arrays.string_ import StringArrayNumpySemantics
+ except ImportError:
+     pass
+ else:
+     PANDAS_STRING_ARRAY_TYPES += [StringArrayNumpySemantics]
+ try:
+     from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
+ except ImportError:
+     pass
+ else:
+     PANDAS_STRING_ARRAY_TYPES += [ArrowStringArrayNumpySemantics]
+ 
+ 
+ @overload
+ def pandas_as_str(a: pd.Index[Any]) -> pd.Index[str]: ...
+ @overload
+ def pandas_as_str(a: pd.Series[Any]) -> pd.Series[str]: ...
+ 
+ 
+ def pandas_as_str(a: pd.Index | pd.Series) -> pd.Index[str] | pd.Series[str]:
+     """Convert to fitting dtype, maintaining NA semantics if possible.
+ 
+     This is `"str"` when `pd.options.future.infer_string` is `True` (e.g. in Pandas 3+), and `"object"` otherwise.
+     """
+     if not pd.options.future.infer_string:
+         return a.astype(str)
+     if a.array.dtype == "string":  # any `pd.StringDtype`
+         return a
+     if PANDAS_SUPPORTS_NA_VALUE:
+         dtype = pd.StringDtype(na_value=a.array.dtype.na_value)
+     elif a.array.dtype.na_value is pd.NA:
+         dtype = pd.StringDtype()  # NA semantics
+     elif a.array.dtype.na_value is np.nan and find_spec("pyarrow"):  # noqa: PLW0177
+         # on pandas 2.2, this is the only way to get `np.nan` semantics
+         dtype = pd.StringDtype("pyarrow_numpy")
+     else:
+         msg = (
+             f"Converting an array with `dtype.na_value={a.array.dtype.na_value}` to a string array requires pyarrow or pandas>=2.3. "
+             "Converting to `pd.NA` semantics instead."
+         )
+         warn(msg, UserWarning, stacklevel=2)
+         dtype = pd.StringDtype()  # NA semantics
+     return a.astype(dtype)
+ 
+ 
+ V = TypeVar("V")
+ T = TypeVar("T")
+ 
+ 
+ @overload
+ def _read_attr(
+     attrs: Mapping[str, V], name: str, default: Empty = Empty.TOKEN
+ ) -> V: ...
+ 
+ 
+ @overload
+ def _read_attr(attrs: Mapping[str, V], name: str, default: T) -> V | T: ...
+ 
  
  @singledispatch
- def _read_attr(attrs: Mapping, name: str, default: Any | None = Empty):
-     if default is Empty:
+ def _read_attr(
+     attrs: Mapping[str, V], name: str, default: T | Empty = Empty.TOKEN
+ ) -> V | T:
+     if default is Empty.TOKEN:
          return attrs[name]
      else:
          return attrs.get(name, default=default)
@@ -228,8 +296,8 @@ def _read_attr(attrs: Mapping, name: str, default: Any | None = Empty):
  
  @_read_attr.register(h5py.AttributeManager)
  def _read_attr_hdf5(
-     attrs: h5py.AttributeManager, name: str, default: Any | None = Empty
- ):
+     attrs: h5py.AttributeManager, name: str, default: T | Empty = Empty.TOKEN
+ ) -> str | T:
      """
      Read an HDF5 attribute and perform all necessary conversions.
  
@@ -238,7 +306,7 @@ def _read_attr_hdf5(
      For example Julia's HDF5.jl writes string attributes as fixed-size strings, which
      are read as bytes by h5py.
      """
-     if name not in attrs and default is not Empty:
+     if name not in attrs and default is not Empty.TOKEN:
          return default
      attr = attrs[name]
      attr_id = attrs.get_id(name)
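Turning `Empty` from an empty class into a one-member `Enum` is the standard typed-sentinel pattern: `Empty.TOKEN` is the only value of type `Empty`, so annotations like `default: T | Empty = Empty.TOKEN` type-check, and `default is Empty.TOKEN` narrows the union. A minimal sketch of the same pattern outside anndata:

    from enum import Enum, auto


    class Missing(Enum):
        """One-member enum used as a 'no argument given' sentinel."""

        TOKEN = auto()


    def get(mapping: dict, key: str, default: object | Missing = Missing.TOKEN):
        if default is Missing.TOKEN:
            return mapping[key]  # no default given: raise KeyError if absent
        return mapping.get(key, default)


    assert get({"a": 1}, "a") == 1
    assert get({}, "a", default=None) is None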
anndata/experimental/_dispatch_io.py CHANGED
@@ -16,10 +16,7 @@ if TYPE_CHECKING:
      from anndata.typing import RWAble
  
  
- def read_dispatched(
-     elem: StorageType,
-     callback: ReadCallback,
- ) -> RWAble:
+ def read_dispatched(elem: StorageType, callback: ReadCallback) -> RWAble:
      """
      Read elem, calling the callback at each sub-element.
  
anndata/experimental/backed/_lazy_arrays.py CHANGED
@@ -19,16 +19,22 @@ from ...compat import (
      XZarrArrayWrapper,
      ZarrArray,
  )
- from ...compat import xarray as xr
  
  if TYPE_CHECKING:
      from pathlib import Path
      from typing import Literal
  
+     from pandas._libs.missing import NAType
+     from pandas.core.dtypes.base import ExtensionDtype
+ 
      from anndata.compat import ZarrGroup
  
      from ...compat import Index1DNorm
  
+     if TYPE_CHECKING:  # Double nesting so Sphinx can import the parent block
+         from xarray.core.extension_array import PandasExtensionArray
+         from xarray.core.indexing import ExplicitIndexer
+ 
  
  K = TypeVar("K", H5Array, ZarrArray)
  
@@ -43,14 +49,13 @@ class ZarrOrHDF5Wrapper(XZarrArrayWrapper, Generic[K]):
          self.shape = self._array.shape
          self.dtype = self._array.dtype
  
-     def __getitem__(self, key: xr.core.indexing.ExplicitIndexer):
+     def __getitem__(self, key: ExplicitIndexer):
+         from xarray.core.indexing import IndexingSupport, explicit_indexing_adapter
+ 
          if isinstance(self._array, ZarrArray):
              return super().__getitem__(key)
-         res = xr.core.indexing.explicit_indexing_adapter(
-             key,
-             self.shape,
-             xr.core.indexing.IndexingSupport.OUTER_1VECTOR,
-             self._getitem,
+         res = explicit_indexing_adapter(
+             key, self.shape, IndexingSupport.OUTER_1VECTOR, self._getitem
          )
          return res
  
@@ -109,22 +114,23 @@ class CategoricalArray(XBackendArray, Generic[K]):
  
      @cached_property
      def categories(self) -> np.ndarray:
-         if isinstance(self._categories, ZarrArray):
-             return self._categories[...]
          from anndata.io import read_elem
  
          return read_elem(self._categories)
  
-     def __getitem__(
-         self, key: xr.core.indexing.ExplicitIndexer
-     ) -> xr.core.extension_array.PandasExtensionArray:
+     def __getitem__(self, key: ExplicitIndexer) -> PandasExtensionArray:
+         from xarray.core.extension_array import PandasExtensionArray
+ 
          codes = self._codes[key]
          categorical_array = pd.Categorical.from_codes(
-             codes=codes, categories=self.categories, ordered=self._ordered
+             codes=codes,
+             # casting to numpy (string) maintains our old behavior, this will be relaxed in 0.13
+             categories=np.array(self.categories),
+             ordered=self._ordered,
          )
          if settings.remove_unused_categories:
              categorical_array = categorical_array.remove_unused_categories()
-         return xr.core.extension_array.PandasExtensionArray(categorical_array)
+         return PandasExtensionArray(categorical_array)
  
      @cached_property
      def dtype(self):
@@ -163,9 +169,9 @@ class MaskedArray(XBackendArray, Generic[K]):
          self.file_format = "zarr" if isinstance(mask, ZarrArray) else "h5"
          self.elem_name = elem_name
  
-     def __getitem__(
-         self, key: xr.core.indexing.ExplicitIndexer
-     ) -> xr.core.extension_array.PandasExtensionArray | np.ndarray:
+     def __getitem__(self, key: ExplicitIndexer) -> PandasExtensionArray | np.ndarray:
+         from xarray.core.extension_array import PandasExtensionArray
+ 
          values = self._values[key]
          mask = self._mask[key]
          if self._dtype_str == "nullable-integer":
@@ -181,10 +187,10 @@ class MaskedArray(XBackendArray, Generic[K]):
          else:
              msg = f"Invalid dtype_str {self._dtype_str}"
              raise RuntimeError(msg)
-         return xr.core.extension_array.PandasExtensionArray(extension_array)
+         return PandasExtensionArray(extension_array)
  
      @cached_property
-     def dtype(self):
+     def dtype(self) -> np.dtypes.StringDType[NAType] | ExtensionDtype:
          if self._dtype_str == "nullable-integer":
              return pd.array(
                  [],
anndata/experimental/merge.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
  
  import shutil
  from collections.abc import Mapping
+ from contextlib import ExitStack, contextmanager
  from functools import singledispatch
  from os import PathLike
  from pathlib import Path
@@ -30,10 +31,11 @@ from ..compat import H5Array, H5Group, ZarrArray, ZarrGroup
  from . import read_dispatched, read_elem_lazy
  
  if TYPE_CHECKING:
-     from collections.abc import Callable, Collection, Iterable, Sequence
+     from collections.abc import Callable, Collection, Generator, Iterable, Sequence
      from typing import Any, Literal
  
      from .._core.merge import Reindexer, StrategiesLiteral
+     from .._types import Join_T
  
  SPARSE_MATRIX = {"csc_matrix", "csr_matrix"}
  
@@ -100,35 +102,42 @@ def _gen_slice_to_append(
  
  
  @singledispatch
- def as_group(store, *, mode: str) -> ZarrGroup | H5Group:
+ @contextmanager
+ def as_group(store, *, mode: str) -> Generator[ZarrGroup | H5Group]:
      msg = "This is not yet implemented."
      raise NotImplementedError(msg)
  
  
  @as_group.register(PathLike)
  @as_group.register(str)
- def _(store: PathLike[str] | str, *, mode: str) -> ZarrGroup | H5Group:
+ @contextmanager
+ def _(store: PathLike[str] | str, *, mode: str) -> Generator[ZarrGroup | H5Group]:
      store = Path(store)
      if store.suffix == ".h5ad":
          import h5py
  
-         return h5py.File(store, mode=mode)
+         f = h5py.File(store, mode=mode)
+         try:
+             yield f
+         finally:
+             f.close()
  
-     if mode == "r":  # others all write: r+, a, w, w-
+     elif mode == "r":  # others all write: r+, a, w, w-
          import zarr
  
-         return zarr.open_group(store, mode=mode)
- 
-     from anndata._io.zarr import open_write_group
+         yield zarr.open_group(store, mode=mode)
+     else:
+         from anndata._io.zarr import open_write_group
  
-     return open_write_group(store, mode=mode)
+         yield open_write_group(store, mode=mode)
  
  
  @as_group.register(ZarrGroup)
  @as_group.register(H5Group)
- def _(store, *, mode: str) -> ZarrGroup | H5Group:
+ @contextmanager
+ def _(store: ZarrGroup | H5Group, *, mode: str) -> Generator[ZarrGroup | H5Group]:
      del mode
-     return store
+     yield store
  
  
  ###################
@@ -441,9 +450,10 @@ def _write_alt_pairwise(
      write_elem(output_group, f"{alt_axis_name}p", alt_pairwise)
  
  
- def concat_on_disk(  # noqa: PLR0912, PLR0913, PLR0915
-     in_files: Collection[PathLike[str] | str] | Mapping[str, PathLike[str] | str],
-     out_file: PathLike[str] | str,
+ def concat_on_disk(  # noqa: PLR0913
+     in_files: Collection[PathLike[str] | str | H5Group | ZarrGroup]
+     | Mapping[str, PathLike[str] | str | H5Group | ZarrGroup],
+     out_file: PathLike[str] | str | H5Group | ZarrGroup,
      *,
      max_loaded_elems: int = 100_000_000,
      axis: Literal["obs", 0, "var", 1] = 0,
@@ -584,10 +594,11 @@ def concat_on_disk(  # noqa: PLR0912, PLR0913, PLR0915
      merge = resolve_merge_strategy(merge)
      uns_merge = resolve_merge_strategy(uns_merge)
  
-     out_file = Path(out_file)
-     if not out_file.parent.exists():
-         msg = f"Parent directory of {out_file} does not exist."
-         raise FileNotFoundError(msg)
+     if is_out_path_like := isinstance(out_file, str | PathLike):
+         out_file = Path(out_file)
+         if not out_file.parent.exists():
+             msg = f"Parent directory of {out_file} does not exist."
+             raise FileNotFoundError(msg)
  
      if isinstance(in_files, Mapping):
          if keys is not None:
@@ -600,7 +611,11 @@ def concat_on_disk(  # noqa: PLR0912, PLR0913, PLR0915
      else:
          in_files = list(in_files)
  
-     if len(in_files) == 1:
+     if (
+         len(in_files) == 1
+         and isinstance(in_files[0], str | PathLike)
+         and is_out_path_like
+     ):
          shutil.copy2(in_files[0], out_file)
          return
  
@@ -610,9 +625,40 @@ def concat_on_disk(  # noqa: PLR0912, PLR0913, PLR0915
      axis, axis_name = _resolve_axis(axis)
      _, alt_axis_name = _resolve_axis(1 - axis)
  
-     output_group = as_group(out_file, mode="w")
-     groups = [as_group(f, mode="r") for f in in_files]
+     with ExitStack() as stack, as_group(out_file, mode="w") as output_group:
+         groups = [stack.enter_context(as_group(f, mode="r")) for f in in_files]
+         _concat_on_disk_inner(
+             groups=groups,
+             output_group=output_group,
+             axis=axis,
+             axis_name=axis_name,
+             alt_axis_name=alt_axis_name,
+             keys=keys,
+             max_loaded_elems=max_loaded_elems,
+             join=join,
+             label=label,
+             index_unique=index_unique,
+             fill_value=fill_value,
+             merge=merge,
+         )
  
+ 
+ def _concat_on_disk_inner(  # noqa: PLR0913
+     *,
+     groups: list[H5Group | ZarrGroup],
+     output_group: H5Group | ZarrGroup,
+     axis: Literal[0, 1],
+     axis_name: Literal["obs", "var"],
+     alt_axis_name: Literal["obs", "var"],
+     keys: np.ndarray[tuple[int], np.dtype[Any]] | Collection[str],
+     max_loaded_elems: int,
+     join: Join_T = "inner",
+     label: str | None,
+     index_unique: str | None,
+     fill_value: Any | None,
+     merge: Callable[[Collection[Mapping]], Mapping],
+ ) -> None:
+     """Internal helper to minimize the amount of indented code within the context manager"""
      use_reindexing = False
  
      alt_idxs = [_df_index(g[alt_axis_name]) for g in groups]
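`concat_on_disk` now opens every input through the context-manager form of `as_group` and keeps them all alive with a single `contextlib.ExitStack`, so each backing file is closed even when concatenation raises partway through. The same pattern with plain files, as a minimal sketch (the paths are hypothetical):

    from contextlib import ExitStack

    in_paths = ["a.txt", "b.txt", "c.txt"]  # hypothetical inputs

    with ExitStack() as stack, open("out.txt", "w") as out:
        # every file entered on the stack is closed when the with-block exits
        files = [stack.enter_context(open(p)) for p in in_paths]
        for f in files:
            out.write(f.read())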
anndata/experimental/multi_files/_anncollection.py CHANGED
@@ -489,7 +489,7 @@ class AnnCollectionView(_ConcatViewMixin, _IterateViewMixin):
              # change dtype for all keys of .obsm
              "obsm": lambda a: np.asarray(a, dtype="float32"),
              # change type only for one key of .obs
-             "obs": dict(key1=lambda c: c.astype(str)),
+             "obs": dict(key1=lambda c: c.astype("string")),
          }
          """
          return self._convert
@@ -834,7 +834,7 @@ class AnnCollection(_ConcatViewMixin, _IterateViewMixin):
              # change dtype for all keys of .obsm
              "obsm": lambda a: np.asarray(a, dtype="float32"),
              # change type only for one key of .obs
-             "obs": dict(key1=lambda c: c.astype(str)),
+             "obs": dict(key1=lambda c: c.astype("string")),
          }
          """
          return self._convert
anndata/tests/helpers.py CHANGED
@@ -322,8 +322,8 @@ def gen_adata(  # noqa: PLR0913
      random_state = np.random.default_rng()
  
      M, N = shape
-     obs_names = pd.Index(f"cell{i}" for i in range(shape[0]))
-     var_names = pd.Index(f"gene{i}" for i in range(shape[1]))
+     obs_names = pd.Index([f"cell{i}" for i in range(shape[0])], dtype="str")
+     var_names = pd.Index([f"gene{i}" for i in range(shape[1])], dtype="str")
      obs = gen_typed_df(M, obs_names, dtypes=obs_dtypes)
      var = gen_typed_df(N, var_names, dtypes=var_dtypes)
      # For #147
@@ -1166,7 +1166,8 @@ class AccessTrackingStoreBase(LocalStore):
      def reset_key_trackers(self) -> None:
          self.initialize_key_trackers(self._access_count.keys())
  
-     def assert_access_count(self, key: str, count: int):
+     def assert_access_count(self, key: str, count: int) -> None:
+         __tracebackhide__ = True
          keys_accessed = self.get_subkeys_accessed(key)
          access_count = self.get_access_count(key)
          assert self.get_access_count(key) == count, (
anndata/utils.py CHANGED
@@ -216,7 +216,7 @@ except ImportError:
      pass
  
  
- def make_index_unique(index: pd.Index, join: str = "-"):
+ def make_index_unique(index: pd.Index[str], join: str = "-") -> pd.Index[str]:
      """
      Makes the index unique by appending a number string to each duplicate index element:
      '1', '2', etc.
@@ -235,18 +235,18 @@ def make_index_unique(index: pd.Index, join: str = "-"):
      --------
      >>> from anndata import AnnData
      >>> adata = AnnData(np.ones((2, 3)), var=pd.DataFrame(index=["a", "a", "b"]))
-     >>> adata.var_names
-     Index(['a', 'a', 'b'], dtype='object')
+     >>> adata.var_names.astype("string")
+     Index(['a', 'a', 'b'], dtype='string')
      >>> adata.var_names_make_unique()
-     >>> adata.var_names
-     Index(['a', 'a-1', 'b'], dtype='object')
+     >>> adata.var_names.astype("string")
+     Index(['a', 'a-1', 'b'], dtype='string')
      """
      if index.is_unique:
          return index
      from collections import Counter
  
-     values = index.values.copy()
-     indices_dup = index.duplicated(keep="first")
+     values = index.array.copy()
+     indices_dup = index.duplicated(keep="first") & ~index.isna()
      values_dup = values[indices_dup]
      values_set = set(values)
      counter = Counter()
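`make_index_unique` now copies `index.array` instead of `index.values` (keeping extension dtypes intact) and excludes NA entries from deduplication. Its visible behavior matches the doctest above; as a quick usage check:

    import pandas as pd

    from anndata.utils import make_index_unique

    idx = pd.Index(["a", "a", "b"])
    print(make_index_unique(idx).tolist())  # ['a', 'a-1', 'b']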
anndata-0.12.6.dist-info/METADATA → anndata-0.12.7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: anndata
- Version: 0.12.6
+ Version: 0.12.7
  Summary: Annotated data.
  Project-URL: Documentation, https://anndata.readthedocs.io/
  Project-URL: Source, https://github.com/scverse/anndata
@@ -31,7 +31,7 @@ Requires-Dist: legacy-api-wrap
  Requires-Dist: natsort
  Requires-Dist: numpy>=1.26
  Requires-Dist: packaging>=24.2
- Requires-Dist: pandas!=2.1.2,>=2.1.0
+ Requires-Dist: pandas!=2.1.2,<3,>=2.1.0
  Requires-Dist: scipy>=1.12
  Requires-Dist: zarr!=3.0.*,>=2.18.7
  Provides-Extra: cu11
@@ -57,7 +57,7 @@ Requires-Dist: sphinx-copybutton; extra == 'doc'
  Requires-Dist: sphinx-design>=0.5.0; extra == 'doc'
  Requires-Dist: sphinx-issues>=5.0.1; extra == 'doc'
  Requires-Dist: sphinx-toolbox>=3.8.0; extra == 'doc'
- Requires-Dist: sphinx>=8.2.1; extra == 'doc'
+ Requires-Dist: sphinx<9,>=8.2.1; extra == 'doc'
  Requires-Dist: sphinxext-opengraph; extra == 'doc'
  Requires-Dist: towncrier>=24.8.0; extra == 'doc'
  Provides-Extra: gpu
@@ -80,12 +80,12 @@ Requires-Dist: loompy>=3.0.5; extra == 'test'
  Requires-Dist: matplotlib; extra == 'test'
  Requires-Dist: openpyxl; extra == 'test'
  Requires-Dist: pyarrow; extra == 'test'
+ Requires-Dist: pytest; extra == 'test'
  Requires-Dist: pytest-cov; extra == 'test'
  Requires-Dist: pytest-memray; extra == 'test'
  Requires-Dist: pytest-mock; extra == 'test'
  Requires-Dist: pytest-randomly; extra == 'test'
  Requires-Dist: pytest-xdist[psutil]; extra == 'test'
- Requires-Dist: pytest<8.3.4,>=8.2; extra == 'test'
  Requires-Dist: requests; extra == 'test'
  Requires-Dist: scanpy>=1.10; extra == 'test'
  Requires-Dist: scikit-learn; extra == 'test'
@@ -102,12 +102,12 @@ Requires-Dist: loompy>=3.0.5; extra == 'test-min'
  Requires-Dist: matplotlib; extra == 'test-min'
  Requires-Dist: openpyxl; extra == 'test-min'
  Requires-Dist: pyarrow; extra == 'test-min'
+ Requires-Dist: pytest; extra == 'test-min'
  Requires-Dist: pytest-cov; extra == 'test-min'
  Requires-Dist: pytest-memray; extra == 'test-min'
  Requires-Dist: pytest-mock; extra == 'test-min'
  Requires-Dist: pytest-randomly; extra == 'test-min'
  Requires-Dist: pytest-xdist[psutil]; extra == 'test-min'
- Requires-Dist: pytest<8.3.4,>=8.2; extra == 'test-min'
  Requires-Dist: scanpy>=1.10; extra == 'test-min'
  Requires-Dist: scikit-learn; extra == 'test-min'
  Description-Content-Type: text/markdown
anndata-0.12.6.dist-info/RECORD → anndata-0.12.7.dist-info/RECORD CHANGED
@@ -1,57 +1,57 @@
  anndata/__init__.py,sha256=daAzY8GGouJxCe30Lcr2pl9Jwo2dcGXHPi7WxnHpuOE,1710
- anndata/_settings.py,sha256=AvCdIKCynrP6xUjVtYpvN8dFTefsGPu6anGp4c8M58I,17532
+ anndata/_settings.py,sha256=A5duA5C2-S2vt2O53kWpBpa6PyQi4qfGm24ndudU6fs,17603
  anndata/_settings.pyi,sha256=mJQQ3I66Y3sng8K-_aGjYuo5UoNgC5syw0yuNc1FADU,1643
- anndata/_types.py,sha256=c71REP9wS7Vz2cYrNxuNjPYdnq8MJ5g04MNrSi85ATA,5427
+ anndata/_types.py,sha256=RbSN6dc46J2qDTZ9y9JXrzqfwhoCX5zL1ZPH7wTQyrM,5415
  anndata/_warnings.py,sha256=iFXa9EzPyuPbzRAzoG04oTXAyjnXhQa5zxAMZdsGLwM,702
  anndata/abc.py,sha256=jG64k59ZZ9Hfn-QWt_btZLuF7eGv_YNYwH91WdbR240,1645
  anndata/io.py,sha256=DrIo-FU6qbrdk5aVKoUIBoMttZaO5QWP4bowS9xaebI,698
  anndata/logging.py,sha256=E6nlPl-Em0yBjL5p-EcQFmhHTIUirhnZbfXbQtSVSek,1662
  anndata/types.py,sha256=FF3wDkntl6Jq35l0r_kEET33ljj9L7pmIrUr5-MLAvE,698
  anndata/typing.py,sha256=sRiAg16asjnKyXk1L4BtKWggyHMPLoxXzxTDmX3i7MY,1555
- anndata/utils.py,sha256=D4t_iQdTPeHukN4H7no0QZTIzWzclMYsWAHSBpubvCg,14758
+ anndata/utils.py,sha256=25M9B1rv3ZqHv0ZSIWpWjBLZSg4Bn_kv1ZnfRZghSaU,14829
  anndata/_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  anndata/_core/access.py,sha256=pts7fGUKgGZANSsu_qAA7L10qHM-jT1zIehbl3441OY,873
- anndata/_core/aligned_df.py,sha256=EC01OveJ0tS5bQQHc_OprYSPprl-YtJQK-kIOY_4SX0,4214
+ anndata/_core/aligned_df.py,sha256=bM9kkEFURRLeUOUMk90WxVnRC-ZsXGEDx36kDj5gC9I,4278
  anndata/_core/aligned_mapping.py,sha256=BYU1jslMWIhtFTtUMaXY8ZCyt0J4_ZsJTmj6J2yAXTQ,14257
- anndata/_core/anndata.py,sha256=PmR6vKglxX8G837lOah6OhPANALbTWYNlUWHf1A-SPw,78963
+ anndata/_core/anndata.py,sha256=e_IgHjIpXPQPJqakeH83bl8Quu9T0N--Mb_QHINayv4,79276
  anndata/_core/extensions.py,sha256=9Rsho6qnr3PJHULrYGiZHCBinBZYJK6zyf3cFsl_gBY,10425
  anndata/_core/file_backing.py,sha256=6DhBfLQPDFDpoe6wSgnOFtpC4Hnbh-UgOPbqvYDxm8g,5603
  anndata/_core/index.py,sha256=F3TQBUbWpt09Pb4MpwB7xfCI9uPuv7jrqx8X74CwVDU,13472
- anndata/_core/merge.py,sha256=vRW9z_PbobmRGDzQIVxAoFc7G9Rq-y6ry57k_XMmsLc,60460
+ anndata/_core/merge.py,sha256=wFsUotHnQsnFp84UoCp78XMw3zVfvUH5eIi-8hkb7zo,60880
  anndata/_core/raw.py,sha256=x_PwwaDQscVQOFJ38kF7sNQ47LxowpS38h2RQfU5Zwo,7925
- anndata/_core/sparse_dataset.py,sha256=mE-PRX4znkDyuum3BBBv7MJwyn4XL9C3nIQNRjZJ94w,26877
+ anndata/_core/sparse_dataset.py,sha256=R2BeSLiREiwk9FNjdLCR3VfbYatz-7BK0l2F9XqCiTk,27280
  anndata/_core/storage.py,sha256=mHzqp7YBJ-rGQFulMAx__D-Z7y4omHPyb1cP7YxfbFE,2555
- anndata/_core/views.py,sha256=DIJgnqPvh07wbLousjZbGBsMC55oyBsMbSeybQC5sIY,15019
- anndata/_core/xarray.py,sha256=JeQjTuSQEiZF8cryKDYf9d7yt-ufQEVo9x94YaczuPQ,16078
+ anndata/_core/views.py,sha256=-tiUwugw0bRYXzewruhU0xXT7nnDLdYf4CiFByLl34w,15067
+ anndata/_core/xarray.py,sha256=0de8K7YjG9mnT-dFSRoxVxgwQktjrGI9n5Yy-1YJSHg,16624
  anndata/_io/__init__.py,sha256=GTNeUZ8d8aA3sK4P33tyljIc60KapLbkqBC6J1y3l9U,346
- anndata/_io/h5ad.py,sha256=BwBEYU_SZWn3KDD3RuxltDSkyqHxE3xXUfkiPh8OG-Y,13908
- anndata/_io/read.py,sha256=MuTR6dR2WItV2y0sKYvxSO2fu7OlDjaCRYJuT5UbuBo,15933
+ anndata/_io/h5ad.py,sha256=JT5DxTXXibz2jh1mjaQB3_0QYdhJ3gv4IcWLPjKD-dw,13976
+ anndata/_io/read.py,sha256=Z0QdFkaaXmGo5a25O9N9Ej2v8U7b9oV9Umw98YtB5uA,15950
  anndata/_io/utils.py,sha256=3Lg27Q0Uo3HYlz980bG2Y02_VFIt0PiXMNIj_o-mgC4,9490
  anndata/_io/write.py,sha256=r55w6yPIIuUSLW9wyYL8GnkzHHQdAxy6xiCEw9cAC38,4811
  anndata/_io/zarr.py,sha256=Z996SZ8LV1Fpa_q8o70vHnBzNLOLlVjhf_Rs5EM_Slo,5461
  anndata/_io/specs/__init__.py,sha256=Z6l8xqa7B480U3pqrNIg4-fhUvpBW85w4xA3i3maAUM,427
- anndata/_io/specs/lazy_methods.py,sha256=hnZ3ggrVCXR_rCMVH2Of9YE3s4saWg2WODcCtj0WaIQ,12600
- anndata/_io/specs/methods.py,sha256=nn9DFcUEWvCeAPPifiFxpuYW3AmtFhNDUbxo7i0toow,46314
+ anndata/_io/specs/lazy_methods.py,sha256=aCdmmYLrOHlMyT18t3sLE2I51YGT-jDna2F3m7b_kv0,13093
+ anndata/_io/specs/methods.py,sha256=awmdbUMAP9Xjkid56LAbyWNQfKcCOrkx0BeQ6CDKek4,46422
  anndata/_io/specs/registry.py,sha256=6Z_ffk3uOIagzRPcDCvEoszcgD-U3n8wYnGiPA71ZeI,17539
- anndata/compat/__init__.py,sha256=lsLHB7je0SHSePi9noY3p7kRbOAHhZzmMT1hs_ZSXys,12702
+ anndata/compat/__init__.py,sha256=9696gHdOUz2yKih9epmT8WGSr6UX0pI8dJYTrqn0SJQ,14968
  anndata/experimental/__init__.py,sha256=polIxriEkby0iEqw-IXkUzp8k0wp92BpYY4zl4BsHH0,1648
- anndata/experimental/_dispatch_io.py,sha256=JzH8Uvewabc1gIF3L16RZnM9m2NAG28bQIQ57uP097k,1869
- anndata/experimental/merge.py,sha256=pl4MtDs_M76cTEqrJ_YJ8zyB6ID7QGzjntlAL7vp_qk,23303
+ anndata/experimental/_dispatch_io.py,sha256=gb9JUcgS1cIERjxM1PBpWDXfPkKgMevoLF0QInZfC-g,1858
+ anndata/experimental/merge.py,sha256=gWBS4HSkG8E3seIs2AS7jqqFc0Zp6JW94KWtNXApafg,24882
  anndata/experimental/backed/__init__.py,sha256=4dc9M_-_SlfUidDrbWt8PRyD_8bYjypHJ86IpdThHus,230
  anndata/experimental/backed/_compat.py,sha256=rM7CnSJEZCko5wPBFRfvZA9ZKUSpaOVcWFy5u09p1go,519
  anndata/experimental/backed/_io.py,sha256=YM5FL6sKdLyQTHUa43cF0pDNbyj2xD9X7lzUiQesV20,6681
- anndata/experimental/backed/_lazy_arrays.py,sha256=WgTYQ88w7rTAP719nGjvyNkIz8cBT2bbkKJhPIxG4_g,7467
+ anndata/experimental/backed/_lazy_arrays.py,sha256=8vcu7eyoRRlzNXyAzuY0s9CqEZCOAYoZIo-iI5d71_g,7805
  anndata/experimental/multi_files/__init__.py,sha256=T7iNLlRbe-KnLT3o7Tb7_nE4Iy_hLkG66UjBOvj2Bj8,107
- anndata/experimental/multi_files/_anncollection.py,sha256=d_d-v8X2WJTGNjAJoo2Mdykd-woSTM_oXEf2PUIqS6A,35254
+ anndata/experimental/multi_files/_anncollection.py,sha256=Ra8A4MzyFWlid5RJd0cc2d4SJeSZ2HXz3odKSqAbChw,35264
  anndata/experimental/pytorch/__init__.py,sha256=4CkgrahLO8Kc-s2bmv6lVQfDxbO3IUyV0v4ygBDkttY,95
  anndata/experimental/pytorch/_annloader.py,sha256=7mpsFV5vBfxKIje1cPjahtDZ5afkU-H663XB4FJhmok,8075
  anndata/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- anndata/tests/helpers.py,sha256=hvcLsWY-fMR4EUrwPd6laiS8kwUlIxvlVq3oeH3jf9g,37618
+ anndata/tests/helpers.py,sha256=BORIeSbcD0R_PDzi1IeR252it-aq6bL8fGN-bDR-Q1I,37689
  testing/anndata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  testing/anndata/_doctest.py,sha256=Qew0N0zLLNiPKN1CLunqY5cTinFLaEhY5GagiYfm6KI,344
- testing/anndata/_pytest.py,sha256=cg4oWbtH9J1sRNul0n2oOraU1h7cprugr27EUPGDaN0,3997
+ testing/anndata/_pytest.py,sha256=C_R-N2x9NHKZ66YLkvMLWkXQG1WiouOkBnLQpYx_62Q,3994
  testing/anndata/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- anndata-0.12.6.dist-info/METADATA,sha256=nWoP_t6iBExzxOziJ_t0KVJ_lDa6lxsNV4rPXf1EobM,9957
- anndata-0.12.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- anndata-0.12.6.dist-info/licenses/LICENSE,sha256=VcrXoEVMhtNuvMvKYGP-I5lMT8qZ_6dFf22fsL180qA,1575
- anndata-0.12.6.dist-info/RECORD,,
+ anndata-0.12.7.dist-info/METADATA,sha256=LQgLMW_q9Q4V4k0yBNpYNCPnPSUCxzyubwSDxR_RqTE,9939
+ anndata-0.12.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ anndata-0.12.7.dist-info/licenses/LICENSE,sha256=VcrXoEVMhtNuvMvKYGP-I5lMT8qZ_6dFf22fsL180qA,1575
+ anndata-0.12.7.dist-info/RECORD,,
anndata-0.12.6.dist-info/WHEEL → anndata-0.12.7.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: hatchling 1.27.0
+ Generator: hatchling 1.28.0
  Root-Is-Purelib: true
  Tag: py3-none-any
testing/anndata/_pytest.py CHANGED
@@ -16,6 +16,8 @@ from typing import TYPE_CHECKING, cast
  
  import pytest
  
+ import anndata
+ 
  if TYPE_CHECKING:
      from collections.abc import Generator, Iterable
      from pathlib import Path
@@ -23,7 +25,6 @@ if TYPE_CHECKING:
  
  @pytest.fixture(autouse=True)
  def _anndata_test_env(request: pytest.FixtureRequest) -> None:
-     import anndata
  
      if isinstance(request.node, pytest.DoctestItem):
          request.getfixturevalue("_doctest_env")