anndata 0.12.7__py3-none-any.whl → 0.12.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anndata/_core/anndata.py +6 -1
- anndata/_core/index.py +14 -12
- anndata/_core/merge.py +65 -90
- anndata/_core/xarray.py +44 -16
- anndata/_io/h5ad.py +2 -2
- anndata/_io/specs/lazy_methods.py +18 -22
- anndata/_io/specs/methods.py +19 -4
- anndata/_io/utils.py +30 -1
- anndata/_io/zarr.py +6 -2
- anndata/compat/__init__.py +23 -3
- anndata/experimental/backed/_io.py +17 -11
- anndata/experimental/backed/_lazy_arrays.py +31 -33
- anndata/experimental/merge.py +19 -13
- anndata/tests/helpers.py +14 -18
- {anndata-0.12.7.dist-info → anndata-0.12.8.dist-info}/METADATA +3 -3
- {anndata-0.12.7.dist-info → anndata-0.12.8.dist-info}/RECORD +18 -18
- {anndata-0.12.7.dist-info → anndata-0.12.8.dist-info}/WHEEL +0 -0
- {anndata-0.12.7.dist-info → anndata-0.12.8.dist-info}/licenses/LICENSE +0 -0
anndata/_core/anndata.py
CHANGED
```diff
@@ -362,7 +362,12 @@ class AnnData(metaclass=utils.DeprecationMixinMeta):  # noqa: PLW1641

         # init from file
         if filename is not None:
-            self.file = AnnDataFileManager(self, filename, filemode)
+            fileobj, filename = (
+                (filename, None)
+                if isinstance(filename, h5py.File)
+                else (None, filename)
+            )
+            self.file = AnnDataFileManager(self, filename, filemode, fileobj)
         else:
             self.file = AnnDataFileManager(self, None)
```
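The net effect of this hunk is that `AnnData` can now be constructed around an already-open `h5py.File`, which `AnnDataFileManager` receives via the new `fileobj` argument. A minimal sketch of what that enables (the path is a placeholder):

```python
import h5py
import anndata as ad

f = h5py.File("data.h5ad", "r")  # open the backing file ourselves
adata = ad.AnnData(filename=f)   # 0.12.8 reuses the handle instead of reopening a path
```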
anndata/_core/index.py
CHANGED
```diff
@@ -25,12 +25,6 @@ def _normalize_indices(
     # deal with tuples of length 1
     if isinstance(index, tuple) and len(index) == 1:
         index = index[0]
-    # deal with pd.Series
-    if isinstance(index, pd.Series):
-        index = index.values
-    if isinstance(index, tuple):
-        # TODO: The series should probably be aligned first
-        index = tuple(i.values if isinstance(i, pd.Series) else i for i in index)
     ax0, ax1 = unpack_index(index)
     ax0 = _normalize_index(ax0, names0)
     ax1 = _normalize_index(ax1, names1)
@@ -45,6 +39,9 @@ def _normalize_index(  # noqa: PLR0911, PLR0912
         msg = f"Don’t call _normalize_index with non-categorical/string names and non-range index {index}"
         raise TypeError(msg)

+    if isinstance(indexer, pd.Index | pd.Series):
+        indexer = indexer.array
+
     # the following is insanely slow for sequences,
     # we replaced it using pandas below
     def name_idx(i):
@@ -65,16 +62,21 @@ def _normalize_index(  # noqa: PLR0911, PLR0912
     elif isinstance(indexer, str):
         return index.get_loc(indexer)  # int
     elif isinstance(
-        indexer, Sequence | np.ndarray | pd.Index | CSMatrix | np.matrix | CSArray
+        indexer,
+        Sequence
+        | np.ndarray
+        | pd.api.extensions.ExtensionArray
+        | CSMatrix
+        | np.matrix
+        | CSArray,
     ):
-        if hasattr(indexer, "shape") and (
-            (indexer.shape == (index.shape[0], 1))
-            or (indexer.shape == (1, index.shape[0]))
+        if (shape := getattr(indexer, "shape", None)) is not None and (
+            shape == (index.shape[0], 1) or shape == (1, index.shape[0])
         ):
             if isinstance(indexer, CSMatrix | CSArray):
                 indexer = indexer.toarray()
             indexer = np.ravel(indexer)
-        if not isinstance(indexer, np.ndarray…
+        if not isinstance(indexer, np.ndarray):
             indexer = np.array(indexer)
             if len(indexer) == 0:
                 indexer = indexer.astype(int)
@@ -111,7 +113,7 @@ def _normalize_index(  # noqa: PLR0911, PLR0912
             return indexer.data.compute()
         return indexer.data
     msg = f"Unknown indexer {indexer!r} of type {type(indexer)}"
-    raise IndexError()
+    raise IndexError(msg)


 def _fix_slice_bounds(s: slice, length: int) -> slice:
```
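Two things worth noting above: `pd.Index`/`pd.Series` indexers are now unwrapped via `.array` before the type dispatch, so they land in the new `pd.api.extensions.ExtensionArray` branch, and the unknown-indexer error finally carries its message. A small sketch of the `.array` vs `.values` distinction this relies on:

```python
import pandas as pd

s = pd.Series([1, 2, 3], dtype="Int64")  # nullable extension dtype
print(type(s.array))   # IntegerArray: always the backing ExtensionArray
print(type(s.values))  # for plain numpy dtypes this would be a bare ndarray
```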
anndata/_core/merge.py
CHANGED
```diff
@@ -10,7 +10,7 @@ from collections.abc import Callable, Mapping, MutableSet
 from functools import partial, reduce, singledispatch
 from itertools import repeat
 from operator import and_, or_, sub
-from typing import TYPE_CHECKING, Literal, TypeVar
+from typing import TYPE_CHECKING, Literal, TypeVar, cast
 from warnings import warn

 import numpy as np
@@ -44,7 +44,7 @@ if TYPE_CHECKING:

     from anndata._types import Join_T

-    from ..compat import XDataArray
+    from ..compat import XDataArray

 T = TypeVar("T")

@@ -244,110 +244,89 @@ def as_cp_sparse(x) -> CupySparseMatrix:
 def unify_dtypes(
     dfs: Iterable[pd.DataFrame | Dataset2D],
 ) -> list[pd.DataFrame | Dataset2D]:
-    """
-    Attempts to unify datatypes from multiple dataframes.
+    """Attempt to unify datatypes from multiple dataframes.

     For catching cases where pandas would convert to object dtype.
     """
     dfs = list(dfs)
     # Get shared categorical columns
-    df_dtypes = [dict(df.dtypes) for df in dfs]
+    df_dtypes = [
+        cast("pd.Series[ExtensionDtype]", df.dtypes).to_dict()
+        if isinstance(df, pd.DataFrame)
+        else df.dtypes
+        for df in dfs
+    ]
     columns = reduce(lambda x, y: x.union(y), [df.columns for df in dfs])
-    …
+    dtypes = {
+        col: (
+            [df[col] for df in df_dtypes if col in df],
+            any(col not in df for df in df_dtypes),
+        )
+        for col in columns
+    }
     if len(dtypes) == 0:
         return dfs
-    else:
-        dfs = [df.copy(deep=False) for df in dfs]

     new_dtypes = {
         col: target_dtype
-        for col, dtypes in dtypes.items()
-        if (target_dtype := try_unifying_dtype(dtypes)) is not None
+        for col, (dts, has_missing) in dtypes.items()
+        if (target_dtype := try_unifying_dtype(dts, has_missing=has_missing))
+        is not None
     }

+    dfs = [df.copy(deep=False) for df in dfs]
     for df in dfs:
         for col, dtype in new_dtypes.items():
             if col in df:
                 df[col] = df[col].astype(dtype)
-
     return dfs


-def try_unifying_dtype(
-    col: Sequence[np.dtype | ExtensionDtype],
-) -> …
-    """
-    If dtypes can be unified, returns the dtype they would be unified to.
+def try_unifying_dtype(
+    dtypes: Sequence[np.dtype | ExtensionDtype], *, has_missing: bool
+) -> ExtensionDtype | type[object] | None:
+    """Determine unified dtype if possible.

-    Returns None if they can’t be unified, or if we can expect pandas to unify them for
-    us.
+    Returns None if they can’t be unified, or if we can expect pandas to unify them for us.

     Params
     ------
-    col
-        A list of dtypes to unify. Can be numpy dtypes or pandas dtypes.
-
+    dtypes
+        A list of dtypes to unify. Can be numpy or pandas dtypes
+    has_missing
+        Whether the result needs to accommodate missing values
     """
-    dtypes: set[pd.CategoricalDtype] = set()
     # Categorical
-    if any(isinstance(dtype, pd.CategoricalDtype) for dtype in col):
-    …
+    if any(isinstance(dtype, pd.CategoricalDtype) for dtype in dtypes):
+        if not all(isinstance(dtype, pd.CategoricalDtype) for dtype in dtypes):
+            return None
+        if TYPE_CHECKING:
+            dtypes = cast("Sequence[pd.CategoricalDtype]", dtypes)
+
+        all_categories = reduce(
+            lambda x, y: x.union(y), (dtype.categories for dtype in dtypes)
+        )
+        if not any(dtype.ordered for dtype in dtypes):
+            return pd.CategoricalDtype(natsorted(all_categories), ordered=False)
+
+        dtypes_with_categories = [
+            dtype for dtype in dtypes if len(dtype.categories) > 0
+        ]
+        if dtypes_with_categories and all(
+            len(dtype.categories) == len(all_categories)
+            and dtype.ordered
+            and np.all(all_categories == dtype.categories)
+            for dtype in dtypes_with_categories
+        ):
+            return dtypes_with_categories[0]
+
+        return object

-        if not ordered:
-            return pd.CategoricalDtype(natsorted(categories), ordered=False)
-        else:  # for xarray Datasets, see https://github.com/pydata/xarray/issues/10247
-            categories_intersection = reduce(
-                lambda x, y: x.intersection(y),
-                (
-                    dtype.categories
-                    for dtype in dtypes
-                    if not pd.isnull(dtype) and len(dtype.categories) > 0
-                ),
-            )
-            if len(categories_intersection) < len(categories):
-                return object
-            else:
-                same_orders = all(
-                    dtype.ordered
-                    for dtype in dtypes
-                    if not pd.isnull(dtype) and len(dtype.categories) > 0
-                )
-                same_orders &= all(
-                    np.all(categories == dtype.categories)
-                    for dtype in dtypes
-                    if not pd.isnull(dtype) and len(dtype.categories) > 0
-                )
-                if same_orders:
-                    return next(
-                        dtype
-                        for dtype in dtypes
-                        if not pd.isnull(dtype) and len(dtype.categories) > 0
-                    )
-                return object
     # Boolean
-    …
-        return None
-    else:
-        return None
+    if all(pd.api.types.is_bool_dtype(dtype) for dtype in dtypes) and has_missing:
+        return pd.BooleanDtype()
+
+    return None
@@ -1207,15 +1186,13 @@ def make_dask_col_from_extension_dtype(
     A :class:`dask.Array`: representation of the column.
     """
     import dask.array as da
-    import xarray as xr
-    from xarray.core.indexing import LazilyIndexedArray

     from anndata._io.specs.lazy_methods import (
         compute_chunk_layout_for_axis_size,
         get_chunksize,
         maybe_open_h5,
     )
-    from anndata.compat import …
+    from anndata.compat import xarray as xr
     from anndata.experimental import read_elem_lazy

     base_path_or_zarr_group = col.attrs.get("base_path_or_zarr_group")
@@ -1224,7 +1201,6 @@ def make_dask_col_from_extension_dtype(
         base_path_or_zarr_group is not None and elem_name is not None
     ):  # lazy, backed by store
         dims = col.dims
-        coords = col.coords.copy()
         with maybe_open_h5(base_path_or_zarr_group, elem_name) as f:
             maybe_chunk_size = get_chunksize(read_elem_lazy(f))
             chunk_size = (
@@ -1238,17 +1214,14 @@ def make_dask_col_from_extension_dtype(
         # reopening is important to get around h5py's unserializable lock in processes
         with maybe_open_h5(base_path_or_zarr_group, elem_name) as f:
             v = read_elem_lazy(f)
-            variable = xr.Variable(…
-            data_array = xr.DataArray(
-                variable,
-                coords=coords,
-                dims=dims,
+            variable = xr.Variable(
+                data=xr.core.indexing.LazilyIndexedArray(v), dims=dims
             )
             idx = tuple(
                 slice(start, stop)
                 for start, stop in block_info[None]["array-location"]
             )
-            chunk = np.array(…
+            chunk = np.array(variable.data[idx])
         return chunk

     if col.dtype == "category" or col.dtype == "string" or use_only_object_dtype:  # noqa PLR1714
@@ -1268,7 +1241,7 @@ def make_dask_col_from_extension_dtype(

 def make_xarray_extension_dtypes_dask(
     annotations: Iterable[Dataset2D], *, use_only_object_dtype: bool = False
-) -> Generator[…
+) -> Generator[Dataset2D, None, None]:
     """
     Creates a generator of Dataset2D objects with dask arrays in place of :class:`pandas.api.extensions.ExtensionArray` dtype columns.

@@ -1710,6 +1683,9 @@ def concat(  # noqa: PLR0912, PLR0913, PLR0915
             alt_annotations, use_only_object_dtype=True
         )
     )
+    for a in annotations_with_only_dask:
+        if a.true_index_dim != a.index_dim:
+            a.index = a.true_index
     annotations_with_only_dask = [
         a.ds.rename({a.true_index_dim: "merge_index"})
         for a in annotations_with_only_dask
@@ -1717,7 +1693,6 @@ def concat(  # noqa: PLR0912, PLR0913, PLR0915
     alt_annot = Dataset2D(
         xr.merge(annotations_with_only_dask, join=join, compat="override")
     )
-    alt_annot.true_index_dim = "merge_index"

     X = concat_Xs(adatas, reindexers, axis=axis, fill_value=fill_value)
```
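The rewritten `try_unifying_dtype` makes the unification rules explicit: unordered categoricals unify to the natsorted union of categories, ordered categoricals unify only when all non-empty dtypes agree exactly (otherwise `object`), and all-boolean columns become `pd.BooleanDtype()` when some frame lacks the column. A sketch of the categorical rule outside anndata:

```python
from functools import reduce

import pandas as pd
from natsort import natsorted  # anndata uses natsorted for category order

a = pd.CategoricalDtype(["a10", "a2"], ordered=False)
b = pd.CategoricalDtype(["a2", "b1"], ordered=False)
union = reduce(lambda x, y: x.union(y), (d.categories for d in (a, b)))
unified = pd.CategoricalDtype(natsorted(union), ordered=False)
print(unified.categories.tolist())  # ['a2', 'a10', 'b1'] in natural sort order
```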
anndata/_core/xarray.py
CHANGED
```diff
@@ -117,7 +117,8 @@ class Dataset2D:

     @property
     def true_index_dim(self) -> str:
-        """
+        """Key of the “true” index.
+
         Because xarray loads its coordinates/indexes in memory,
         we allow for signaling that a given variable, which is not a coordinate, is the "true" index.

@@ -130,7 +131,7 @@ class Dataset2D:
         return self.ds.attrs.get("indexing_key", self.index_dim)

     @true_index_dim.setter
-    def true_index_dim(self, val: str):
+    def true_index_dim(self, val: str | None) -> None:
         if val is None or (val == self.index_dim and "indexing_key" in self.ds.attrs):
             del self.ds.attrs["indexing_key"]
         elif val not in self.ds.dims:
@@ -146,8 +147,10 @@ class Dataset2D:

     @property
     def index(self) -> pd.Index:
-        """:…
-        …
+        """A :class:`pandas.Index` object corresponding to :attr:`anndata.experimental.backed.Dataset2D.index_dim`.
+
+        :attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.index` so this ensures usability.
+
         Returns
         -------
         The index of the of the dataframe as resolved from :attr:`~xarray.Dataset.coords`.
@@ -155,14 +158,26 @@ class Dataset2D:
         return self.ds.indexes[self.index_dim]

     @index.setter
-    def index(self, val) -> None:
+    def index(self, val: object | pd.Index | XDataArray) -> None:
         index_dim = self.index_dim
-        …
+        if (
+            isinstance(val, pd.Index | XDataArray)
+            and val.name is not None
+            and val.name != index_dim
+        ):
+            # swap the names of the dimensions out and drop the old index variable, setting `coords` in the process if `val` came from this dataset.
+            self._ds = self.ds.swap_dims({index_dim: val.name}).drop_vars(index_dim)
+            # swapping dims only changes the name, but not the underlying value i.e., the coordinate, if the underlying value was not present in the dataset.
+            # If we were to `__setitem__` on `.coords` without checking, `val` could have the old `index_dim` as its `name` because it was present in the dataset.
+            if val.name not in self.ds.coords:
+                self.ds.coords[val.name] = val
+            self._validate_shape_invariants(self._ds)
+        else:
+            self.ds.coords[index_dim] = (index_dim, val)
         # without `indexing_key` explicitly set on `self.ds.attrs`, `self.true_index_dim` will use the `self.index_dim`
-        if "indexing_key" in self.ds.attrs…
+        if "indexing_key" in self.ds.attrs and (
+            hasattr(val, "name") and val.name == self.ds.attrs["indexing_key"]
+        ):
             del self.ds.attrs["indexing_key"]

     @property
@@ -172,12 +187,14 @@ class Dataset2D:

     @property
     def true_index(self) -> pd.Index:
-        """:attr:`~anndata.experimental.backed.Dataset2D.true_xr_index` as a :class:`pandas.Index…
-        …
+        """:attr:`~anndata.experimental.backed.Dataset2D.true_xr_index` as a :class:`pandas.Index`."""
+        idx = self.true_xr_index.to_index()
+        idx.name = self.true_xr_index.name
+        return idx

     @property
     def shape(self) -> tuple[int, int]:
-        """:attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.shape` so this ensures usability
+        """:attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.shape` so this ensures usability.

         Returns
         -------
@@ -187,7 +204,7 @@ class Dataset2D:

     @property
     def iloc(self) -> Dataset2DIlocIndexer:
-        """:attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.iloc` so this ensures usability
+        """:attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.iloc` so this ensures usability.

         Returns
         -------
@@ -268,6 +285,17 @@ class Dataset2D:
         columns.discard(index_key)
         return pd.Index(columns)

+    @columns.setter
+    def columns(self, val) -> None:
+        if len(self.columns.symmetric_difference(val)) > 0:
+            msg = "Trying to rename the keys of the mapping with new names - please use a different API to rename the keys of the underlying dataset mapping."
+            raise ValueError(msg)
+        warnings.warn(
+            "Renaming or reordering columns on `Dataset2D` has no effect because the underlying data structure has no apparent ordering on its keys",
+            UserWarning,
+            stacklevel=2,
+        )
+
     def __setitem__(
         self, key: Hashable | Iterable[Hashable] | Mapping, value: Any
     ) -> None:
@@ -348,9 +376,9 @@ class Dataset2D:
         return len(self.ds)

     @property
-    def dtypes(self) -> …
+    def dtypes(self) -> Mapping[Hashable, np.dtype]:
         """
-        Return a …
+        Return a Mapping with the dtypes of the variables in the Dataset2D.
         """
         return self.ds.dtypes
```
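The new `index` setter distinguishes a named index from a plain array of values: a `pd.Index` or `XDataArray` whose `name` differs from the current dimension triggers an xarray dimension swap rather than an in-place overwrite. A sketch of that swap at the xarray level (dataset and names are hypothetical):

```python
import xarray as xr

ds = xr.Dataset({"cell_id": ("obs", ["c1", "c2"])}, coords={"obs": [0, 1]})
# promote the "cell_id" variable to the dimension coordinate and drop the
# old "obs" index, as the setter does for a differently named index
ds = ds.swap_dims({"obs": "cell_id"}).drop_vars("obs")
print(ds.indexes["cell_id"])
```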
anndata/_io/h5ad.py
CHANGED
```diff
@@ -172,9 +172,8 @@ def write_sparse_as_dense(
 def read_h5ad_backed(
     filename: str | PathLike[str], mode: Literal["r", "r+"]
 ) -> AnnData:
-    d = dict(filename=filename, filemode=mode)
-
     f = h5py.File(filename, mode)
+    d = dict(filename=f)

     attributes = ["obsm", "varm", "obsp", "varp", "uns", "layers"]
     df_attributes = ["obs", "var"]
@@ -191,6 +190,7 @@ def read_h5ad_backed(
         d["raw"] = _read_raw(f, attrs={"var", "varm"})

     adata = AnnData(**d)
+    assert adata.file._file is f

     # Backwards compat to <0.7
     if isinstance(f["obs"], h5py.Dataset):
```
anndata/_io/specs/lazy_methods.py
CHANGED
```diff
@@ -16,11 +16,12 @@ from anndata._core.xarray import Dataset2D, requires_xarray
 from anndata.abc import CSCDataset, CSRDataset
 from anndata.compat import (
     NULLABLE_NUMPY_STRING_TYPE,
+    NUMPY_2,
     DaskArray,
     H5Array,
     H5Group,
-    XDataArray,
     XDataset,
+    XVariable,
     ZarrArray,
     ZarrGroup,
 )
@@ -248,24 +249,18 @@ def _gen_xarray_dict_iterator_from_elems(
     elem_dict: dict[str, LazyDataStructures],
     dim_name: str,
     index: np.NDArray,
-) -> Generator[tuple[str, …
+) -> Generator[tuple[str, XVariable], None, None]:
     from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray

-    from ...compat import XDataArray
     from ...compat import xarray as xr

     for k, v in elem_dict.items():
         if isinstance(v, DaskArray) and k != dim_name:
-            …
+            variable = xr.Variable([dim_name], data=v)
         elif isinstance(v, CategoricalArray | MaskedArray) and k != dim_name:
             variable = xr.Variable(
-                …
-            data_array = XDataArray(
-                variable,
-                coords=[index],
-                dims=[dim_name],
-                name=k,
+                [dim_name],
+                data=xr.core.indexing.LazilyIndexedArray(v),
                 attrs={
                     "base_path_or_zarr_group": v.base_path_or_zarr_group,
                     "elem_name": v.elem_name,
@@ -277,13 +272,11 @@ def _gen_xarray_dict_iterator_from_elems(
                 },
             )
         elif k == dim_name:
-            data_array = XDataArray(
-                index, coords=[index], dims=[dim_name], name=dim_name
-            )
+            variable = xr.Variable([dim_name], data=index)
         else:
             msg = f"Could not read {k}: {v} from into xarray Dataset2D"
             raise ValueError(msg)
-        yield k, data_array
+        yield k, variable


 DUMMY_RANGE_INDEX_KEY = "_anndata_dummy_range_index"
@@ -325,11 +318,9 @@ def read_dataframe(
         _gen_xarray_dict_iterator_from_elems(elem_dict, dim_name, index)
     )
     if use_range_index:
-        elem_xarray_dict[DUMMY_RANGE_INDEX_KEY] = …
-            …
-            dims=[DUMMY_RANGE_INDEX_KEY],
-            name=DUMMY_RANGE_INDEX_KEY,
+        elem_xarray_dict[DUMMY_RANGE_INDEX_KEY] = XVariable(
+            [DUMMY_RANGE_INDEX_KEY],
+            data=index,
         )
     ds = Dataset2D(XDataset(elem_xarray_dict))
     ds.is_backed = True
@@ -377,9 +368,14 @@ def read_nullable(
         Path(filename(elem)) if isinstance(elem, H5Group) else elem
     )
     elem_name = get_elem_name(elem)
+    values = elem["values"]
+    # HDF5 stores strings as bytes; use .astype("T") to decode on access
+    # h5py recommends .astype("T") over .asstr() when using numpy ≥2
+    if encoding_type == "nullable-string-array" and isinstance(elem, H5Group):
+        values = values.astype("T") if NUMPY_2 else values.asstr()
     return MaskedArray(
-        values=elem["values"],
-        mask=elem["mask"],
+        values=values,
+        mask=elem["mask"],
         dtype_str=encoding_type,
         base_path_or_zarr_group=base_path_or_zarr_group,
         elem_name=elem_name,
```
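`read_nullable` now decodes HDF5 byte strings lazily: under numpy ≥ 2 it requests the variable-width string dtype via `.astype("T")`, falling back to `.asstr()` otherwise. A hedged sketch of the decode step on its own (file and group paths are hypothetical):

```python
import h5py
import numpy as np

# the real compat flag compares packaging.Version objects
NUMPY_2 = np.lib.NumpyVersion(np.__version__) >= "2.0.0"

with h5py.File("example.h5ad", "r") as f:
    values = f["obs/batch/values"]  # bytes on disk
    view = values.astype("T") if NUMPY_2 else values.asstr()
    decoded = view[...]             # decoded to strings on access
```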
anndata/_io/specs/methods.py
CHANGED
```diff
@@ -95,9 +95,21 @@ GLOBAL_LOCK = Lock()
 #     return False


-def zarr_v3_compressor_compat(dataset_kwargs) -> dict:
-    …
+def zarr_v3_compressor_compat(dataset_kwargs: dict) -> dict:
+    """Handle mismatch between our compressor kwarg and :func:`zarr.create_array` in v3's `compressors` arg
+    See https://zarr.readthedocs.io/en/stable/api/zarr/create/#zarr.create_array
+
+    Parameters
+    ----------
+    dataset_kwarg
+        The kwarg dict potentially containing "compressor"
+
+    Returns
+    -------
+    The kwarg dict with "compressor" moved to "compressors" if zarr v3 is in use.
+    """
+    if not is_zarr_v2() and "compressor" in dataset_kwargs:
+        dataset_kwargs["compressors"] = dataset_kwargs.pop("compressor")
     return dataset_kwargs


@@ -1098,7 +1110,10 @@ def write_categorical(

     _writer.write_elem(g, "codes", v.codes, dataset_kwargs=dataset_kwargs)
     _writer.write_elem(
-        g, …
+        g,
+        "categories",
+        v.categories.to_numpy(),
+        dataset_kwargs=dataset_kwargs,
     )
```
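The now-documented shim exists because zarr v2 datasets took a `compressor` kwarg while zarr v3's `zarr.create_array` takes `compressors`. A before/after sketch of the rename (the compressor value is a placeholder):

```python
dataset_kwargs = {"compressor": "zstd-codec", "chunks": (1000,)}
# what zarr_v3_compressor_compat does when zarr v3 is installed:
dataset_kwargs["compressors"] = dataset_kwargs.pop("compressor")
assert dataset_kwargs == {"chunks": (1000,), "compressors": "zstd-codec"}
```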
anndata/_io/utils.py
CHANGED
```diff
@@ -1,17 +1,22 @@
 from __future__ import annotations

 from collections.abc import Callable
-from functools import WRAPPER_ASSIGNMENTS, wraps
+from functools import WRAPPER_ASSIGNMENTS, cache, wraps
 from itertools import pairwise
 from typing import TYPE_CHECKING, Literal, cast
 from warnings import warn

+import numpy as np
+import pandas as pd
+
 from .._core.sparse_dataset import BaseCompressedSparseDataset

 if TYPE_CHECKING:
     from collections.abc import Callable, Mapping
     from typing import Any, Literal

+    from pandas.core.dtypes.dtypes import BaseMaskedDtype
+
     from .._types import StorageType, _WriteInternal
     from ..compat import H5Group, ZarrGroup
     from ..typing import RWAble
@@ -119,6 +124,30 @@ def check_key(key):
         raise TypeError(msg)


+@cache
+def pandas_nullable_dtype(dtype: np.dtype) -> BaseMaskedDtype:
+    """Infer nullable dtype from numpy dtype.
+
+    There is no public pandas API for this, so this is the cleanest way.
+    See <https://github.com/pandas-dev/pandas/issues/63608>
+    """
+    try:
+        from pandas.core.dtypes.dtypes import BaseMaskedDtype
+    except ImportError:
+        pass
+    else:
+        return BaseMaskedDtype.from_numpy_dtype(dtype)
+
+    match dtype.kind:
+        case "b":
+            array_type = pd.arrays.BooleanArray
+        case "i" | "u":
+            array_type = pd.arrays.IntegerArray
+        case _:
+            raise NotImplementedError
+    return array_type(np.ones(1, dtype), np.ones(1, bool)).dtype
+
+
 # -------------------------------------------------------------------------------
 # Generic functions
 # -------------------------------------------------------------------------------
```
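Usage sketch for the new helper; as its docstring notes, `BaseMaskedDtype` lives on a pandas-internal path, hence the guarded import and the numpy-kind fallback:

```python
import numpy as np

from anndata._io.utils import pandas_nullable_dtype

print(pandas_nullable_dtype(np.dtype("int32")))  # Int32
print(pandas_nullable_dtype(np.dtype("bool")))   # boolean
# kinds other than bool/int are only handled when the internal import succeeds
```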
anndata/_io/zarr.py
CHANGED
```diff
@@ -150,8 +150,12 @@ def read_dataframe(group: zarr.Group | zarr.Array) -> pd.DataFrame:
 def open_write_group(
     store: StoreLike, *, mode: AccessModeLiteral = "w", **kwargs
 ) -> zarr.Group:
-    if …
-    …
+    if "zarr_format" not in kwargs:
+        if settings.zarr_write_format == 2 or is_zarr_v2():
+            msg = "Writing zarr v2 data will no longer be the default in the next minor release. v3 data will be written by default. If you are explicitly setting this configuration, consider migrating to the zarr v3 file format."
+            warn(msg, UserWarning, stacklevel=2)
+        if not is_zarr_v2():
+            kwargs["zarr_format"] = settings.zarr_write_format
     return zarr.open_group(store, mode=mode, **kwargs)
```
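Callers who already opt in to a zarr format never see the new warning. A sketch of opting in explicitly via the documented settings object:

```python
import numpy as np

import anndata as ad

ad.settings.zarr_write_format = 3  # write zarr v3 data; avoids the warning path
adata = ad.AnnData(np.zeros((2, 2), dtype="f4"))
adata.write_zarr("out.zarr")
```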
anndata/compat/__init__.py
CHANGED
```diff
@@ -51,6 +51,7 @@ Index1D = (
     | Sequence[bool]
     | pd.Series  # bool, int, str
     | pd.Index
+    | pd.api.extensions.ExtensionArray  # bool | int | str
     | NDArray[np.str_]
     | np.matrix  # bool
     | CSMatrix  # bool
@@ -71,6 +72,26 @@ H5Group = h5py.Group
 H5Array = h5py.Dataset
 H5File = h5py.File

+# h5py recommends using .astype("T") over .asstr() when using numpy ≥2
+if TYPE_CHECKING:
+    from h5py._hl.dataset import AsStrView as H5AsStrView
+    from h5py._hl.dataset import AsTypeView as H5AsTypeView
+else:
+    try:
+        try:
+            from h5py._hl.dataset import AsStrView as H5AsStrView
+            from h5py._hl.dataset import AsTypeView as H5AsTypeView
+        except ImportError:
+            # h5py 3.11 uses AsStrWrapper/AstypeWrapper (lowercase 't')
+            from h5py._hl.dataset import AsStrWrapper as H5AsStrView
+            from h5py._hl.dataset import AstypeWrapper as H5AsTypeView
+    except ImportError:  # pragma: no cover
+        warn("AsTypeView changed import location", DeprecationWarning, stacklevel=1)
+        _ds = h5py.File.in_memory().create_dataset("x", shape=(), dtype="S1")
+        H5AsStrView = type(_ds.asstr())
+        H5AsTypeView = type(_ds.astype("U1"))
+        del _ds
+

 #############################
 # Optional deps
@@ -209,11 +230,10 @@ else:
 # IO helpers
 #############################

+NUMPY_2 = Version(version("numpy")) >= Version("2")

 NULLABLE_NUMPY_STRING_TYPE = (
-    np.dtype("O")
-    if Version(version("numpy")) < Version("2")
-    else np.dtypes.StringDType(na_object=pd.NA)
+    np.dtypes.StringDType(na_object=pd.NA) if NUMPY_2 else np.dtype("O")
 )

 PANDAS_SUPPORTS_NA_VALUE = Version(version("pandas")) >= Version("2.3")
```
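The aliases give downstream code a stable name for h5py's decoded-view wrapper classes, whose import paths and names vary across h5py releases. A sketch of the isinstance check they enable (file and dataset paths are hypothetical):

```python
import h5py

from anndata.compat import H5AsStrView

with h5py.File("example.h5ad", "r") as f:
    view = f["obs/name/values"].asstr()  # decoded string view, not a plain Dataset
    assert isinstance(view, H5AsStrView)
```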
anndata/experimental/backed/_io.py
CHANGED
```diff
@@ -59,19 +59,25 @@ def read_lazy(
     Preparing example objects

     >>> import anndata as ad
-    >>> …
+    >>> import pooch
     >>> import scanpy as sc
     >>> base_url = "https://datasets.cellxgene.cziscience.com"
-    >>> …
-    …
-    ... …
-    ... …
-    ... …
-    ... …
-    ... …
-    ... …
-    >>> path_b_cells = get_cellxgene_data(
-    …
+    >>> # To update hashes: pooch.retrieve(url, known_hash=None) prints the new hash
+    >>> def get_cellxgene_data(id_: str, hash_: str):
+    ...     return pooch.retrieve(
+    ...         f"{base_url}/{id_}.h5ad",
+    ...         known_hash=hash_,
+    ...         fname=f"{id_}.h5ad",
+    ...         path=sc.settings.datasetdir,
+    ...     )
+    >>> path_b_cells = get_cellxgene_data(
+    ...     "a93eab58-3d82-4b61-8a2f-d7666dcdb7c4",
+    ...     "sha256:dac90fe2aa8b78aee2c1fc963104592f8eff7b873ca21d01a51a5e416734651c",
+    ... )
+    >>> path_fetal = get_cellxgene_data(
+    ...     "d170ff04-6da0-4156-a719-f8e1bbefbf53",
+    ...     "sha256:d497eebca03533919877b6fc876e8c9d8ba063199ddc86dd9fbcb9d1d87a3622",
+    ... )
     >>> b_cells_adata = ad.experimental.read_lazy(path_b_cells)
     >>> fetal_adata = ad.experimental.read_lazy(path_fetal)
     >>> print(b_cells_adata)
```
anndata/experimental/backed/_lazy_arrays.py
CHANGED
```diff
@@ -10,10 +10,13 @@ from anndata._core.index import _subset
 from anndata._core.views import as_view
 from anndata._io.specs.lazy_methods import get_chunksize

+from ..._io.utils import pandas_nullable_dtype
 from ..._settings import settings
 from ...compat import (
     NULLABLE_NUMPY_STRING_TYPE,
     H5Array,
+    H5AsStrView,
+    H5AsTypeView,
     XBackendArray,
     XDataArray,
     XZarrArrayWrapper,
@@ -24,8 +27,9 @@ if TYPE_CHECKING:
     from pathlib import Path
     from typing import Literal

+    from numpy.typing import NDArray
     from pandas._libs.missing import NAType
-    from pandas.core.dtypes.…
+    from pandas.core.dtypes.dtypes import BaseMaskedDtype

     from anndata.compat import ZarrGroup

@@ -36,12 +40,13 @@ if TYPE_CHECKING:
     from xarray.core.indexing import ExplicitIndexer


-K = TypeVar("K", H5Array, ZarrArray)
+K = TypeVar("K", H5Array | H5AsStrView | H5AsTypeView, ZarrArray)


 class ZarrOrHDF5Wrapper(XZarrArrayWrapper, Generic[K]):
-    def __init__(self, array: K):
-        …
+    def __init__(self, array: K) -> None:
+        # AstypeView from h5py .astype() lacks chunks attribute
+        self.chunks = getattr(array, "chunks", None)
         if isinstance(array, ZarrArray):
             super().__init__(array)
             return
@@ -73,7 +78,7 @@ class ZarrOrHDF5Wrapper(XZarrArrayWrapper, Generic[K]):
         if (
             isinstance(key, np.ndarray)
             and np.issubdtype(key.dtype, np.integer)
-            and isinstance(self._array, H5Array)
+            and isinstance(self._array, H5Array | H5AsTypeView | H5AsStrView)
         ):
             key_mask = np.zeros(self._array.shape).astype("bool")
             key_mask[key] = True
@@ -89,7 +94,7 @@ class CategoricalArray(XBackendArray, Generic[K]):
     """

     _codes: ZarrOrHDF5Wrapper[K]
-    _categories: …
+    _categories: K
     shape: tuple[int, ...]
     base_path_or_zarr_group: Path | ZarrGroup
     elem_name: str
@@ -97,7 +102,7 @@ class CategoricalArray(XBackendArray, Generic[K]):
     def __init__(
         self,
         codes: K,
-        categories: …
+        categories: K,
         base_path_or_zarr_group: Path | ZarrGroup,
         elem_name: str,
         *args,
@@ -153,11 +158,11 @@ class MaskedArray(XBackendArray, Generic[K]):

     def __init__(
         self,
-        values: …
+        values: K,
         dtype_str: Literal[
             "nullable-integer", "nullable-boolean", "nullable-string-array"
         ],
-        mask: …
+        mask: K,
         base_path_or_zarr_group: Path | ZarrGroup,
         elem_name: str,
     ):
@@ -169,40 +174,33 @@ class MaskedArray(XBackendArray, Generic[K]):
         self.file_format = "zarr" if isinstance(mask, ZarrArray) else "h5"
         self.elem_name = elem_name

-    def __getitem__(
-        …
+    def __getitem__(
+        self, key: ExplicitIndexer
+    ) -> PandasExtensionArray | NDArray[np.str_]:
         values = self._values[key]
         mask = self._mask[key]
-
-        if self._dtype_str == "nullable-integer":
-            extension_array = pd.arrays.IntegerArray(values, mask=mask)
-        elif self._dtype_str == "nullable-boolean":
-            extension_array = pd.arrays.BooleanArray(values, mask=mask)
-        elif self._dtype_str == "nullable-string-array":
+
+        if isinstance(self.dtype, np.dtypes.StringDType):
             # https://github.com/pydata/xarray/issues/10419
             values = values.astype(self.dtype)
             values[mask] = pd.NA
             return values
-        …
+
+        from xarray.core.extension_array import PandasExtensionArray
+
+        cls = self.dtype.construct_array_type()
+        return PandasExtensionArray(cls(values, mask))

     @cached_property
-    def dtype(self) -> np.dtypes.StringDType[NAType]…
-        if self._dtype_str == "nullable-integer":
-            return pd.array(
-                [],
-                dtype=str(pd.api.types.pandas_dtype(self._values.dtype)).capitalize(),
-            ).dtype
-        elif self._dtype_str == "nullable-boolean":
-            return pd.BooleanDtype()
-        elif self._dtype_str == "nullable-string-array":
+    def dtype(self) -> BaseMaskedDtype | np.dtypes.StringDType[NAType]:
+        if self._dtype_str == "nullable-string-array":
             # https://github.com/pydata/xarray/issues/10419
             return NULLABLE_NUMPY_STRING_TYPE
-        …
+        try:
+            return pandas_nullable_dtype(self._values.dtype)
+        except NotImplementedError:
+            msg = f"Invalid dtype_str {self._dtype_str}"
+            raise RuntimeError(msg) from None


 @_subset.register(XDataArray)
```
anndata/experimental/merge.py
CHANGED
```diff
@@ -551,19 +551,25 @@ def concat_on_disk(  # noqa: PLR0913

     First, let’s get some “big” datasets with a compatible ``var`` axis:

-    >>> import …
+    >>> import pooch
     >>> import scanpy as sc
     >>> base_url = "https://datasets.cellxgene.cziscience.com"
-    >>> …
-    …
-    ... …
-    ... …
-    ... …
-    ... …
-    ... …
-    ... …
-    >>> path_b_cells = get_cellxgene_data(
-    …
+    >>> # To update hashes: pooch.retrieve(url, known_hash=None) prints the new hash
+    >>> def get_cellxgene_data(id_: str, hash_: str):
+    ...     return pooch.retrieve(
+    ...         f"{base_url}/{id_}.h5ad",
+    ...         known_hash=hash_,
+    ...         fname=f"{id_}.h5ad",
+    ...         path=sc.settings.datasetdir,
+    ...     )
+    >>> path_b_cells = get_cellxgene_data(
+    ...     'a93eab58-3d82-4b61-8a2f-d7666dcdb7c4',
+    ...     'sha256:dac90fe2aa8b78aee2c1fc963104592f8eff7b873ca21d01a51a5e416734651c',
+    ... )
+    >>> path_fetal = get_cellxgene_data(
+    ...     'd170ff04-6da0-4156-a719-f8e1bbefbf53',
+    ...     'sha256:d497eebca03533919877b6fc876e8c9d8ba063199ddc86dd9fbcb9d1d87a3622',
+    ... )

     Now we can concatenate them on-disk:

@@ -613,10 +619,10 @@ def concat_on_disk(  # noqa: PLR0913

     if (
         len(in_files) == 1
-        and isinstance(in_files[0], str | PathLike)
+        and isinstance(in_file := in_files[0], str | PathLike)
         and is_out_path_like
     ):
-        shutil.copy2(in_files[0], out_file)
+        (shutil.copytree if in_file.is_dir() else shutil.copy2)(in_file, out_file)
         return

     if keys is None:
```
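The single-input fast path previously assumed a file and used `shutil.copy2`, which fails for zarr stores since they are directories. A sketch of the dispatch, with hypothetical paths:

```python
import shutil
from pathlib import Path

in_file, out_file = Path("in.zarr"), Path("out.zarr")
# zarr stores are directories; .h5ad files are single files
(shutil.copytree if in_file.is_dir() else shutil.copy2)(in_file, out_file)
```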
anndata/tests/helpers.py
CHANGED
```diff
@@ -75,12 +75,13 @@ DEFAULT_KEY_TYPES = (
 DEFAULT_COL_TYPES = (
     pd.CategoricalDtype(ordered=False),
     pd.CategoricalDtype(ordered=True),
-    np.int64,
-    np.float64,
-    np.uint8,
-    np.bool_,
-    pd.BooleanDtype,
-    pd.Int32Dtype,
+    np.dtype(np.int64),
+    np.dtype(np.float64),
+    np.dtype(np.uint8),
+    np.dtype(bool),
+    pd.BooleanDtype(),
+    pd.Int32Dtype(),
+    pd.UInt8Dtype(),
 )


@@ -108,13 +109,11 @@ def gen_vstr_recarray(m, n, dtype=None):


 def issubdtype(
-    a: np.dtype | pd.api.extensions.ExtensionDtype | type,
-    b: type[DT] | tuple[type[DT], ...],
+    a: np.dtype | pd.api.extensions.ExtensionDtype, b: type[DT] | tuple[type[DT], ...]
 ) -> TypeGuard[DT]:
+    assert not isinstance(a, type)
     if isinstance(b, tuple):
         return any(issubdtype(a, t) for t in b)
-    if isinstance(a, type) and issubclass(a, pd.api.extensions.ExtensionDtype):
-        return issubclass(a, b)
     if isinstance(a, pd.api.extensions.ExtensionDtype):
         return isinstance(a, b)
     try:
@@ -126,6 +125,7 @@ def issubdtype(
 def gen_random_column(  # noqa: PLR0911
     n: int, dtype: np.dtype | pd.api.extensions.ExtensionDtype
 ) -> tuple[str, np.ndarray | pd.api.extensions.ExtensionArray]:
+    assert isinstance(dtype, np.dtype | pd.api.extensions.ExtensionDtype)
     if issubdtype(dtype, pd.CategoricalDtype):
         # TODO: Think about allowing index to be passed for n
         letters = np.fromiter(iter(ascii_letters), "U1")
@@ -142,13 +142,9 @@ def gen_random_column(  # noqa: PLR0911
         ),
     )
     if issubdtype(dtype, IntegerDtype):
-        return (
-            "nullable-int",
-            pd.arrays.IntegerArray(
-                np.random.randint(0, 1000, size=n, dtype=np.int32),
-                mask=np.random.randint(0, 2, size=n, dtype=bool),
-            ),
-        )
+        name, values = gen_random_column(n, dtype.numpy_dtype)
+        mask = np.random.randint(0, 2, size=n, dtype=bool)
+        return f"nullable-{name}", pd.arrays.IntegerArray(values, mask)
     if issubdtype(dtype, pd.StringDtype):
         letters = np.fromiter(iter(ascii_letters), "U1")
         array = pd.array(np.random.choice(letters, n), dtype=pd.StringDtype())
@@ -162,7 +158,7 @@ def gen_random_column(  # noqa: PLR0911
     if not issubdtype(dtype, np.number):  # pragma: no cover
         pytest.fail(f"Unexpected dtype: {dtype}")

-    n_bits = 8 * …
+    n_bits = 8 * dtype.itemsize
     if issubdtype(dtype, np.unsignedinteger):
         return f"uint{n_bits}", np.random.randint(0, 255, n, dtype=dtype)
```
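With `DEFAULT_COL_TYPES` now holding dtype instances, the nullable-integer branch can reuse the numpy branch through `numpy_dtype` and only add the mask. A standalone sketch:

```python
import numpy as np
import pandas as pd

dtype = pd.Int32Dtype()
values = np.random.randint(0, 1000, size=5, dtype=dtype.numpy_dtype)  # int32 values
mask = np.random.randint(0, 2, size=5, dtype=bool)                    # True marks missing
col = pd.arrays.IntegerArray(values, mask)
```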
{anndata-0.12.7.dist-info → anndata-0.12.8.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: anndata
-Version: 0.12.7
+Version: 0.12.8
 Summary: Annotated data.
 Project-URL: Documentation, https://anndata.readthedocs.io/
 Project-URL: Source, https://github.com/scverse/anndata
@@ -74,11 +74,11 @@ Requires-Dist: boltons; extra == 'test'
 Requires-Dist: dask[array]!=2024.8.*,!=2024.9.*,!=2025.2.*,!=2025.3.*,!=2025.4.*,!=2025.5.*,!=2025.6.*,!=2025.7.*,!=2025.8.*,>=2023.5.1; extra == 'test'
 Requires-Dist: dask[distributed]; extra == 'test'
 Requires-Dist: filelock; extra == 'test'
-Requires-Dist: httpx<1.0; extra == 'test'
 Requires-Dist: joblib; extra == 'test'
 Requires-Dist: loompy>=3.0.5; extra == 'test'
 Requires-Dist: matplotlib; extra == 'test'
 Requires-Dist: openpyxl; extra == 'test'
+Requires-Dist: pooch; extra == 'test'
 Requires-Dist: pyarrow; extra == 'test'
 Requires-Dist: pytest; extra == 'test'
 Requires-Dist: pytest-cov; extra == 'test'
@@ -96,11 +96,11 @@ Requires-Dist: boltons; extra == 'test-min'
 Requires-Dist: dask[array]!=2024.8.*,!=2024.9.*,!=2025.2.*,!=2025.3.*,!=2025.4.*,!=2025.5.*,!=2025.6.*,!=2025.7.*,!=2025.8.*,>=2023.5.1; extra == 'test-min'
 Requires-Dist: dask[distributed]; extra == 'test-min'
 Requires-Dist: filelock; extra == 'test-min'
-Requires-Dist: httpx<1.0; extra == 'test-min'
 Requires-Dist: joblib; extra == 'test-min'
 Requires-Dist: loompy>=3.0.5; extra == 'test-min'
 Requires-Dist: matplotlib; extra == 'test-min'
 Requires-Dist: openpyxl; extra == 'test-min'
+Requires-Dist: pooch; extra == 'test-min'
 Requires-Dist: pyarrow; extra == 'test-min'
 Requires-Dist: pytest; extra == 'test-min'
 Requires-Dist: pytest-cov; extra == 'test-min'
```
{anndata-0.12.7.dist-info → anndata-0.12.8.dist-info}/RECORD
CHANGED
```diff
@@ -13,45 +13,45 @@ anndata/_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 anndata/_core/access.py,sha256=pts7fGUKgGZANSsu_qAA7L10qHM-jT1zIehbl3441OY,873
 anndata/_core/aligned_df.py,sha256=bM9kkEFURRLeUOUMk90WxVnRC-ZsXGEDx36kDj5gC9I,4278
 anndata/_core/aligned_mapping.py,sha256=BYU1jslMWIhtFTtUMaXY8ZCyt0J4_ZsJTmj6J2yAXTQ,14257
-anndata/_core/anndata.py,sha256=…
+anndata/_core/anndata.py,sha256=s-ExKqeQXuvin9dk0vKa1GLx6tRvrAX5HlvmqRR9uRw,79455
 anndata/_core/extensions.py,sha256=9Rsho6qnr3PJHULrYGiZHCBinBZYJK6zyf3cFsl_gBY,10425
 anndata/_core/file_backing.py,sha256=6DhBfLQPDFDpoe6wSgnOFtpC4Hnbh-UgOPbqvYDxm8g,5603
-anndata/_core/index.py,sha256=…
-anndata/_core/merge.py,sha256=…
+anndata/_core/index.py,sha256=dz2jhrklxsNIDN-q0WhiXhxwtOreK-T8Iate-MGXpH0,13350
+anndata/_core/merge.py,sha256=xtVLQzUIUiIv3seNWtMmURaxkJ1X4LXSzHCP_m_HJCs,59738
 anndata/_core/raw.py,sha256=x_PwwaDQscVQOFJ38kF7sNQ47LxowpS38h2RQfU5Zwo,7925
 anndata/_core/sparse_dataset.py,sha256=R2BeSLiREiwk9FNjdLCR3VfbYatz-7BK0l2F9XqCiTk,27280
 anndata/_core/storage.py,sha256=mHzqp7YBJ-rGQFulMAx__D-Z7y4omHPyb1cP7YxfbFE,2555
 anndata/_core/views.py,sha256=-tiUwugw0bRYXzewruhU0xXT7nnDLdYf4CiFByLl34w,15067
-anndata/_core/xarray.py,sha256=…
+anndata/_core/xarray.py,sha256=XKpmkp9IyMuMAFI9ho5PoKKQSx9vX-Gau_k6moHJQ5w,18095
 anndata/_io/__init__.py,sha256=GTNeUZ8d8aA3sK4P33tyljIc60KapLbkqBC6J1y3l9U,346
-anndata/_io/h5ad.py,sha256=…
+anndata/_io/h5ad.py,sha256=IPM2WrS6Xg2-LRkya3uVeNdCBcjQlxEFKHYdcGItg9c,13986
 anndata/_io/read.py,sha256=Z0QdFkaaXmGo5a25O9N9Ej2v8U7b9oV9Umw98YtB5uA,15950
-anndata/_io/utils.py,sha256=…
+anndata/_io/utils.py,sha256=RqD5JAiGtfQmxxsmr3GSg4i0Oq2XckK1MhttjdsoFLM,10309
 anndata/_io/write.py,sha256=r55w6yPIIuUSLW9wyYL8GnkzHHQdAxy6xiCEw9cAC38,4811
-anndata/_io/zarr.py,sha256=…
+anndata/_io/zarr.py,sha256=k0hhYKxf7FwgpYUyDFwfpLOrWTjx5BDW1XUX6c8xhcs,5813
 anndata/_io/specs/__init__.py,sha256=Z6l8xqa7B480U3pqrNIg4-fhUvpBW85w4xA3i3maAUM,427
-anndata/_io/specs/lazy_methods.py,sha256=…
-anndata/_io/specs/methods.py,sha256=…
+anndata/_io/specs/lazy_methods.py,sha256=ueV9ICJ87a-mY3fTTaYd98ug7JwOh4dfJY4bQj_HU8c,13055
+anndata/_io/specs/methods.py,sha256=jB0qq-f4UxppfbvhoFAbYVBE6Nz3u-UIfFuqM7BwOiw,46870
 anndata/_io/specs/registry.py,sha256=6Z_ffk3uOIagzRPcDCvEoszcgD-U3n8wYnGiPA71ZeI,17539
-anndata/compat/__init__.py,sha256=…
+anndata/compat/__init__.py,sha256=fvdnMtf7mhkK5nPXvWvQI-H7mWb016sKqVJ4pEVKUL4,15959
 anndata/experimental/__init__.py,sha256=polIxriEkby0iEqw-IXkUzp8k0wp92BpYY4zl4BsHH0,1648
 anndata/experimental/_dispatch_io.py,sha256=gb9JUcgS1cIERjxM1PBpWDXfPkKgMevoLF0QInZfC-g,1858
-anndata/experimental/merge.py,sha256=…
+anndata/experimental/merge.py,sha256=b9rrAtE0t5UzcUulc9mXH9u7RW68p_SYIFPJOqUxSNY,25120
 anndata/experimental/backed/__init__.py,sha256=4dc9M_-_SlfUidDrbWt8PRyD_8bYjypHJ86IpdThHus,230
 anndata/experimental/backed/_compat.py,sha256=rM7CnSJEZCko5wPBFRfvZA9ZKUSpaOVcWFy5u09p1go,519
-anndata/experimental/backed/_io.py,sha256=…
-anndata/experimental/backed/_lazy_arrays.py,sha256=…
+anndata/experimental/backed/_io.py,sha256=fG_KkGVxnqK0VukiMGYHSKasSiurFLKeWqyKftJnblw,6861
+anndata/experimental/backed/_lazy_arrays.py,sha256=mwcovT31AoXdVfoeyoNzmqCXL1SSmKF33hYK8ftUxM0,7509
 anndata/experimental/multi_files/__init__.py,sha256=T7iNLlRbe-KnLT3o7Tb7_nE4Iy_hLkG66UjBOvj2Bj8,107
 anndata/experimental/multi_files/_anncollection.py,sha256=Ra8A4MzyFWlid5RJd0cc2d4SJeSZ2HXz3odKSqAbChw,35264
 anndata/experimental/pytorch/__init__.py,sha256=4CkgrahLO8Kc-s2bmv6lVQfDxbO3IUyV0v4ygBDkttY,95
 anndata/experimental/pytorch/_annloader.py,sha256=7mpsFV5vBfxKIje1cPjahtDZ5afkU-H663XB4FJhmok,8075
 anndata/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-anndata/tests/helpers.py,sha256=…
+anndata/tests/helpers.py,sha256=9Tpt8QMZig7ggMFQGI2hDZE2u6IxQtVOEJXDZB3j1Ao,37638
 testing/anndata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 testing/anndata/_doctest.py,sha256=Qew0N0zLLNiPKN1CLunqY5cTinFLaEhY5GagiYfm6KI,344
 testing/anndata/_pytest.py,sha256=C_R-N2x9NHKZ66YLkvMLWkXQG1WiouOkBnLQpYx_62Q,3994
 testing/anndata/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-anndata-0.12.7.dist-info/METADATA,sha256=…
-anndata-0.12.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-anndata-0.12.7.dist-info/licenses/LICENSE,sha256=VcrXoEVMhtNuvMvKYGP-I5lMT8qZ_6dFf22fsL180qA,1575
-anndata-0.12.7.dist-info/RECORD,,
+anndata-0.12.8.dist-info/METADATA,sha256=rlyz7u-gJSRMTRJSDWO-wGdsBzt7L59gVK__C_BAqko,9931
+anndata-0.12.8.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+anndata-0.12.8.dist-info/licenses/LICENSE,sha256=VcrXoEVMhtNuvMvKYGP-I5lMT8qZ_6dFf22fsL180qA,1575
+anndata-0.12.8.dist-info/RECORD,,
```
{anndata-0.12.7.dist-info → anndata-0.12.8.dist-info}/WHEEL
File without changes

{anndata-0.12.7.dist-info → anndata-0.12.8.dist-info}/licenses/LICENSE
File without changes