lamindb 0.74.3__py3-none-any.whl → 0.75.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/_artifact.py +85 -43
- lamindb/_can_validate.py +100 -35
- lamindb/_collection.py +36 -28
- lamindb/_curate.py +432 -181
- lamindb/_feature_set.py +5 -5
- lamindb/_filter.py +3 -3
- lamindb/_finish.py +29 -23
- lamindb/_from_values.py +47 -66
- lamindb/_is_versioned.py +1 -1
- lamindb/_parents.py +38 -13
- lamindb/_record.py +41 -42
- lamindb/_save.py +7 -7
- lamindb/_transform.py +27 -16
- lamindb/_view.py +13 -11
- lamindb/core/__init__.py +2 -0
- lamindb/core/_data.py +18 -20
- lamindb/core/_feature_manager.py +50 -50
- lamindb/core/_label_manager.py +17 -19
- lamindb/core/_mapped_collection.py +1 -1
- lamindb/core/_run_context.py +6 -8
- lamindb/core/datasets/_core.py +7 -7
- lamindb/core/exceptions.py +11 -0
- lamindb/core/schema.py +5 -5
- lamindb/core/storage/__init__.py +12 -2
- lamindb/core/storage/_anndata_accessor.py +735 -0
- lamindb/core/storage/_backed_access.py +77 -747
- lamindb/core/storage/_valid_suffixes.py +16 -2
- lamindb/core/storage/paths.py +9 -14
- lamindb/core/types.py +3 -0
- lamindb/core/versioning.py +1 -1
- lamindb/integrations/__init__.py +1 -0
- lamindb/integrations/_vitessce.py +68 -31
- {lamindb-0.74.3.dist-info → lamindb-0.75.1.dist-info}/METADATA +5 -5
- lamindb-0.75.1.dist-info/RECORD +58 -0
- lamindb-0.74.3.dist-info/RECORD +0 -57
- {lamindb-0.74.3.dist-info → lamindb-0.75.1.dist-info}/LICENSE +0 -0
- {lamindb-0.74.3.dist-info → lamindb-0.75.1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,735 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import inspect
|
4
|
+
from functools import cached_property
|
5
|
+
from itertools import chain
|
6
|
+
from typing import TYPE_CHECKING, Callable, Literal, Mapping, Union
|
7
|
+
|
8
|
+
import h5py
|
9
|
+
import numpy as np
|
10
|
+
import pandas as pd
|
11
|
+
from anndata import AnnData
|
12
|
+
from anndata import __version__ as anndata_version
|
13
|
+
from anndata._core.index import Index, _normalize_indices
|
14
|
+
from anndata._core.views import _resolve_idx
|
15
|
+
from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5
|
16
|
+
from anndata._io.specs.registry import get_spec, read_elem, read_elem_partial
|
17
|
+
from anndata.compat import _read_attr
|
18
|
+
from fsspec.implementations.local import LocalFileSystem
|
19
|
+
from lamin_utils import logger
|
20
|
+
from lamindb_setup.core.upath import UPath, create_mapper, infer_filesystem
|
21
|
+
from packaging import version
|
22
|
+
|
23
|
+
if TYPE_CHECKING:
|
24
|
+
from pathlib import Path
|
25
|
+
|
26
|
+
from fsspec.core import OpenFile
|
27
|
+
from lamindb_setup.core.types import UPathStr
|
28
|
+
|
29
|
+
|
30
|
+
# Parsed once; every compatibility check below compares against this value.
anndata_version_parse = version.parse(anndata_version)

if anndata_version_parse >= version.parse("0.10.0"):
    # Newer anndata: the sparse classes and the factory are imported directly.
    from anndata._core.sparse_dataset import (  # type: ignore
        BaseCompressedSparseDataset as SparseDataset,
        CSRDataset,
        sparse_dataset,
    )

    def _check_group_format(*args):
        """Do nothing, whatever arguments are passed."""
        return None

    # Replace the class-level format check with the no-op above.
    # NOTE(review): presumably so that groups without encoding metadata can
    # still be wrapped in CSRDataset -- confirm against anndata.
    CSRDataset._check_group_format = _check_group_format

else:
    if anndata_version_parse < version.parse("0.9.1"):
        logger.warning(
            "Full backed capabilities are not available for this version of anndata,"
            " please install anndata>=0.9.1."
        )

    from anndata._core.sparse_dataset import SparseDataset

    # try csr for groups with no encoding_type
    class CSRDataset(SparseDataset):
        """SparseDataset whose format string is always ``"csr"``."""

        @property
        def format_str(self) -> str:
            return "csr"

    def sparse_dataset(group):
        """Wrap ``group`` in a ``SparseDataset``."""
        return SparseDataset(group)
|
63
|
+
|
64
|
+
|
65
|
+
# zarr and CSRDataset have problems with full selection
|
66
|
+
def _subset_sparse(sparse_ds: CSRDataset | SparseDataset, indices):
|
67
|
+
has_arrays = isinstance(indices[0], np.ndarray) or isinstance(
|
68
|
+
indices[1], np.ndarray
|
69
|
+
)
|
70
|
+
if not has_arrays and indices == (slice(None), slice(None)):
|
71
|
+
return sparse_ds.to_memory()
|
72
|
+
else:
|
73
|
+
return sparse_ds[indices]
|
74
|
+
|
75
|
+
|
76
|
+
def get_module_name(obj):
    """Return the top-level package name of the module ``obj`` belongs to.

    ``inspect.getmodule`` returns ``None`` for objects that carry no module
    information of their own (for example slices, tuples and other instances
    of built-in types). In that case the module of the object's type is used,
    so the function always returns a string instead of raising
    ``AttributeError``.
    """
    module = inspect.getmodule(obj)
    if module is not None:
        qualified_name = module.__name__
    else:
        qualified_name = getattr(type(obj), "__module__", None) or ""
    # keep only the part before the first dot, e.g. "h5py._hl.dataset" -> "h5py"
    return qualified_name.partition(".")[0]
|
78
|
+
|
79
|
+
|
80
|
+
def _records_to_df(obj):
|
81
|
+
if isinstance(obj, pd.DataFrame):
|
82
|
+
return obj
|
83
|
+
|
84
|
+
if hasattr(obj, "dtype") and obj.dtype.names is not None:
|
85
|
+
formats = []
|
86
|
+
for name, (dt, _) in obj.dtype.fields.items():
|
87
|
+
if dt.char == "S":
|
88
|
+
new_dt = str(dt).replace("S", "U")
|
89
|
+
else:
|
90
|
+
new_dt = dt
|
91
|
+
formats.append((name, new_dt))
|
92
|
+
df = pd.DataFrame(obj.astype(formats, copy=False))
|
93
|
+
for index_name in ("index", "_index"):
|
94
|
+
if index_name in df.columns:
|
95
|
+
return df.set_index(index_name)
|
96
|
+
return df
|
97
|
+
else:
|
98
|
+
return obj
|
99
|
+
|
100
|
+
|
101
|
+
class AccessRegistry:
    """Registry of storage-backend-specific functions.

    Openers are stored per module name and called through :meth:`open`.
    Other functions are stored per function name and module name; accessing
    ``registry.<func_name>`` returns a dispatcher that picks the
    implementation whose module name matches one of the call arguments.
    """

    def __init__(self):
        # {func_name: {module_name: func}}
        self._registry = {}
        # {module_name: opener}
        self._openers = {}

    def register_open(self, module: str):
        """Return a decorator registering a function as the opener for ``module``."""

        def wrapper(func: Callable):
            self._openers[module] = func
            return func

        return wrapper

    def open(self, module: str, *args, **kwargs):
        """Call the opener registered for ``module`` with the given arguments.

        Raises:
            ValueError: If no opener is registered for ``module``.
        """
        if module in self._openers:
            return self._openers[module](*args, **kwargs)
        else:
            raise ValueError(f"Module {module} not found, please install it.")

    def register(self, module: str):
        """Return a decorator registering a function under its name for ``module``."""

        def wrapper(func: Callable):
            self._registry.setdefault(func.__name__, {})[module] = func
            return func

        return wrapper

    def __getattr__(self, func_name: str):
        """Return a dispatcher for the registered function ``func_name``.

        Raises:
            AttributeError: If nothing is registered under ``func_name``. This
                keeps ``hasattr`` and dunder probes (copy, pickle) truthful.
        """
        # __getattr__ only runs when normal lookup fails. Going through
        # __dict__ avoids infinite recursion if _registry is not set yet.
        func_registry = self.__dict__.get("_registry", {}).get(func_name)
        if func_registry is None:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {func_name!r}"
            )

        def wrapper(*args, **kwargs):
            for arg in chain(args, kwargs.values()):
                try:
                    arg_module = get_module_name(arg)
                except AttributeError:
                    # the module of this argument cannot be determined;
                    # it cannot select an implementation, so try the next one
                    continue
                if arg_module in func_registry:
                    return func_registry[arg_module](*args, **kwargs)
            raise ValueError(f"{func_name} is not registered for this module.")

        return wrapper
|
139
|
+
|
140
|
+
|
141
|
+
# storage specific functions should be registered and called through the registry
|
142
|
+
registry = AccessRegistry()
|
143
|
+
|
144
|
+
|
145
|
+
@registry.register_open("h5py")
def open(filepath: UPathStr, mode: str = "r"):
    """Open an HDF5 file with h5py.

    Args:
        filepath: Location of the file; its filesystem is inferred.
        mode: h5py file mode. Local files accept "r", "r+", "a", "w" and "w-";
            other filesystems accept only "r", "w" and "a".

    Returns:
        A ``(conn, storage)`` pair. ``conn`` is ``None`` for local files and
        the object returned by ``fs.open`` otherwise; ``storage`` is the
        ``h5py.File``.

    Raises:
        ValueError: If ``mode`` is not supported on a non-local filesystem.
    """
    fs, file_path_str = infer_filesystem(filepath)

    if isinstance(fs, LocalFileSystem):
        assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!"  # noqa: S101
        return None, h5py.File(file_path_str, mode=mode)

    # the underlying connection is always opened in binary mode
    binary_modes = {"r": "rb", "w": "wb", "a": "ab"}
    if mode not in binary_modes:
        raise ValueError(f"Unknown mode {mode}! Should be 'r', 'w' or 'a'.")

    conn = fs.open(file_path_str, mode=binary_modes[mode])
    try:
        storage = h5py.File(conn, mode=mode)
    except Exception:
        # do not leak the connection when h5py cannot open the file
        conn.close()
        raise
    return conn, storage
|
166
|
+
|
167
|
+
|
168
|
+
@registry.register("h5py")
def read_dataframe(elem: h5py.Dataset | h5py.Group):
    """Read a dataframe stored as an h5py dataset or group.

    A dataset is read with the legacy reader; anything else goes through
    ``read_elem``.
    """
    reader = read_dataframe_legacy_h5 if isinstance(elem, h5py.Dataset) else read_elem
    return reader(elem)
|
174
|
+
|
175
|
+
|
176
|
+
@registry.register("h5py")
def safer_read_partial(elem, indices):
    """Read the part of an h5py element selected by ``indices``.

    Args:
        elem: An ``h5py.Dataset`` or ``h5py.Group``.
        indices: Per-axis selectors, indexed as ``indices[0]`` and
            ``indices[1]``; each is a slice or a numpy array.

    Returns:
        The selected data. When integer index arrays had to be sorted and
        de-duplicated for the read, the result is re-indexed so that it
        follows the order originally requested.

    Raises:
        ValueError: If ``elem`` has an empty encoding type and no subset
            could be read from it.
    """

    def _is_full_slice(idx) -> bool:
        # The isinstance guard matters: `ndarray == slice(None)` is evaluated
        # elementwise by numpy, and the truth value of the resulting array is
        # ambiguous (raises) for arrays with more than one element.
        return isinstance(idx, slice) and idx == slice(None)

    is_dataset = isinstance(elem, h5py.Dataset)
    indices_inverse: list | None = None
    encoding_type = get_spec(elem).encoding_type
    # h5py selection for datasets requires sorted indices
    if is_dataset or encoding_type == "dataframe":
        indices_increasing = []
        indices_inverse = []
        for indices_dim in indices:
            # should be integer or bool
            # ignore bool or increasing unique integers
            if (
                isinstance(indices_dim, np.ndarray)
                and indices_dim.dtype != "bool"
                and not np.all(np.diff(indices_dim) > 0)
            ):
                # read with sorted unique indices, remember how to undo it
                idx_unique, idx_inverse = np.unique(indices_dim, return_inverse=True)
                indices_increasing.append(idx_unique)
                indices_inverse.append(idx_inverse)
            else:
                indices_increasing.append(indices_dim)
                indices_inverse.append(None)
        indices = tuple(indices_increasing)
        if all(idx is None for idx in indices_inverse):
            indices_inverse = None

    result = None
    if encoding_type == "":
        if is_dataset:
            dims = len(elem.shape)
            if dims == 2:
                result = elem[indices]
            elif dims == 1:
                # a 1-D dataset is subset along whichever axis is not a full slice
                if _is_full_slice(indices[0]):
                    result = elem[indices[1]]
                elif _is_full_slice(indices[1]):
                    result = elem[indices[0]]
        elif isinstance(elem, h5py.Group):
            try:
                ds = CSRDataset(elem)
                result = _subset_sparse(ds, indices)
            except Exception as e:
                logger.debug(
                    f"Encountered an exception while attempting to subset a sparse dataset by indices.\n{e}"
                )
        if result is None:
            raise ValueError(
                "Can not get a subset of the element of type"
                f" {type(elem).__name__} with an empty spec."
            )
    else:
        result = read_elem_partial(elem, indices=indices)

    if indices_inverse is None:
        return result

    # restore the requested order (and duplicates) along the re-sorted axes
    if indices_inverse[0] is None:
        if len(result.shape) == 2:
            return result[:, indices_inverse[1]]
        else:
            return result[indices_inverse[1]]
    elif indices_inverse[1] is None:
        if isinstance(result, pd.DataFrame):
            return result.iloc[indices_inverse[0]]
        else:
            return result[indices_inverse[0]]
    else:
        return result[tuple(indices_inverse)]
|
243
|
+
|
244
|
+
|
245
|
+
@registry.register("h5py")
def keys(storage: h5py.File):
    """Collect the keys of every top-level entry of ``storage`` except ``"X"``.

    Returns:
        A dict mapping each entry name to the list of its keys. For ``obs``
        and ``var`` stored as a dataset, the keys are the dtype field names.
        Entries without keys are left out.
    """
    collected: dict[str, list] = {}
    for name in storage.keys():
        if name == "X":
            continue
        node = storage[name]
        stored_as_dataset = name in ("obs", "var") and isinstance(node, h5py.Dataset)
        # a dataset has no children; its columns are the fields of its dtype
        source = node.dtype.fields if stored_as_dataset else node
        members = list(source.keys())
        if len(members) > 0:
            collected[name] = members
    return collected
|
259
|
+
|
260
|
+
|
261
|
+
# Backend classes recognised as arrays, groups and storage roots. They start
# with the h5py classes and are kept as lists so that other backends can be
# appended further down.
ArrayTypes, GroupTypes, StorageTypes = [h5py.Dataset], [h5py.Group], [h5py.File]


# zarr is optional: record whether it could be imported.
try:
    import zarr
except ImportError:
    ZARR_INSTALLED = False
else:
    ZARR_INSTALLED = True
|
273
|
+
|
274
|
+
if ZARR_INSTALLED:
    from anndata._io.zarr import read_dataframe_legacy as read_dataframe_legacy_zarr

    # make the zarr classes known to the isinstance checks used in this module
    ArrayTypes.append(zarr.Array)
    GroupTypes.append(zarr.Group)
    StorageTypes.append(zarr.Group)

    @registry.register_open("zarr")
    def open(filepath: UPathStr, mode: Literal["r", "r+", "a", "w", "w-"] = "r"):
        """Open a zarr store.

        Returns:
            A ``(conn, storage)`` pair where ``conn`` is always ``None`` and
            ``storage`` is the object returned by ``zarr.open``.
        """
        assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!"  # noqa: S101

        fs, file_path_str = infer_filesystem(filepath)
        conn = None
        if isinstance(fs, LocalFileSystem):
            # this is faster than through an fsspec mapper for local
            open_obj = file_path_str
        else:
            open_obj = create_mapper(fs, file_path_str, check=True)
        storage = zarr.open(open_obj, mode=mode)
        return conn, storage

    @registry.register("zarr")
    def read_dataframe(elem: Union[zarr.Array, zarr.Group]):  # noqa
        """Read a dataframe stored as a zarr array (legacy reader) or group."""
        if isinstance(elem, zarr.Array):
            return read_dataframe_legacy_zarr(elem)
        else:
            return read_elem(elem)

    @registry.register("zarr")
    def safer_read_partial(elem, indices):
        """Read the part of a zarr element selected by ``indices``.

        Args:
            elem: A ``zarr.Array`` or ``zarr.Group``.
            indices: Per-axis selectors, indexed as ``indices[0]`` and
                ``indices[1]``; each is a slice or a numpy array.

        Raises:
            ValueError: If ``elem`` has an empty encoding type and no subset
                could be read from it.
        """

        def _is_full_slice(idx) -> bool:
            # The isinstance guard matters: `ndarray == slice(None)` is
            # evaluated elementwise by numpy, and the truth value of the
            # resulting array is ambiguous (raises) for multi-element arrays.
            return isinstance(idx, slice) and idx == slice(None)

        encoding_type = get_spec(elem).encoding_type
        if encoding_type == "":
            if isinstance(elem, zarr.Array):
                dims = len(elem.shape)
                if dims == 2:
                    return elem.oindex[indices]
                elif dims == 1:
                    # a 1-D array is subset along whichever axis is not a full slice
                    if _is_full_slice(indices[0]):
                        return elem.oindex[indices[1]]
                    elif _is_full_slice(indices[1]):
                        return elem.oindex[indices[0]]
            elif isinstance(elem, zarr.Group):
                try:
                    ds = CSRDataset(elem)
                    return _subset_sparse(ds, indices)
                except Exception as e:
                    logger.debug(
                        f"Encountered an exception while attempting to subset a sparse dataset by indices.\n{e}"
                    )
            # reached when none of the branches above returned
            raise ValueError(
                "Can not get a subset of the element of type"
                f" {type(elem).__name__} with an empty spec."
            )
        else:
            if encoding_type in ("csr_matrix", "csc_matrix"):
                ds = sparse_dataset(elem)
                return _subset_sparse(ds, indices)
            else:
                return read_elem_partial(elem, indices=indices)

    # this is needed because accessing zarr.Group.keys() directly is very slow
    @registry.register("zarr")
    def keys(storage: zarr.Group):
        """Collect the keys of every top-level entry of ``storage`` except ``"X"``.

        The keys are derived from the paths of the underlying store.

        Returns:
            A dict mapping each entry name to the list of its keys. For ``obs``
            and ``var`` stored as an array, the keys are the dtype field names.
            Entries without keys are left out.
        """
        paths = storage._store.keys()

        attrs_keys: dict[str, list] = {}
        # names among "obs"/"var" that are stored as an array, not a group
        obs_var_arrays = []

        for path in paths:
            if path in (".zattrs", ".zgroup"):
                continue
            parts = path.split("/")
            if len(parts) < 2:
                continue
            attr = parts[0]
            key = parts[1]

            if attr == "X":
                continue

            if attr in ("obs", "var"):
                if attr in obs_var_arrays:
                    continue
                if key == ".zarray":
                    # "<attr>/.zarray" marks the entry itself as an array;
                    # drop whatever was collected for it so far
                    attrs_keys.pop(attr, None)
                    obs_var_arrays.append(attr)

            if attr not in attrs_keys:
                attrs_keys[attr] = []

            if key in (".zattrs", ".zgroup", ".zarray"):
                continue
            attr_keys = attrs_keys[attr]
            if key not in attr_keys:
                attr_keys.append(key)

        for attr in obs_var_arrays:
            attrs_keys[attr] = list(storage[attr].dtype.fields.keys())

        return {attr: keys for attr, keys in attrs_keys.items() if len(keys) > 0}
|
374
|
+
|
375
|
+
|
376
|
+
# Freeze the backend class lists into tuples, the form isinstance() accepts.
ArrayTypes, GroupTypes, StorageTypes = (  # type: ignore
    tuple(ArrayTypes),
    tuple(GroupTypes),
    tuple(StorageTypes),
)


# Union aliases built from the same classes.
ArrayType = Union[ArrayTypes]  # type: ignore
GroupType = Union[GroupTypes]  # type: ignore
StorageType = Union[StorageTypes]  # type: ignore
|
384
|
+
|
385
|
+
|
386
|
+
def _to_memory(elem):
    """Load ``elem`` into memory if it is a backed array or sparse dataset.

    Backend arrays are read in full with ``elem[()]``, sparse datasets with
    ``to_memory()``; any other object is returned as is.
    """
    if isinstance(elem, ArrayTypes):
        return elem[()]
    if isinstance(elem, SparseDataset):
        return elem.to_memory()
    return elem
|
393
|
+
|
394
|
+
|
395
|
+
def _try_backed_full(elem):
    """Return ``elem`` in backed form where possible, otherwise read it.

    Backend arrays are returned untouched. Groups that look like sparse
    matrices are wrapped in a sparse dataset. Everything else is read with
    ``read_elem``.
    """
    # think what to do for compatibility with old var and obs
    if isinstance(elem, ArrayTypes):
        return elem

    if isinstance(elem, GroupTypes):
        encoding_type = get_spec(elem).encoding_type
        declared_sparse = encoding_type in ("csr_matrix", "csc_matrix")
        if declared_sparse or "h5sparse_format" in elem.attrs:
            return sparse_dataset(elem)
        # no encoding declared but an "indptr" member is present: try csr
        if encoding_type == "" and "indptr" in elem:
            return CSRDataset(elem)

    return read_elem(elem)
|
410
|
+
|
411
|
+
|
412
|
+
def _safer_read_index(elem):
    """Read the index of a stored dataframe as a ``pandas.Index``.

    For groups, the name of the index member is read from the ``_index``
    attribute. For arrays, the field named ``"index"`` or ``"_index"``
    (checked in that order) is used and byte strings are decoded as UTF-8.

    Raises:
        ValueError: If an array has no index field, or if ``elem`` is neither
            a known group type nor a known array type.
    """
    if isinstance(elem, GroupTypes):
        return pd.Index(read_elem(elem[_read_attr(elem.attrs, "_index")]))
    elif isinstance(elem, ArrayTypes):
        # dtype.names is None for non-structured arrays
        field_names = elem.dtype.names or ()
        indices = None
        for index_name in ("index", "_index"):
            if index_name in field_names:
                indices = elem[index_name]
                break
        if indices is None:
            raise ValueError("Indices not found.")
        # An index field that exists but is empty is a valid, empty index;
        # only peek at the first element when there is one.
        if len(indices) > 0 and isinstance(indices[0], bytes):
            indices = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)(indices)
        return pd.Index(indices)
    else:
        raise ValueError(f"Unknown elem type {type(elem)} when reading indices.")
|
429
|
+
|
430
|
+
|
431
|
+
class _MapAccessor:
|
432
|
+
def __init__(self, elem, name, indices=None):
|
433
|
+
self.elem = elem
|
434
|
+
self.indices = indices
|
435
|
+
self.name = name
|
436
|
+
|
437
|
+
def __getitem__(self, key):
|
438
|
+
if self.indices is None:
|
439
|
+
return _try_backed_full(self.elem[key])
|
440
|
+
else:
|
441
|
+
return registry.safer_read_partial(self.elem[key], indices=self.indices)
|
442
|
+
|
443
|
+
def keys(self):
|
444
|
+
return list(self.elem.keys())
|
445
|
+
|
446
|
+
def __repr__(self):
|
447
|
+
"""Description of the _MapAccessor object."""
|
448
|
+
descr = f"Accessor for the AnnData attribute {self.name}"
|
449
|
+
descr += f"\n with keys: {self.keys()}"
|
450
|
+
return descr
|
451
|
+
|
452
|
+
|
453
|
+
class _AnnDataAttrsMixin:
|
454
|
+
storage: StorageType
|
455
|
+
_attrs_keys: Mapping[str, list]
|
456
|
+
|
457
|
+
@cached_property
|
458
|
+
def obs(self) -> pd.DataFrame:
|
459
|
+
if "obs" not in self._attrs_keys:
|
460
|
+
return None
|
461
|
+
indices = getattr(self, "indices", None)
|
462
|
+
if indices is not None:
|
463
|
+
indices = (indices[0], slice(None))
|
464
|
+
obj = registry.safer_read_partial(self.storage["obs"], indices=indices) # type: ignore
|
465
|
+
return _records_to_df(obj)
|
466
|
+
else:
|
467
|
+
return registry.read_dataframe(self.storage["obs"]) # type: ignore
|
468
|
+
|
469
|
+
@cached_property
|
470
|
+
def var(self) -> pd.DataFrame:
|
471
|
+
if "var" not in self._attrs_keys:
|
472
|
+
return None
|
473
|
+
indices = getattr(self, "indices", None)
|
474
|
+
if indices is not None:
|
475
|
+
indices = (indices[1], slice(None))
|
476
|
+
obj = registry.safer_read_partial(self.storage["var"], indices=indices) # type: ignore
|
477
|
+
return _records_to_df(obj)
|
478
|
+
else:
|
479
|
+
return registry.read_dataframe(self.storage["var"]) # type: ignore
|
480
|
+
|
481
|
+
@cached_property
|
482
|
+
def uns(self):
|
483
|
+
if "uns" not in self._attrs_keys:
|
484
|
+
return None
|
485
|
+
return read_elem(self.storage["uns"])
|
486
|
+
|
487
|
+
@cached_property
|
488
|
+
def X(self):
|
489
|
+
indices = getattr(self, "indices", None)
|
490
|
+
if indices is not None:
|
491
|
+
return registry.safer_read_partial(self.storage["X"], indices=indices)
|
492
|
+
else:
|
493
|
+
return _try_backed_full(self.storage["X"])
|
494
|
+
|
495
|
+
@cached_property
|
496
|
+
def obsm(self):
|
497
|
+
if "obsm" not in self._attrs_keys:
|
498
|
+
return None
|
499
|
+
indices = getattr(self, "indices", None)
|
500
|
+
if indices is not None:
|
501
|
+
indices = (indices[0], slice(None))
|
502
|
+
return _MapAccessor(self.storage["obsm"], "obsm", indices)
|
503
|
+
|
504
|
+
@cached_property
|
505
|
+
def varm(self):
|
506
|
+
if "varm" not in self._attrs_keys:
|
507
|
+
return None
|
508
|
+
indices = getattr(self, "indices", None)
|
509
|
+
if indices is not None:
|
510
|
+
indices = (indices[1], slice(None))
|
511
|
+
return _MapAccessor(self.storage["varm"], "varm", indices)
|
512
|
+
|
513
|
+
@cached_property
|
514
|
+
def obsp(self):
|
515
|
+
if "obsp" not in self._attrs_keys:
|
516
|
+
return None
|
517
|
+
indices = getattr(self, "indices", None)
|
518
|
+
if indices is not None:
|
519
|
+
indices = (indices[0], indices[0])
|
520
|
+
return _MapAccessor(self.storage["obsp"], "obsp", indices)
|
521
|
+
|
522
|
+
@cached_property
|
523
|
+
def varp(self):
|
524
|
+
if "varp" not in self._attrs_keys:
|
525
|
+
return None
|
526
|
+
indices = getattr(self, "indices", None)
|
527
|
+
if indices is not None:
|
528
|
+
indices = (indices[1], indices[1])
|
529
|
+
return _MapAccessor(self.storage["varp"], "varp", indices)
|
530
|
+
|
531
|
+
@cached_property
|
532
|
+
def layers(self):
|
533
|
+
if "layers" not in self._attrs_keys:
|
534
|
+
return None
|
535
|
+
indices = getattr(self, "indices", None)
|
536
|
+
return _MapAccessor(self.storage["layers"], "layers", indices)
|
537
|
+
|
538
|
+
@property
|
539
|
+
def obs_names(self):
|
540
|
+
return self._obs_names
|
541
|
+
|
542
|
+
@property
|
543
|
+
def var_names(self):
|
544
|
+
return self._var_names
|
545
|
+
|
546
|
+
@cached_property
|
547
|
+
def shape(self):
|
548
|
+
return len(self._obs_names), len(self._var_names)
|
549
|
+
|
550
|
+
def to_dict(self):
|
551
|
+
prepare_adata = {}
|
552
|
+
|
553
|
+
prepare_adata["X"] = _to_memory(self.X)
|
554
|
+
|
555
|
+
if "uns" in self._attrs_keys:
|
556
|
+
prepare_adata["uns"] = self.uns
|
557
|
+
|
558
|
+
for attr in ("obs", "var"):
|
559
|
+
if attr in self._attrs_keys:
|
560
|
+
prepare_adata[attr] = getattr(self, attr)
|
561
|
+
|
562
|
+
for attr in ("obsm", "varm", "obsp", "varp", "layers"):
|
563
|
+
if attr in self._attrs_keys:
|
564
|
+
prepare_adata[attr] = {}
|
565
|
+
get_attr = getattr(self, attr)
|
566
|
+
for key in self._attrs_keys[attr]:
|
567
|
+
prepare_adata[attr][key] = _to_memory(get_attr[key])
|
568
|
+
|
569
|
+
if "raw" in self._attrs_keys:
|
570
|
+
prepare_adata["raw"] = self.raw.to_dict()
|
571
|
+
|
572
|
+
return prepare_adata
|
573
|
+
|
574
|
+
def to_memory(self):
|
575
|
+
adata = AnnData(**self.to_dict())
|
576
|
+
return adata
|
577
|
+
|
578
|
+
|
579
|
+
class AnnDataAccessorSubset(_AnnDataAttrsMixin):
    """A lazily evaluated subset of a backed AnnData object.

    ``indices`` is the selection relative to the full stored object and
    ``ref_shape`` is the shape those indices refer to.
    """

    def __init__(self, storage, indices, attrs_keys, obs_names, var_names, ref_shape):
        self.storage = storage
        self.indices = indices

        self._attrs_keys = attrs_keys
        self._obs_names = obs_names
        self._var_names = var_names

        self._ref_shape = ref_shape

    def __getitem__(self, index: Index):
        """Access a subset of the underlying AnnData object."""
        obs_sel, var_sel = _normalize_indices(index, self._obs_names, self._var_names)
        subset_obs_names = self._obs_names[obs_sel]
        subset_var_names = self._var_names[var_sel]
        if self.indices is not None:
            # express the new selection relative to the full stored object
            obs_sel = _resolve_idx(self.indices[0], obs_sel, self._ref_shape[0])
            var_sel = _resolve_idx(self.indices[1], var_sel, self._ref_shape[1])
        return type(self)(
            self.storage,
            (obs_sel, var_sel),
            self._attrs_keys,
            subset_obs_names,
            subset_var_names,
            self._ref_shape,
        )

    def __repr__(self):
        """Description of the object."""
        n_obs, n_vars = self.shape
        header = f"{type(self).__name__} object with n_obs × n_vars = {n_obs} × {n_vars}"
        details = "".join(
            f"\n {attr}: {keys}" for attr, keys in self._attrs_keys.items()
        )
        return header + details

    @cached_property
    def raw(self):
        if "raw" not in self._attrs_keys:
            return None

        raw_indices = None
        if self.indices is not None:
            obs_sel = self.indices[0]
            # arrays are never compared with a slice; only a full slice
            # means that no subsetting along obs is needed
            takes_everything = not isinstance(obs_sel, np.ndarray) and obs_sel == slice(None)
            if not takes_everything:
                raw_indices = (obs_sel, slice(None))

        return AnnDataRawAccessor(
            self.storage["raw"],
            raw_indices,
            None,
            self._obs_names,
            None,
            self._ref_shape[0],
        )
|
630
|
+
|
631
|
+
|
632
|
+
class AnnDataRawAccessor(AnnDataAccessorSubset):
    """Accessor for the ``raw`` part of a backed AnnData object.

    Missing ``var_names`` and ``attrs_keys`` are derived from the raw storage
    itself, and an incomplete ``ref_shape`` is completed with the number of
    raw variables.
    """

    def __init__(
        self, storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
    ):
        var_raw = storage_raw["var"]

        if var_names is None:
            var_names = _safer_read_index(var_raw)

        # ref_shape may arrive as the number of observations only
        if isinstance(ref_shape, int):
            ref_shape = (ref_shape, len(var_names))
        elif isinstance(ref_shape, tuple) and len(ref_shape) < 2:
            ref_shape = (ref_shape[0], len(var_names))

        if attrs_keys is None:
            # for some reason list(var_raw.keys()) is very slow for zarr
            # maybe also directly get keys from the underlying mapper
            var_keys = (
                list(var_raw.dtype.fields.keys())
                if isinstance(var_raw, ArrayTypes)
                else list(var_raw)
            )
            attrs_keys = {"var": var_keys}
            if "varm" in storage_raw:
                varm_keys_raw = list(storage_raw["varm"])
                if len(varm_keys_raw) > 0:
                    attrs_keys["varm"] = varm_keys_raw

        super().__init__(
            storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
        )

    @property
    def raw(self):
        # this accessor already is the raw part; it has no `raw` of its own
        raise AttributeError
|
666
|
+
|
667
|
+
|
668
|
+
class AnnDataAccessor(_AnnDataAttrsMixin):
    """Cloud-backed AnnData."""

    def __init__(
        self,
        connection: OpenFile | None,
        storage: StorageType,
        filename: str,
    ):
        self._conn = connection
        self.storage = storage
        self._name = filename

        self._attrs_keys = registry.keys(self.storage)

        # the obs/var indices are read eagerly
        self._obs_names = _safer_read_index(self.storage["obs"])  # type: ignore
        self._var_names = _safer_read_index(self.storage["var"])  # type: ignore

        self._closed = False

    def close(self):
        """Closes the connection."""
        # the storage is closed first, then the connection underneath it;
        # objects without a close() method are skipped
        storage_close = getattr(getattr(self, "storage", None), "close", None)
        if storage_close is not None:
            storage_close()
        conn_close = getattr(getattr(self, "_conn", None), "close", None)
        if conn_close is not None:
            conn_close()
        self._closed = True

    @property
    def closed(self):
        return self._closed

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __getitem__(self, index: Index) -> AnnDataAccessorSubset:
        """Access a subset of the underlying AnnData object."""
        obs_sel, var_sel = _normalize_indices(index, self._obs_names, self._var_names)
        subset_obs_names = self._obs_names[obs_sel]
        subset_var_names = self._var_names[var_sel]
        return AnnDataAccessorSubset(
            self.storage,
            (obs_sel, var_sel),
            self._attrs_keys,
            subset_obs_names,
            subset_var_names,
            self.shape,
        )

    def __repr__(self):
        """Description of the AnnDataAccessor object."""
        n_obs, n_vars = self.shape
        pieces = [
            f"AnnDataAccessor object with n_obs × n_vars = {n_obs} × {n_vars}",
            f"\n constructed for the AnnData object {self._name}",
        ]
        pieces.extend(f"\n {attr}: {keys}" for attr, keys in self._attrs_keys.items())
        return "".join(pieces)

    @cached_property
    def raw(self):
        if "raw" not in self._attrs_keys:
            return None
        n_obs = self.shape[0]
        return AnnDataRawAccessor(
            self.storage["raw"], None, None, self._obs_names, None, n_obs
        )
|