lamindb 0.48a2__py3-none-any.whl → 0.48.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +15 -24
- lamindb/_context.py +5 -2
- lamindb/_dataset.py +6 -3
- lamindb/_delete.py +6 -6
- lamindb/_feature.py +61 -26
- lamindb/_feature_manager.py +176 -0
- lamindb/_feature_set.py +63 -27
- lamindb/_file.py +120 -76
- lamindb/_from_values.py +88 -28
- lamindb/_label.py +85 -0
- lamindb/_logger.py +1 -1
- lamindb/_manager.py +24 -17
- lamindb/_orm.py +157 -33
- lamindb/_queryset.py +37 -35
- lamindb/_save.py +19 -9
- lamindb/_transform.py +12 -3
- lamindb/_view.py +1 -1
- lamindb/dev/__init__.py +4 -0
- lamindb/dev/_settings.py +1 -1
- lamindb/dev/_view_parents.py +70 -34
- lamindb/dev/datasets/__init__.py +12 -0
- lamindb/dev/datasets/_core.py +116 -65
- lamindb/dev/storage/__init__.py +1 -5
- lamindb/dev/storage/_backed_access.py +505 -379
- lamindb/dev/storage/file.py +3 -1
- {lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/METADATA +10 -8
- lamindb-0.48.1.dist-info/RECORD +42 -0
- lamindb/_category.py +0 -42
- lamindb-0.48a2.dist-info/RECORD +0 -41
- {lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/LICENSE +0 -0
- {lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/WHEEL +0 -0
- {lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/entry_points.txt +0 -0
@@ -1,23 +1,130 @@
|
|
1
|
+
import inspect
|
1
2
|
from dataclasses import dataclass
|
2
3
|
from functools import cached_property
|
3
|
-
from
|
4
|
+
from itertools import chain
|
5
|
+
from pathlib import Path
|
6
|
+
from typing import Callable, Dict, Mapping, Union
|
4
7
|
|
5
8
|
import h5py
|
9
|
+
import numpy as np
|
6
10
|
import pandas as pd
|
7
11
|
from anndata import AnnData
|
8
12
|
from anndata._core.index import Index, _normalize_indices
|
9
13
|
from anndata._core.sparse_dataset import SparseDataset
|
10
14
|
from anndata._core.views import _resolve_idx
|
11
15
|
from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5
|
12
|
-
from anndata._io.specs.methods import read_indices
|
13
16
|
from anndata._io.specs.registry import get_spec, read_elem, read_elem_partial
|
14
17
|
from anndata.compat import _read_attr
|
15
18
|
from fsspec.core import OpenFile
|
16
|
-
from lamindb_setup.dev.upath import infer_filesystem
|
19
|
+
from lamindb_setup.dev.upath import UPath, infer_filesystem
|
17
20
|
from lnschema_core import File
|
18
21
|
|
19
22
|
from lamindb.dev.storage.file import filepath_from_file
|
20
23
|
|
24
|
+
|
25
|
+
def get_module_name(obj):
|
26
|
+
return inspect.getmodule(obj).__name__.partition(".")[0]
|
27
|
+
|
28
|
+
|
29
|
+
class Registry:
|
30
|
+
def __init__(self):
|
31
|
+
self._registry = {}
|
32
|
+
self._openers = {}
|
33
|
+
|
34
|
+
def register_open(self, module: str):
|
35
|
+
def wrapper(func: Callable):
|
36
|
+
self._openers[module] = func
|
37
|
+
return func
|
38
|
+
|
39
|
+
return wrapper
|
40
|
+
|
41
|
+
def open(self, module: str, *args, **kwargs):
|
42
|
+
if module in self._openers:
|
43
|
+
return self._openers[module](*args, **kwargs)
|
44
|
+
else:
|
45
|
+
raise ValueError(f"Module {module} not found, please install it.")
|
46
|
+
|
47
|
+
def register(self, module: str):
|
48
|
+
def wrapper(func: Callable):
|
49
|
+
func_name = func.__name__
|
50
|
+
if func_name not in self._registry:
|
51
|
+
self._registry[func_name] = {}
|
52
|
+
self._registry[func_name][module] = func
|
53
|
+
return func
|
54
|
+
|
55
|
+
return wrapper
|
56
|
+
|
57
|
+
def __getattr__(self, func_name: str):
|
58
|
+
def wrapper(*args, **kwargs):
|
59
|
+
func_registry = self._registry[func_name]
|
60
|
+
for arg in chain(args, kwargs.values()):
|
61
|
+
arg_module = get_module_name(arg)
|
62
|
+
if arg_module in func_registry:
|
63
|
+
return func_registry[arg_module](*args, **kwargs)
|
64
|
+
raise ValueError(f"{func_name} is not registered for this module.")
|
65
|
+
|
66
|
+
return wrapper
|
67
|
+
|
68
|
+
|
69
|
+
# storage specific functions should be registered and called through the registry
|
70
|
+
registry = Registry()
|
71
|
+
|
72
|
+
|
73
|
+
@registry.register_open("h5py")
|
74
|
+
def open(filepath: Union[UPath, Path, str]):
|
75
|
+
fs, file_path_str = infer_filesystem(filepath)
|
76
|
+
conn = fs.open(file_path_str, mode="rb")
|
77
|
+
try:
|
78
|
+
storage = h5py.File(conn, mode="r")
|
79
|
+
except Exception as e:
|
80
|
+
conn.close()
|
81
|
+
raise e
|
82
|
+
return conn, storage
|
83
|
+
|
84
|
+
|
85
|
+
@registry.register("h5py")
|
86
|
+
def read_dataframe(elem: Union[h5py.Dataset, h5py.Group]):
|
87
|
+
if isinstance(elem, h5py.Dataset):
|
88
|
+
return read_dataframe_legacy_h5(elem)
|
89
|
+
else:
|
90
|
+
return read_elem(elem)
|
91
|
+
|
92
|
+
|
93
|
+
@registry.register("h5py")
|
94
|
+
def safer_read_partial(elem, indices):
|
95
|
+
if get_spec(elem).encoding_type == "":
|
96
|
+
if isinstance(elem, h5py.Datatset):
|
97
|
+
return elem[indices]
|
98
|
+
else:
|
99
|
+
raise ValueError(
|
100
|
+
"Can not get a subset of the element of type"
|
101
|
+
f" {type(elem).__name__} with an empty spec."
|
102
|
+
)
|
103
|
+
else:
|
104
|
+
return read_elem_partial(elem, indices=indices)
|
105
|
+
|
106
|
+
|
107
|
+
@registry.register("h5py")
|
108
|
+
def keys(storage: h5py.File):
|
109
|
+
attrs_keys: Dict[str, list] = {}
|
110
|
+
for attr in storage.keys():
|
111
|
+
if attr == "X":
|
112
|
+
continue
|
113
|
+
attr_obj = storage[attr]
|
114
|
+
if attr in ("obs", "var") and isinstance(attr_obj, h5py.Dataset):
|
115
|
+
keys = list(attr_obj.dtype.fields.keys())
|
116
|
+
else:
|
117
|
+
keys = list(attr_obj.keys())
|
118
|
+
if len(keys) > 0:
|
119
|
+
attrs_keys[attr] = keys
|
120
|
+
return attrs_keys
|
121
|
+
|
122
|
+
|
123
|
+
ArrayTypes = [h5py.Dataset]
|
124
|
+
GroupTypes = [h5py.Group]
|
125
|
+
StorageTypes = [h5py.File]
|
126
|
+
|
127
|
+
|
21
128
|
ZARR_INSTALLED = False
|
22
129
|
try:
|
23
130
|
import zarr
|
@@ -27,44 +134,30 @@ except ImportError:
|
|
27
134
|
pass
|
28
135
|
|
29
136
|
if ZARR_INSTALLED:
|
137
|
+
from anndata._io.zarr import read_dataframe_legacy as read_dataframe_legacy_zarr
|
30
138
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
139
|
+
ArrayTypes.append(zarr.Array)
|
140
|
+
GroupTypes.append(zarr.Group)
|
141
|
+
StorageTypes.append(zarr.Group)
|
142
|
+
|
143
|
+
@registry.register_open("zarr")
|
144
|
+
def open(filepath: Union[UPath, Path, str]): # noqa
|
145
|
+
fs, file_path_str = infer_filesystem(filepath)
|
146
|
+
conn = None
|
147
|
+
storage = zarr.open(fs.get_mapper(file_path_str, check=True), mode="r")
|
148
|
+
return conn, storage
|
36
149
|
|
150
|
+
@registry.register("zarr")
|
151
|
+
def read_dataframe(elem: Union[zarr.Array, zarr.Group]): # noqa
|
152
|
+
if isinstance(elem, zarr.Array):
|
37
153
|
return read_dataframe_legacy_zarr(elem)
|
38
|
-
elif isinstance(elem, h5py.Dataset):
|
39
|
-
return read_dataframe_legacy_h5(elem)
|
40
154
|
else:
|
41
155
|
return read_elem(elem)
|
42
156
|
|
43
|
-
|
44
|
-
|
45
|
-
return elem[()]
|
46
|
-
else:
|
47
|
-
return elem
|
48
|
-
|
49
|
-
def _try_backed_full(elem):
|
50
|
-
# think what to do for compatibility with old var and obs
|
51
|
-
if isinstance(elem, (h5py.Dataset, zarr.Array)):
|
52
|
-
return elem
|
53
|
-
|
54
|
-
if isinstance(elem, (h5py.Group, zarr.Group)):
|
55
|
-
encoding_type = get_spec(elem).encoding_type
|
56
|
-
if encoding_type in ("csr_matrix", "csc_matrix"):
|
57
|
-
return SparseDataset(elem)
|
58
|
-
if "h5sparse_format" in elem.attrs:
|
59
|
-
return SparseDataset(elem)
|
60
|
-
|
61
|
-
return read_elem(elem)
|
62
|
-
|
63
|
-
def _safer_read_partial(elem, indices):
|
157
|
+
@registry.register("zarr")
|
158
|
+
def safer_read_partial(elem, indices): # noqa
|
64
159
|
if get_spec(elem).encoding_type == "":
|
65
|
-
if isinstance(elem,
|
66
|
-
return elem[indices]
|
67
|
-
elif isinstance(elem, zarr.Array):
|
160
|
+
if isinstance(elem, zarr.Array):
|
68
161
|
return elem.oindex[indices]
|
69
162
|
else:
|
70
163
|
raise ValueError(
|
@@ -74,302 +167,9 @@ if ZARR_INSTALLED:
|
|
74
167
|
else:
|
75
168
|
return read_elem_partial(elem, indices=indices)
|
76
169
|
|
77
|
-
class _MapAccessor:
|
78
|
-
def __init__(self, elem, name, indices=None):
|
79
|
-
self.elem = elem
|
80
|
-
self.indices = indices
|
81
|
-
self.name = name
|
82
|
-
|
83
|
-
def __getitem__(self, key):
|
84
|
-
if self.indices is None:
|
85
|
-
return _try_backed_full(self.elem[key])
|
86
|
-
else:
|
87
|
-
return _safer_read_partial(self.elem[key], indices=self.indices)
|
88
|
-
|
89
|
-
def keys(self):
|
90
|
-
return list(self.elem.keys())
|
91
|
-
|
92
|
-
def __repr__(self):
|
93
|
-
"""Description of the _MapAccessor object."""
|
94
|
-
descr = f"Accessor for the AnnData attribute {self.name}"
|
95
|
-
descr += f"\n with keys: {self.keys()}"
|
96
|
-
return descr
|
97
|
-
|
98
|
-
class _AnnDataAttrsMixin:
|
99
|
-
storage: Union[h5py.File, zarr.Group]
|
100
|
-
_attrs_keys: Mapping[str, list]
|
101
|
-
|
102
|
-
@cached_property
|
103
|
-
def obs(self) -> pd.DataFrame:
|
104
|
-
if "obs" not in self._attrs_keys:
|
105
|
-
return None
|
106
|
-
indices = getattr(self, "indices", None)
|
107
|
-
if indices is not None:
|
108
|
-
indices = (indices[0], slice(None))
|
109
|
-
return _safer_read_partial(self.storage["obs"], indices=indices)
|
110
|
-
else:
|
111
|
-
return _read_dataframe(self.storage["obs"])
|
112
|
-
|
113
|
-
@cached_property
|
114
|
-
def var(self) -> pd.DataFrame:
|
115
|
-
if "var" not in self._attrs_keys:
|
116
|
-
return None
|
117
|
-
indices = getattr(self, "indices", None)
|
118
|
-
if indices is not None:
|
119
|
-
indices = (indices[1], slice(None))
|
120
|
-
return _safer_read_partial(self.storage["var"], indices=indices)
|
121
|
-
else:
|
122
|
-
return _read_dataframe(self.storage["var"])
|
123
|
-
|
124
|
-
@cached_property
|
125
|
-
def uns(self):
|
126
|
-
if "uns" not in self._attrs_keys:
|
127
|
-
return None
|
128
|
-
return read_elem(self.storage["uns"])
|
129
|
-
|
130
|
-
@cached_property
|
131
|
-
def X(self):
|
132
|
-
indices = getattr(self, "indices", None)
|
133
|
-
if indices is not None:
|
134
|
-
return _safer_read_partial(self.storage["X"], indices=indices)
|
135
|
-
else:
|
136
|
-
return _try_backed_full(self.storage["X"])
|
137
|
-
|
138
|
-
@cached_property
|
139
|
-
def obsm(self):
|
140
|
-
if "obsm" not in self._attrs_keys:
|
141
|
-
return None
|
142
|
-
indices = getattr(self, "indices", None)
|
143
|
-
if indices is not None:
|
144
|
-
indices = (indices[0], slice(None))
|
145
|
-
return _MapAccessor(self.storage["obsm"], "obsm", indices)
|
146
|
-
|
147
|
-
@cached_property
|
148
|
-
def varm(self):
|
149
|
-
if "varm" not in self._attrs_keys:
|
150
|
-
return None
|
151
|
-
indices = getattr(self, "indices", None)
|
152
|
-
if indices is not None:
|
153
|
-
indices = (indices[1], slice(None))
|
154
|
-
return _MapAccessor(self.storage["varm"], "varm", indices)
|
155
|
-
|
156
|
-
@cached_property
|
157
|
-
def obsp(self):
|
158
|
-
if "obsp" not in self._attrs_keys:
|
159
|
-
return None
|
160
|
-
indices = getattr(self, "indices", None)
|
161
|
-
if indices is not None:
|
162
|
-
indices = (indices[0], indices[0])
|
163
|
-
return _MapAccessor(self.storage["obsp"], "obsp", indices)
|
164
|
-
|
165
|
-
@cached_property
|
166
|
-
def varp(self):
|
167
|
-
if "varp" not in self._attrs_keys:
|
168
|
-
return None
|
169
|
-
indices = getattr(self, "indices", None)
|
170
|
-
if indices is not None:
|
171
|
-
indices = (indices[1], indices[1])
|
172
|
-
return _MapAccessor(self.storage["varp"], "varp", indices)
|
173
|
-
|
174
|
-
@cached_property
|
175
|
-
def layers(self):
|
176
|
-
if "layers" not in self._attrs_keys:
|
177
|
-
return None
|
178
|
-
indices = getattr(self, "indices", None)
|
179
|
-
return _MapAccessor(self.storage["layers"], "layers", indices)
|
180
|
-
|
181
|
-
@property
|
182
|
-
def obs_names(self):
|
183
|
-
return self._obs_names
|
184
|
-
|
185
|
-
@property
|
186
|
-
def var_names(self):
|
187
|
-
return self._var_names
|
188
|
-
|
189
|
-
@cached_property
|
190
|
-
def shape(self):
|
191
|
-
return len(self._obs_names), len(self._var_names)
|
192
|
-
|
193
|
-
def to_dict(self):
|
194
|
-
prepare_adata = {}
|
195
|
-
|
196
|
-
prepare_adata["X"] = _to_memory(self.X)
|
197
|
-
|
198
|
-
if "uns" in self._attrs_keys:
|
199
|
-
prepare_adata["uns"] = self.uns
|
200
|
-
|
201
|
-
for attr in ("obs", "var"):
|
202
|
-
if attr in self._attrs_keys:
|
203
|
-
prepare_adata[attr] = getattr(self, attr)
|
204
|
-
|
205
|
-
for attr in ("obsm", "varm", "obsp", "varp", "layers"):
|
206
|
-
if attr in self._attrs_keys:
|
207
|
-
prepare_adata[attr] = {}
|
208
|
-
get_attr = getattr(self, attr)
|
209
|
-
for key in self._attrs_keys[attr]:
|
210
|
-
prepare_adata[attr][key] = _to_memory(get_attr[key])
|
211
|
-
|
212
|
-
if "raw" in self._attrs_keys:
|
213
|
-
prepare_adata["raw"] = self.raw.to_dict()
|
214
|
-
|
215
|
-
return prepare_adata
|
216
|
-
|
217
|
-
def to_memory(self):
|
218
|
-
adata = AnnData(**self.to_dict())
|
219
|
-
return adata
|
220
|
-
|
221
|
-
class AnnDataAccessorSubset(_AnnDataAttrsMixin):
|
222
|
-
def __init__(
|
223
|
-
self, storage, indices, attrs_keys, obs_names, var_names, ref_shape
|
224
|
-
):
|
225
|
-
self.storage = storage
|
226
|
-
self.indices = indices
|
227
|
-
|
228
|
-
self._attrs_keys = attrs_keys
|
229
|
-
self._obs_names, self._var_names = obs_names, var_names
|
230
|
-
|
231
|
-
self._ref_shape = ref_shape
|
232
|
-
|
233
|
-
def __getitem__(self, index: Index):
|
234
|
-
"""Access a subset of the underlying AnnData object."""
|
235
|
-
oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)
|
236
|
-
new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]
|
237
|
-
if self.indices is not None:
|
238
|
-
oidx = _resolve_idx(self.indices[0], oidx, self._ref_shape[0])
|
239
|
-
vidx = _resolve_idx(self.indices[1], vidx, self._ref_shape[1])
|
240
|
-
return type(self)(
|
241
|
-
self.storage,
|
242
|
-
(oidx, vidx),
|
243
|
-
self._attrs_keys,
|
244
|
-
new_obs_names,
|
245
|
-
new_var_names,
|
246
|
-
self._ref_shape,
|
247
|
-
)
|
248
|
-
|
249
|
-
def __repr__(self):
|
250
|
-
"""Description of the object."""
|
251
|
-
n_obs, n_vars = self.shape
|
252
|
-
descr = (
|
253
|
-
f"{type(self).__name__} object with n_obs × n_vars = {n_obs} × {n_vars}"
|
254
|
-
)
|
255
|
-
for attr, keys in self._attrs_keys.items():
|
256
|
-
descr += f"\n {attr}: {keys}"
|
257
|
-
return descr
|
258
|
-
|
259
|
-
@cached_property
|
260
|
-
def raw(self):
|
261
|
-
if "raw" not in self._attrs_keys:
|
262
|
-
return None
|
263
|
-
prepare_indices = None
|
264
|
-
if self.indices is not None:
|
265
|
-
oidx = self.indices[0]
|
266
|
-
if oidx != slice(None):
|
267
|
-
prepare_indices = oidx, slice(None)
|
268
|
-
return AnnDataRawAccessor(
|
269
|
-
self.storage["raw"],
|
270
|
-
prepare_indices,
|
271
|
-
None,
|
272
|
-
self._obs_names,
|
273
|
-
None,
|
274
|
-
self._ref_shape[0],
|
275
|
-
)
|
276
|
-
|
277
|
-
class AnnDataRawAccessor(AnnDataAccessorSubset):
|
278
|
-
def __init__(
|
279
|
-
self, storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
|
280
|
-
):
|
281
|
-
var_raw = storage_raw["var"]
|
282
|
-
|
283
|
-
if var_names is None:
|
284
|
-
var_names = read_elem(var_raw[_read_attr(var_raw.attrs, "_index")])
|
285
|
-
|
286
|
-
if isinstance(ref_shape, int):
|
287
|
-
ref_shape = ref_shape, len(var_names)
|
288
|
-
elif isinstance(ref_shape, tuple) and len(ref_shape) < 2:
|
289
|
-
ref_shape = ref_shape[0], len(var_names)
|
290
|
-
|
291
|
-
if attrs_keys is None:
|
292
|
-
attrs_keys = {}
|
293
|
-
if isinstance(var_raw, (h5py.Dataset, zarr.Array)):
|
294
|
-
attrs_keys["var"] = list(var_raw.dtype.fields.keys())
|
295
|
-
else:
|
296
|
-
# for some reason list(var_raw.keys()) is very slow for zarr
|
297
|
-
# maybe also directly get keys from the underlying mapper
|
298
|
-
attrs_keys["var"] = [key for key in var_raw]
|
299
|
-
if "varm" in storage_raw:
|
300
|
-
varm_keys_raw = [key for key in storage_raw["varm"]]
|
301
|
-
if len(varm_keys_raw) > 0:
|
302
|
-
attrs_keys["varm"] = varm_keys_raw
|
303
|
-
|
304
|
-
super().__init__(
|
305
|
-
storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
|
306
|
-
)
|
307
|
-
|
308
|
-
@property
|
309
|
-
def raw(self):
|
310
|
-
raise AttributeError
|
311
|
-
|
312
|
-
class AnnDataAccessor(_AnnDataAttrsMixin):
|
313
|
-
"""Cloud-backed AnnData."""
|
314
|
-
|
315
|
-
def __init__(
|
316
|
-
self,
|
317
|
-
connection: Union[OpenFile, None],
|
318
|
-
storage: Union[h5py.File, zarr.Group],
|
319
|
-
filename: str,
|
320
|
-
):
|
321
|
-
self._conn = connection
|
322
|
-
self.storage = storage
|
323
|
-
|
324
|
-
if isinstance(self.storage, h5py.File):
|
325
|
-
self._attrs_keys = _keys_h5(self.storage)
|
326
|
-
elif isinstance(self.storage, zarr.Group):
|
327
|
-
self._attrs_keys = _keys_zarr(self.storage)
|
328
|
-
else:
|
329
|
-
raise ValueError("Unknown type of storage.")
|
330
|
-
|
331
|
-
self._name = filename
|
332
|
-
|
333
|
-
self._obs_names, self._var_names = read_indices(self.storage)
|
334
|
-
|
335
|
-
def __del__(self):
|
336
|
-
"""Closes the connection."""
|
337
|
-
if self._conn is not None:
|
338
|
-
self.storage.close()
|
339
|
-
self._conn.close()
|
340
|
-
|
341
|
-
def __getitem__(self, index: Index) -> AnnDataAccessorSubset:
|
342
|
-
"""Access a subset of the underlying AnnData object."""
|
343
|
-
oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)
|
344
|
-
new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]
|
345
|
-
return AnnDataAccessorSubset(
|
346
|
-
self.storage,
|
347
|
-
(oidx, vidx),
|
348
|
-
self._attrs_keys,
|
349
|
-
new_obs_names,
|
350
|
-
new_var_names,
|
351
|
-
self.shape,
|
352
|
-
)
|
353
|
-
|
354
|
-
def __repr__(self):
|
355
|
-
"""Description of the AnnDataAccessor object."""
|
356
|
-
n_obs, n_vars = self.shape
|
357
|
-
descr = f"AnnDataAccessor object with n_obs × n_vars = {n_obs} × {n_vars}"
|
358
|
-
descr += f"\n constructed for the AnnData object {self._name}"
|
359
|
-
for attr, keys in self._attrs_keys.items():
|
360
|
-
descr += f"\n {attr}: {keys}"
|
361
|
-
return descr
|
362
|
-
|
363
|
-
@cached_property
|
364
|
-
def raw(self):
|
365
|
-
if "raw" not in self._attrs_keys:
|
366
|
-
return None
|
367
|
-
return AnnDataRawAccessor(
|
368
|
-
self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
|
369
|
-
)
|
370
|
-
|
371
170
|
# this is needed because accessing zarr.Group.keys() directly is very slow
|
372
|
-
|
171
|
+
@registry.register("zarr")
|
172
|
+
def keys(storage: zarr.Group): # noqa
|
373
173
|
paths = storage._store.keys()
|
374
174
|
|
375
175
|
attrs_keys: Dict[str, list] = {}
|
@@ -408,58 +208,384 @@ if ZARR_INSTALLED:
|
|
408
208
|
|
409
209
|
return {attr: keys for attr, keys in attrs_keys.items() if len(keys) > 0}
|
410
210
|
|
411
|
-
def _keys_h5(storage: h5py.File):
|
412
|
-
attrs_keys: Dict[str, list] = {}
|
413
|
-
for attr in storage.keys():
|
414
|
-
if attr == "X":
|
415
|
-
continue
|
416
|
-
attr_obj = storage[attr]
|
417
|
-
if attr in ("obs", "var") and isinstance(attr_obj, h5py.Dataset):
|
418
|
-
keys = list(attr_obj.dtype.fields.keys())
|
419
|
-
else:
|
420
|
-
keys = list(attr_obj.keys())
|
421
|
-
if len(keys) > 0:
|
422
|
-
attrs_keys[attr] = keys
|
423
|
-
return attrs_keys
|
424
|
-
|
425
|
-
@dataclass
|
426
|
-
class BackedAccessor:
|
427
|
-
"""h5py.File or zarr.Group accessor."""
|
428
|
-
|
429
|
-
connection: OpenFile
|
430
|
-
"""The connection."""
|
431
|
-
storage: Union[h5py.File, zarr.Group]
|
432
|
-
"""The storage access."""
|
433
|
-
|
434
|
-
def backed_access(file_or_filepath: File) -> Union[AnnDataAccessor, BackedAccessor]:
|
435
|
-
if isinstance(file_or_filepath, File):
|
436
|
-
filepath = filepath_from_file(file_or_filepath)
|
437
|
-
name = File.key
|
438
|
-
else:
|
439
|
-
filepath = file_or_filepath
|
440
|
-
name = filepath.name
|
441
|
-
fs, file_path_str = infer_filesystem(filepath)
|
442
211
|
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
212
|
+
ArrayTypes = tuple(ArrayTypes) # type: ignore
|
213
|
+
GroupTypes = tuple(GroupTypes) # type: ignore
|
214
|
+
StorageTypes = tuple(StorageTypes) # type: ignore
|
215
|
+
|
216
|
+
ArrayOrSparseTypes = ArrayTypes + (SparseDataset,) # type: ignore
|
217
|
+
|
218
|
+
|
219
|
+
ArrayType = Union[ArrayTypes] # type: ignore
|
220
|
+
GroupType = Union[GroupTypes] # type: ignore
|
221
|
+
StorageType = Union[StorageTypes] # type: ignore
|
222
|
+
|
223
|
+
|
224
|
+
def _to_memory(elem):
|
225
|
+
if isinstance(elem, ArrayOrSparseTypes):
|
226
|
+
return elem[()]
|
227
|
+
else:
|
228
|
+
return elem
|
229
|
+
|
230
|
+
|
231
|
+
def _try_backed_full(elem):
|
232
|
+
# think what to do for compatibility with old var and obs
|
233
|
+
if isinstance(elem, ArrayTypes):
|
234
|
+
return elem
|
235
|
+
|
236
|
+
if isinstance(elem, GroupTypes):
|
237
|
+
encoding_type = get_spec(elem).encoding_type
|
238
|
+
if encoding_type in ("csr_matrix", "csc_matrix"):
|
239
|
+
return SparseDataset(elem)
|
240
|
+
if "h5sparse_format" in elem.attrs:
|
241
|
+
return SparseDataset(elem)
|
242
|
+
|
243
|
+
return read_elem(elem)
|
244
|
+
|
245
|
+
|
246
|
+
def _safer_read_index(elem):
|
247
|
+
if isinstance(elem, GroupTypes):
|
248
|
+
return read_elem(elem[_read_attr(elem.attrs, "_index")])
|
249
|
+
elif isinstance(elem, ArrayTypes):
|
250
|
+
indices = None
|
251
|
+
for index_name in ("index", "_index"):
|
252
|
+
if index_name in elem.dtype.names:
|
253
|
+
indices = elem[index_name]
|
254
|
+
break
|
255
|
+
if indices is not None and len(indices) > 0:
|
256
|
+
if isinstance(indices[0], bytes):
|
257
|
+
indices = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)(indices)
|
258
|
+
return indices
|
453
259
|
else:
|
454
|
-
raise ValueError(
|
455
|
-
|
456
|
-
|
457
|
-
)
|
260
|
+
raise ValueError("Indices not found.")
|
261
|
+
else:
|
262
|
+
raise ValueError(f"Unknown elem type {type(elem)} when reading indices.")
|
458
263
|
|
459
|
-
|
460
|
-
|
264
|
+
|
265
|
+
class _MapAccessor:
|
266
|
+
def __init__(self, elem, name, indices=None):
|
267
|
+
self.elem = elem
|
268
|
+
self.indices = indices
|
269
|
+
self.name = name
|
270
|
+
|
271
|
+
def __getitem__(self, key):
|
272
|
+
if self.indices is None:
|
273
|
+
return _try_backed_full(self.elem[key])
|
274
|
+
else:
|
275
|
+
return registry.safer_read_partial(self.elem[key], indices=self.indices)
|
276
|
+
|
277
|
+
def keys(self):
|
278
|
+
return list(self.elem.keys())
|
279
|
+
|
280
|
+
def __repr__(self):
|
281
|
+
"""Description of the _MapAccessor object."""
|
282
|
+
descr = f"Accessor for the AnnData attribute {self.name}"
|
283
|
+
descr += f"\n with keys: {self.keys()}"
|
284
|
+
return descr
|
285
|
+
|
286
|
+
|
287
|
+
class _AnnDataAttrsMixin:
|
288
|
+
storage: StorageType
|
289
|
+
_attrs_keys: Mapping[str, list]
|
290
|
+
|
291
|
+
@cached_property
|
292
|
+
def obs(self) -> pd.DataFrame:
|
293
|
+
if "obs" not in self._attrs_keys:
|
294
|
+
return None
|
295
|
+
indices = getattr(self, "indices", None)
|
296
|
+
if indices is not None:
|
297
|
+
indices = (indices[0], slice(None))
|
298
|
+
return registry.safer_read_partial(self.storage["obs"], indices=indices) # type: ignore # noqa
|
461
299
|
else:
|
462
|
-
|
463
|
-
|
300
|
+
return registry.read_dataframe(self.storage["obs"]) # type: ignore
|
301
|
+
|
302
|
+
@cached_property
|
303
|
+
def var(self) -> pd.DataFrame:
|
304
|
+
if "var" not in self._attrs_keys:
|
305
|
+
return None
|
306
|
+
indices = getattr(self, "indices", None)
|
307
|
+
if indices is not None:
|
308
|
+
indices = (indices[1], slice(None))
|
309
|
+
return registry.safer_read_partial(self.storage["var"], indices=indices) # type: ignore # noqa
|
310
|
+
else:
|
311
|
+
return registry.read_dataframe(self.storage["var"]) # type: ignore
|
312
|
+
|
313
|
+
@cached_property
|
314
|
+
def uns(self):
|
315
|
+
if "uns" not in self._attrs_keys:
|
316
|
+
return None
|
317
|
+
return read_elem(self.storage["uns"])
|
318
|
+
|
319
|
+
@cached_property
|
320
|
+
def X(self):
|
321
|
+
indices = getattr(self, "indices", None)
|
322
|
+
if indices is not None:
|
323
|
+
return registry.safer_read_partial(self.storage["X"], indices=indices)
|
324
|
+
else:
|
325
|
+
return _try_backed_full(self.storage["X"])
|
326
|
+
|
327
|
+
@cached_property
|
328
|
+
def obsm(self):
|
329
|
+
if "obsm" not in self._attrs_keys:
|
330
|
+
return None
|
331
|
+
indices = getattr(self, "indices", None)
|
332
|
+
if indices is not None:
|
333
|
+
indices = (indices[0], slice(None))
|
334
|
+
return _MapAccessor(self.storage["obsm"], "obsm", indices)
|
335
|
+
|
336
|
+
@cached_property
|
337
|
+
def varm(self):
|
338
|
+
if "varm" not in self._attrs_keys:
|
339
|
+
return None
|
340
|
+
indices = getattr(self, "indices", None)
|
341
|
+
if indices is not None:
|
342
|
+
indices = (indices[1], slice(None))
|
343
|
+
return _MapAccessor(self.storage["varm"], "varm", indices)
|
344
|
+
|
345
|
+
@cached_property
|
346
|
+
def obsp(self):
|
347
|
+
if "obsp" not in self._attrs_keys:
|
348
|
+
return None
|
349
|
+
indices = getattr(self, "indices", None)
|
350
|
+
if indices is not None:
|
351
|
+
indices = (indices[0], indices[0])
|
352
|
+
return _MapAccessor(self.storage["obsp"], "obsp", indices)
|
353
|
+
|
354
|
+
@cached_property
|
355
|
+
def varp(self):
|
356
|
+
if "varp" not in self._attrs_keys:
|
357
|
+
return None
|
358
|
+
indices = getattr(self, "indices", None)
|
359
|
+
if indices is not None:
|
360
|
+
indices = (indices[1], indices[1])
|
361
|
+
return _MapAccessor(self.storage["varp"], "varp", indices)
|
362
|
+
|
363
|
+
@cached_property
|
364
|
+
def layers(self):
|
365
|
+
if "layers" not in self._attrs_keys:
|
366
|
+
return None
|
367
|
+
indices = getattr(self, "indices", None)
|
368
|
+
return _MapAccessor(self.storage["layers"], "layers", indices)
|
369
|
+
|
370
|
+
@property
|
371
|
+
def obs_names(self):
|
372
|
+
return self._obs_names
|
373
|
+
|
374
|
+
@property
|
375
|
+
def var_names(self):
|
376
|
+
return self._var_names
|
377
|
+
|
378
|
+
@cached_property
|
379
|
+
def shape(self):
|
380
|
+
return len(self._obs_names), len(self._var_names)
|
381
|
+
|
382
|
+
def to_dict(self):
|
383
|
+
prepare_adata = {}
|
384
|
+
|
385
|
+
prepare_adata["X"] = _to_memory(self.X)
|
386
|
+
|
387
|
+
if "uns" in self._attrs_keys:
|
388
|
+
prepare_adata["uns"] = self.uns
|
389
|
+
|
390
|
+
for attr in ("obs", "var"):
|
391
|
+
if attr in self._attrs_keys:
|
392
|
+
prepare_adata[attr] = getattr(self, attr)
|
393
|
+
|
394
|
+
for attr in ("obsm", "varm", "obsp", "varp", "layers"):
|
395
|
+
if attr in self._attrs_keys:
|
396
|
+
prepare_adata[attr] = {}
|
397
|
+
get_attr = getattr(self, attr)
|
398
|
+
for key in self._attrs_keys[attr]:
|
399
|
+
prepare_adata[attr][key] = _to_memory(get_attr[key])
|
400
|
+
|
401
|
+
if "raw" in self._attrs_keys:
|
402
|
+
prepare_adata["raw"] = self.raw.to_dict()
|
403
|
+
|
404
|
+
return prepare_adata
|
405
|
+
|
406
|
+
def to_memory(self):
|
407
|
+
adata = AnnData(**self.to_dict())
|
408
|
+
return adata
|
409
|
+
|
410
|
+
|
411
|
+
class AnnDataAccessorSubset(_AnnDataAttrsMixin):
|
412
|
+
def __init__(self, storage, indices, attrs_keys, obs_names, var_names, ref_shape):
|
413
|
+
self.storage = storage
|
414
|
+
self.indices = indices
|
415
|
+
|
416
|
+
self._attrs_keys = attrs_keys
|
417
|
+
self._obs_names, self._var_names = obs_names, var_names
|
418
|
+
|
419
|
+
self._ref_shape = ref_shape
|
420
|
+
|
421
|
+
def __getitem__(self, index: Index):
|
422
|
+
"""Access a subset of the underlying AnnData object."""
|
423
|
+
oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)
|
424
|
+
new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]
|
425
|
+
if self.indices is not None:
|
426
|
+
oidx = _resolve_idx(self.indices[0], oidx, self._ref_shape[0])
|
427
|
+
vidx = _resolve_idx(self.indices[1], vidx, self._ref_shape[1])
|
428
|
+
return type(self)(
|
429
|
+
self.storage,
|
430
|
+
(oidx, vidx),
|
431
|
+
self._attrs_keys,
|
432
|
+
new_obs_names,
|
433
|
+
new_var_names,
|
434
|
+
self._ref_shape,
|
435
|
+
)
|
436
|
+
|
437
|
+
def __repr__(self):
|
438
|
+
"""Description of the object."""
|
439
|
+
n_obs, n_vars = self.shape
|
440
|
+
descr = f"{type(self).__name__} object with n_obs × n_vars = {n_obs} × {n_vars}"
|
441
|
+
for attr, keys in self._attrs_keys.items():
|
442
|
+
descr += f"\n {attr}: {keys}"
|
443
|
+
return descr
|
444
|
+
|
445
|
+
@cached_property
|
446
|
+
def raw(self):
|
447
|
+
if "raw" not in self._attrs_keys:
|
448
|
+
return None
|
449
|
+
prepare_indices = None
|
450
|
+
if self.indices is not None:
|
451
|
+
oidx = self.indices[0]
|
452
|
+
if oidx != slice(None):
|
453
|
+
prepare_indices = oidx, slice(None)
|
454
|
+
return AnnDataRawAccessor(
|
455
|
+
self.storage["raw"],
|
456
|
+
prepare_indices,
|
457
|
+
None,
|
458
|
+
self._obs_names,
|
459
|
+
None,
|
460
|
+
self._ref_shape[0],
|
461
|
+
)
|
462
|
+
|
463
|
+
|
464
|
+
class AnnDataRawAccessor(AnnDataAccessorSubset):
|
465
|
+
def __init__(
|
466
|
+
self, storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
|
467
|
+
):
|
468
|
+
var_raw = storage_raw["var"]
|
469
|
+
|
470
|
+
if var_names is None:
|
471
|
+
var_names = _safer_read_index(var_raw)
|
472
|
+
|
473
|
+
if isinstance(ref_shape, int):
|
474
|
+
ref_shape = ref_shape, len(var_names)
|
475
|
+
elif isinstance(ref_shape, tuple) and len(ref_shape) < 2:
|
476
|
+
ref_shape = ref_shape[0], len(var_names)
|
477
|
+
|
478
|
+
if attrs_keys is None:
|
479
|
+
attrs_keys = {}
|
480
|
+
if isinstance(var_raw, (h5py.Dataset, zarr.Array)):
|
481
|
+
attrs_keys["var"] = list(var_raw.dtype.fields.keys())
|
464
482
|
else:
|
465
|
-
|
483
|
+
# for some reason list(var_raw.keys()) is very slow for zarr
|
484
|
+
# maybe also directly get keys from the underlying mapper
|
485
|
+
attrs_keys["var"] = [key for key in var_raw]
|
486
|
+
if "varm" in storage_raw:
|
487
|
+
varm_keys_raw = [key for key in storage_raw["varm"]]
|
488
|
+
if len(varm_keys_raw) > 0:
|
489
|
+
attrs_keys["varm"] = varm_keys_raw
|
490
|
+
|
491
|
+
super().__init__(
|
492
|
+
storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
|
493
|
+
)
|
494
|
+
|
495
|
+
@property
|
496
|
+
def raw(self):
|
497
|
+
raise AttributeError
|
498
|
+
|
499
|
+
|
500
|
+
class AnnDataAccessor(_AnnDataAttrsMixin):
|
501
|
+
"""Cloud-backed AnnData."""
|
502
|
+
|
503
|
+
def __init__(
|
504
|
+
self,
|
505
|
+
connection: Union[OpenFile, None],
|
506
|
+
storage: StorageType,
|
507
|
+
filename: str,
|
508
|
+
):
|
509
|
+
self._conn = connection
|
510
|
+
self.storage = storage
|
511
|
+
|
512
|
+
self._attrs_keys = registry.keys(self.storage)
|
513
|
+
|
514
|
+
self._name = filename
|
515
|
+
|
516
|
+
self._obs_names = _safer_read_index(self.storage["obs"]) # type: ignore
|
517
|
+
self._var_names = _safer_read_index(self.storage["var"]) # type: ignore
|
518
|
+
|
519
|
+
def __del__(self):
|
520
|
+
"""Closes the connection."""
|
521
|
+
if self._conn is not None:
|
522
|
+
self.storage.close()
|
523
|
+
self._conn.close()
|
524
|
+
|
525
|
+
def __getitem__(self, index: Index) -> AnnDataAccessorSubset:
|
526
|
+
"""Access a subset of the underlying AnnData object."""
|
527
|
+
oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)
|
528
|
+
new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]
|
529
|
+
return AnnDataAccessorSubset(
|
530
|
+
self.storage,
|
531
|
+
(oidx, vidx),
|
532
|
+
self._attrs_keys,
|
533
|
+
new_obs_names,
|
534
|
+
new_var_names,
|
535
|
+
self.shape,
|
536
|
+
)
|
537
|
+
|
538
|
+
def __repr__(self):
|
539
|
+
"""Description of the AnnDataAccessor object."""
|
540
|
+
n_obs, n_vars = self.shape
|
541
|
+
descr = f"AnnDataAccessor object with n_obs × n_vars = {n_obs} × {n_vars}"
|
542
|
+
descr += f"\n constructed for the AnnData object {self._name}"
|
543
|
+
for attr, keys in self._attrs_keys.items():
|
544
|
+
descr += f"\n {attr}: {keys}"
|
545
|
+
return descr
|
546
|
+
|
547
|
+
@cached_property
|
548
|
+
def raw(self):
|
549
|
+
if "raw" not in self._attrs_keys:
|
550
|
+
return None
|
551
|
+
return AnnDataRawAccessor(
|
552
|
+
self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
|
553
|
+
)
|
554
|
+
|
555
|
+
|
556
|
+
@dataclass
|
557
|
+
class BackedAccessor:
|
558
|
+
"""h5py.File or zarr.Group accessor."""
|
559
|
+
|
560
|
+
connection: OpenFile
|
561
|
+
"""The connection."""
|
562
|
+
storage: StorageType
|
563
|
+
"""The storage access."""
|
564
|
+
|
565
|
+
|
566
|
+
def backed_access(
|
567
|
+
file_or_filepath: Union[File, Path]
|
568
|
+
) -> Union[AnnDataAccessor, BackedAccessor]:
|
569
|
+
if isinstance(file_or_filepath, File):
|
570
|
+
filepath = filepath_from_file(file_or_filepath)
|
571
|
+
else:
|
572
|
+
filepath = file_or_filepath
|
573
|
+
name = filepath.name
|
574
|
+
|
575
|
+
if filepath.suffix in (".h5", ".hdf5", ".h5ad"):
|
576
|
+
conn, storage = registry.open("h5py", filepath)
|
577
|
+
elif filepath.suffix in (".zarr", ".zrad"):
|
578
|
+
conn, storage = registry.open("zarr", filepath)
|
579
|
+
else:
|
580
|
+
raise ValueError(
|
581
|
+
"file should have .h5, .hdf5, .h5ad, .zarr or .zrad suffix, not"
|
582
|
+
f" {filepath.suffix}."
|
583
|
+
)
|
584
|
+
|
585
|
+
if filepath.suffix in (".h5ad", ".zrad"):
|
586
|
+
return AnnDataAccessor(conn, storage, name)
|
587
|
+
else:
|
588
|
+
if get_spec(storage).encoding_type == "anndata":
|
589
|
+
return AnnDataAccessor(conn, storage, name)
|
590
|
+
else:
|
591
|
+
return BackedAccessor(conn, storage)
|