lamindb 0.74.3__py3-none-any.whl → 0.75.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/_artifact.py +85 -43
- lamindb/_can_validate.py +55 -20
- lamindb/_collection.py +36 -28
- lamindb/_curate.py +55 -44
- lamindb/_feature_set.py +5 -5
- lamindb/_filter.py +3 -3
- lamindb/_finish.py +29 -23
- lamindb/_from_values.py +41 -60
- lamindb/_is_versioned.py +1 -1
- lamindb/_parents.py +38 -13
- lamindb/_record.py +19 -20
- lamindb/_save.py +2 -2
- lamindb/_transform.py +27 -16
- lamindb/core/_data.py +14 -16
- lamindb/core/_feature_manager.py +34 -44
- lamindb/core/_label_manager.py +17 -19
- lamindb/core/_mapped_collection.py +1 -1
- lamindb/core/_run_context.py +6 -8
- lamindb/core/datasets/_core.py +7 -7
- lamindb/core/exceptions.py +11 -0
- lamindb/core/storage/__init__.py +1 -0
- lamindb/core/storage/_anndata_accessor.py +735 -0
- lamindb/core/storage/_backed_access.py +77 -747
- lamindb/core/storage/paths.py +9 -14
- lamindb/core/types.py +3 -0
- lamindb/core/versioning.py +1 -1
- lamindb/integrations/__init__.py +1 -0
- {lamindb-0.74.3.dist-info → lamindb-0.75.0.dist-info}/METADATA +5 -5
- lamindb-0.75.0.dist-info/RECORD +58 -0
- lamindb-0.74.3.dist-info/RECORD +0 -57
- {lamindb-0.74.3.dist-info → lamindb-0.75.0.dist-info}/LICENSE +0 -0
- {lamindb-0.74.3.dist-info → lamindb-0.75.0.dist-info}/WHEEL +0 -0
lamindb/core/storage/_backed_access.py
@@ -1,731 +1,92 @@
 from __future__ import annotations
 
-import inspect
 from dataclasses import dataclass
-from
-from itertools import chain
-from typing import TYPE_CHECKING, Callable, Mapping, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal
 
-import
-import numpy as np
-import pandas as pd
-from anndata import AnnData
-from anndata import __version__ as anndata_version
-from anndata._core.index import Index, _normalize_indices
-from anndata._core.views import _resolve_idx
-from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5
-from anndata._io.specs.registry import get_spec, read_elem, read_elem_partial
-from anndata.compat import _read_attr
-from fsspec.implementations.local import LocalFileSystem
-from lamin_utils import logger
-from lamindb_setup.core.upath import UPath, create_mapper, infer_filesystem
+from anndata._io.specs.registry import get_spec
 from lnschema_core import Artifact
-from packaging import version
 
-from
+from ._anndata_accessor import AnnDataAccessor, StorageType, registry
+from .paths import filepath_from_artifact
 
 if TYPE_CHECKING:
-    from pathlib import Path
-
     from fsspec.core import OpenFile
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
+    from upath import UPath
 
-anndata_version_parse = version.parse(anndata_version)
-
-if anndata_version_parse < version.parse("0.10.0"):
-    if anndata_version_parse < version.parse("0.9.1"):
-        logger.warning(
-            "Full backed capabilities are not available for this version of anndata,"
-            " please install anndata>=0.9.1."
-        )
-
-    from anndata._core.sparse_dataset import SparseDataset
-
-    # try csr for groups with no encoding_type
-    class CSRDataset(SparseDataset):
-        @property
-        def format_str(self) -> str:
-            return "csr"
-
-    def sparse_dataset(group):
-        return SparseDataset(group)
-
-else:
-    from anndata._core.sparse_dataset import (
-        BaseCompressedSparseDataset as SparseDataset,
-    )
-    from anndata._core.sparse_dataset import (  # type: ignore
-        CSRDataset,
-        sparse_dataset,
-    )
-
-    def _check_group_format(*args):
-        pass
 
-
+# this dynamically creates a subclass of a context manager class
+# and reassigns it to an instance of the superclass
+# so that the instance calls finalize on close or exit
+def _track_writes_factory(obj: Any, finalize: Callable):
+    closed: bool = False
 
+    tracked_class = obj.__class__
+    type_dict = {"__doc__": tracked_class.__doc__}
+    if hasattr(tracked_class, "__slots__"):
+        type_dict["__slots__"] = ()
+    if hasattr(tracked_class, "__exit__"):
 
-
-
-
-
-
-
-        return sparse_ds.to_memory()
-    else:
-        return sparse_ds[indices]
-
-
-def get_module_name(obj):
-    return inspect.getmodule(obj).__name__.partition(".")[0]
-
-
-def _records_to_df(obj):
-    if isinstance(obj, pd.DataFrame):
-        return obj
-
-    if hasattr(obj, "dtype") and obj.dtype.names is not None:
-        formats = []
-        for name, (dt, _) in obj.dtype.fields.items():
-            if dt.char == "S":
-                new_dt = str(dt).replace("S", "U")
-            else:
-                new_dt = dt
-            formats.append((name, new_dt))
-        df = pd.DataFrame(obj.astype(formats, copy=False))
-        for index_name in ("index", "_index"):
-            if index_name in df.columns:
-                return df.set_index(index_name)
-        return df
-    else:
-        return obj
-
+        def __exit__(self, exc_type, exc_val, exc_tb):
+            nonlocal closed
+            tracked_class.__exit__(self, exc_type, exc_val, exc_tb)
+            if not closed:
+                finalize()
+                closed = True
 
-
-
-        self._registry = {}
-        self._openers = {}
+        type_dict["__exit__"] = __exit__
+    if hasattr(tracked_class, "close"):
 
-
-
-self
-
+        def close(self, *args, **kwargs):
+            nonlocal closed
+            tracked_class.close(self, *args, **kwargs)
+            if not closed:
+                finalize()
+                closed = True
 
-
+        type_dict["close"] = close
 
-
-
-
-        else:
-            raise ValueError(f"Module {module} not found, please install it.")
+    Track = type(tracked_class.__name__ + "Track", (tracked_class,), type_dict)
+    obj.__class__ = Track
+    return obj
 
-    def register(self, module: str):
-        def wrapper(func: Callable):
-            func_name = func.__name__
-            if func_name not in self._registry:
-                self._registry[func_name] = {}
-            self._registry[func_name][module] = func
-            return func
 
-
-
-
-        def wrapper(*args, **kwargs):
-            func_registry = self._registry[func_name]
-            for arg in chain(args, kwargs.values()):
-                arg_module = get_module_name(arg)
-                if arg_module in func_registry:
-                    return func_registry[arg_module](*args, **kwargs)
-            raise ValueError(f"{func_name} is not registered for this module.")
-
-        return wrapper
-
-
-# storage specific functions should be registered and called through the registry
-registry = AccessRecord()
-
-
-@registry.register_open("h5py")
-def open(filepath: UPath | Path | str):
-    fs, file_path_str = infer_filesystem(filepath)
-    if isinstance(fs, LocalFileSystem):
-        return None, h5py.File(file_path_str, mode="r")
-    conn = fs.open(file_path_str, mode="rb")
+def _open_tiledbsoma(
+    filepath: UPath, mode: Literal["r", "w"] = "r"
+) -> SOMACollection | SOMAExperiment:
     try:
-
-    except
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    indices_increasing = []
-    indices_inverse = []
-    for indices_dim in indices:
-        # should be integer or bool
-        # ignore bool or increasing unique integers
-        if (
-            isinstance(indices_dim, np.ndarray)
-            and indices_dim.dtype != "bool"
-            and not np.all(np.diff(indices_dim) > 0)
-        ):
-            idx_unique, idx_inverse = np.unique(indices_dim, return_inverse=True)
-            indices_increasing.append(idx_unique)
-            indices_inverse.append(idx_inverse)
-        else:
-            indices_increasing.append(indices_dim)
-            indices_inverse.append(None)
-    indices = tuple(indices_increasing)
-    if all(idx is None for idx in indices_inverse):
-        indices_inverse = None
-    result = None
-    if encoding_type == "":
-        if is_dataset:
-            dims = len(elem.shape)
-            if dims == 2:
-                result = elem[indices]
-            elif dims == 1:
-                if indices[0] == slice(None):
-                    result = elem[indices[1]]
-                elif indices[1] == slice(None):
-                    result = elem[indices[0]]
-        elif isinstance(elem, h5py.Group):
-            try:
-                ds = CSRDataset(elem)
-                result = _subset_sparse(ds, indices)
-            except Exception as e:
-                logger.debug(
-                    f"Encountered an exception while attempting to subset a sparse dataset by indices.\n{e}"
-                )
-        if result is None:
-            raise ValueError(
-                "Can not get a subset of the element of type"
-                f" {type(elem).__name__} with an empty spec."
-            )
-    else:
-        result = read_elem_partial(elem, indices=indices)
-    if indices_inverse is None:
-        return result
-    else:
-        if indices_inverse[0] is None:
-            if len(result.shape) == 2:
-                return result[:, indices_inverse[1]]
-            else:
-                return result[indices_inverse[1]]
-        elif indices_inverse[1] is None:
-            if isinstance(result, pd.DataFrame):
-                return result.iloc[indices_inverse[0]]
-            else:
-                return result[indices_inverse[0]]
-        else:
-            return result[tuple(indices_inverse)]
-
-
-@registry.register("h5py")
-def keys(storage: h5py.File):
-    attrs_keys: dict[str, list] = {}
-    for attr in storage.keys():
-        if attr == "X":
-            continue
-        attr_obj = storage[attr]
-        if attr in ("obs", "var") and isinstance(attr_obj, h5py.Dataset):
-            keys = list(attr_obj.dtype.fields.keys())
-        else:
-            keys = list(attr_obj.keys())
-        if len(keys) > 0:
-            attrs_keys[attr] = keys
-    return attrs_keys
-
-
-ArrayTypes = [h5py.Dataset]
-GroupTypes = [h5py.Group]
-StorageTypes = [h5py.File]
-
-
-ZARR_INSTALLED = False
-try:
-    import zarr
-
-    ZARR_INSTALLED = True
-except ImportError:
-    pass
-
-if ZARR_INSTALLED:
-    from anndata._io.zarr import read_dataframe_legacy as read_dataframe_legacy_zarr
-
-    ArrayTypes.append(zarr.Array)
-    GroupTypes.append(zarr.Group)
-    StorageTypes.append(zarr.Group)
-
-    @registry.register_open("zarr")
-    def open(filepath: Union[UPath, Path, str]):  # noqa
-        fs, file_path_str = infer_filesystem(filepath)
-        conn = None
-        if isinstance(fs, LocalFileSystem):
-            # this is faster than through an fsspec mapper for local
-            open_obj = file_path_str
-        else:
-            open_obj = create_mapper(fs, file_path_str, check=True)
-        storage = zarr.open(open_obj, mode="r")
-        return conn, storage
-
-    @registry.register("zarr")
-    def read_dataframe(elem: Union[zarr.Array, zarr.Group]):  # noqa
-        if isinstance(elem, zarr.Array):
-            return read_dataframe_legacy_zarr(elem)
-        else:
-            return read_elem(elem)
-
-    @registry.register("zarr")
-    def safer_read_partial(elem, indices):  # noqa
-        encoding_type = get_spec(elem).encoding_type
-        if encoding_type == "":
-            if isinstance(elem, zarr.Array):
-                dims = len(elem.shape)
-                if dims == 2:
-                    return elem.oindex[indices]
-                elif dims == 1:
-                    if indices[0] == slice(None):
-                        return elem.oindex[indices[1]]
-                    elif indices[1] == slice(None):
-                        return elem.oindex[indices[0]]
-            elif isinstance(elem, zarr.Group):
-                try:
-                    ds = CSRDataset(elem)
-                    return _subset_sparse(ds, indices)
-                except Exception as e:
-                    logger.debug(
-                        f"Encountered an exception while attempting to subset a sparse dataset by indices.\n{e}"
-                    )
-            raise ValueError(
-                "Can not get a subset of the element of type"
-                f" {type(elem).__name__} with an empty spec."
-            )
-        else:
-            if encoding_type in ("csr_matrix", "csc_matrix"):
-                ds = sparse_dataset(elem)
-                return _subset_sparse(ds, indices)
-            else:
-                return read_elem_partial(elem, indices=indices)
-
-    # this is needed because accessing zarr.Group.keys() directly is very slow
-    @registry.register("zarr")
-    def keys(storage: zarr.Group):  # noqa
-        paths = storage._store.keys()
-
-        attrs_keys: dict[str, list] = {}
-        obs_var_arrays = []
-
-        for path in paths:
-            if path in (".zattrs", ".zgroup"):
-                continue
-            parts = path.split("/")
-            if len(parts) < 2:
-                continue
-            attr = parts[0]
-            key = parts[1]
-
-            if attr == "X":
-                continue
-
-            if attr in ("obs", "var"):
-                if attr in obs_var_arrays:
-                    continue
-                if key == ".zarray":
-                    attrs_keys.pop(attr, None)
-                    obs_var_arrays.append(attr)
-
-            if attr not in attrs_keys:
-                attrs_keys[attr] = []
-
-            if key in (".zattrs", ".zgroup", ".zarray"):
-                continue
-            attr_keys = attrs_keys[attr]
-            if key not in attr_keys:
-                attr_keys.append(key)
-
-        for attr in obs_var_arrays:
-            attrs_keys[attr] = list(storage[attr].dtype.fields.keys())
-
-        return {attr: keys for attr, keys in attrs_keys.items() if len(keys) > 0}
-
-
-ArrayTypes = tuple(ArrayTypes)  # type: ignore
-GroupTypes = tuple(GroupTypes)  # type: ignore
-StorageTypes = tuple(StorageTypes)  # type: ignore
-
-
-ArrayType = Union[ArrayTypes]  # type: ignore
-GroupType = Union[GroupTypes]  # type: ignore
-StorageType = Union[StorageTypes]  # type: ignore
-
-
-def _to_memory(elem):
-    if isinstance(elem, ArrayTypes):
-        return elem[()]
-    elif isinstance(elem, SparseDataset):
-        return elem.to_memory()
+        import tiledbsoma as soma
+    except ImportError as e:
+        raise ImportError("Please install tiledbsoma: pip install tiledbsoma") from e
+    filepath_str = filepath.as_posix()
+    if filepath.protocol == "s3":
+        from lamindb_setup.core._settings_storage import get_storage_region
+
+        region = get_storage_region(filepath_str)
+        tiledb_config = {"vfs.s3.region": region}
+        storage_options = filepath.storage_options
+        if "key" in storage_options:
+            tiledb_config["vfs.s3.aws_access_key_id"] = storage_options["key"]
+        if "secret" in storage_options:
+            tiledb_config["vfs.s3.aws_secret_access_key"] = storage_options["secret"]
+        if "token" in storage_options:
+            tiledb_config["vfs.s3.aws_session_token"] = storage_options["token"]
+        ctx = soma.SOMATileDBContext(tiledb_config=tiledb_config)
+        # this is a strange bug
+        # for some reason iterdir futher gives incorrect results
+        # if cache is not invalidated
+        # instead of obs and ms it gives ms and ms in the list of names
+        filepath.fs.invalidate_cache()
     else:
-
-
-
-def _try_backed_full(elem):
-    # think what to do for compatibility with old var and obs
-    if isinstance(elem, ArrayTypes):
-        return elem
-
-    if isinstance(elem, GroupTypes):
-        encoding_type = get_spec(elem).encoding_type
-        if encoding_type in ("csr_matrix", "csc_matrix"):
-            return sparse_dataset(elem)
-        if "h5sparse_format" in elem.attrs:
-            return sparse_dataset(elem)
-        if encoding_type == "" and "indptr" in elem:
-            return CSRDataset(elem)
+        ctx = None
 
-
-
-
-def _safer_read_index(elem):
-    if isinstance(elem, GroupTypes):
-        return pd.Index(read_elem(elem[_read_attr(elem.attrs, "_index")]))
-    elif isinstance(elem, ArrayTypes):
-        indices = None
-        for index_name in ("index", "_index"):
-            if index_name in elem.dtype.names:
-                indices = elem[index_name]
-                break
-        if indices is not None and len(indices) > 0:
-            if isinstance(indices[0], bytes):
-                indices = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)(indices)
-            return pd.Index(indices)
-        else:
-            raise ValueError("Indices not found.")
+    soma_objects = [obj.name for obj in filepath.iterdir()]
+    if "obs" in soma_objects and "ms" in soma_objects:
+        SOMAType = soma.Experiment
     else:
-
-
-
-class _MapAccessor:
-    def __init__(self, elem, name, indices=None):
-        self.elem = elem
-        self.indices = indices
-        self.name = name
-
-    def __getitem__(self, key):
-        if self.indices is None:
-            return _try_backed_full(self.elem[key])
-        else:
-            return registry.safer_read_partial(self.elem[key], indices=self.indices)
-
-    def keys(self):
-        return list(self.elem.keys())
-
-    def __repr__(self):
-        """Description of the _MapAccessor object."""
-        descr = f"Accessor for the AnnData attribute {self.name}"
-        descr += f"\n with keys: {self.keys()}"
-        return descr
-
-
-class _AnnDataAttrsMixin:
-    storage: StorageType
-    _attrs_keys: Mapping[str, list]
-
-    @cached_property
-    def obs(self) -> pd.DataFrame:
-        if "obs" not in self._attrs_keys:
-            return None
-        indices = getattr(self, "indices", None)
-        if indices is not None:
-            indices = (indices[0], slice(None))
-            obj = registry.safer_read_partial(self.storage["obs"], indices=indices)  # type: ignore
-            return _records_to_df(obj)
-        else:
-            return registry.read_dataframe(self.storage["obs"])  # type: ignore
-
-    @cached_property
-    def var(self) -> pd.DataFrame:
-        if "var" not in self._attrs_keys:
-            return None
-        indices = getattr(self, "indices", None)
-        if indices is not None:
-            indices = (indices[1], slice(None))
-            obj = registry.safer_read_partial(self.storage["var"], indices=indices)  # type: ignore
-            return _records_to_df(obj)
-        else:
-            return registry.read_dataframe(self.storage["var"])  # type: ignore
-
-    @cached_property
-    def uns(self):
-        if "uns" not in self._attrs_keys:
-            return None
-        return read_elem(self.storage["uns"])
-
-    @cached_property
-    def X(self):
-        indices = getattr(self, "indices", None)
-        if indices is not None:
-            return registry.safer_read_partial(self.storage["X"], indices=indices)
-        else:
-            return _try_backed_full(self.storage["X"])
-
-    @cached_property
-    def obsm(self):
-        if "obsm" not in self._attrs_keys:
-            return None
-        indices = getattr(self, "indices", None)
-        if indices is not None:
-            indices = (indices[0], slice(None))
-        return _MapAccessor(self.storage["obsm"], "obsm", indices)
-
-    @cached_property
-    def varm(self):
-        if "varm" not in self._attrs_keys:
-            return None
-        indices = getattr(self, "indices", None)
-        if indices is not None:
-            indices = (indices[1], slice(None))
-        return _MapAccessor(self.storage["varm"], "varm", indices)
-
-    @cached_property
-    def obsp(self):
-        if "obsp" not in self._attrs_keys:
-            return None
-        indices = getattr(self, "indices", None)
-        if indices is not None:
-            indices = (indices[0], indices[0])
-        return _MapAccessor(self.storage["obsp"], "obsp", indices)
-
-    @cached_property
-    def varp(self):
-        if "varp" not in self._attrs_keys:
-            return None
-        indices = getattr(self, "indices", None)
-        if indices is not None:
-            indices = (indices[1], indices[1])
-        return _MapAccessor(self.storage["varp"], "varp", indices)
-
-    @cached_property
-    def layers(self):
-        if "layers" not in self._attrs_keys:
-            return None
-        indices = getattr(self, "indices", None)
-        return _MapAccessor(self.storage["layers"], "layers", indices)
-
-    @property
-    def obs_names(self):
-        return self._obs_names
-
-    @property
-    def var_names(self):
-        return self._var_names
-
-    @cached_property
-    def shape(self):
-        return len(self._obs_names), len(self._var_names)
-
-    def to_dict(self):
-        prepare_adata = {}
-
-        prepare_adata["X"] = _to_memory(self.X)
-
-        if "uns" in self._attrs_keys:
-            prepare_adata["uns"] = self.uns
-
-        for attr in ("obs", "var"):
-            if attr in self._attrs_keys:
-                prepare_adata[attr] = getattr(self, attr)
-
-        for attr in ("obsm", "varm", "obsp", "varp", "layers"):
-            if attr in self._attrs_keys:
-                prepare_adata[attr] = {}
-                get_attr = getattr(self, attr)
-                for key in self._attrs_keys[attr]:
-                    prepare_adata[attr][key] = _to_memory(get_attr[key])
-
-        if "raw" in self._attrs_keys:
-            prepare_adata["raw"] = self.raw.to_dict()
-
-        return prepare_adata
-
-    def to_memory(self):
-        adata = AnnData(**self.to_dict())
-        return adata
-
-
-class AnnDataAccessorSubset(_AnnDataAttrsMixin):
-    def __init__(self, storage, indices, attrs_keys, obs_names, var_names, ref_shape):
-        self.storage = storage
-        self.indices = indices
-
-        self._attrs_keys = attrs_keys
-        self._obs_names, self._var_names = obs_names, var_names
-
-        self._ref_shape = ref_shape
-
-    def __getitem__(self, index: Index):
-        """Access a subset of the underlying AnnData object."""
-        oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)
-        new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]
-        if self.indices is not None:
-            oidx = _resolve_idx(self.indices[0], oidx, self._ref_shape[0])
-            vidx = _resolve_idx(self.indices[1], vidx, self._ref_shape[1])
-        return type(self)(
-            self.storage,
-            (oidx, vidx),
-            self._attrs_keys,
-            new_obs_names,
-            new_var_names,
-            self._ref_shape,
-        )
-
-    def __repr__(self):
-        """Description of the object."""
-        n_obs, n_vars = self.shape
-        descr = f"{type(self).__name__} object with n_obs × n_vars = {n_obs} × {n_vars}"
-        for attr, keys in self._attrs_keys.items():
-            descr += f"\n {attr}: {keys}"
-        return descr
-
-    @cached_property
-    def raw(self):
-        if "raw" not in self._attrs_keys:
-            return None
-        prepare_indices = None
-        if self.indices is not None:
-            oidx = self.indices[0]
-            if isinstance(oidx, np.ndarray) or oidx != slice(None):
-                prepare_indices = oidx, slice(None)
-        return AnnDataRawAccessor(
-            self.storage["raw"],
-            prepare_indices,
-            None,
-            self._obs_names,
-            None,
-            self._ref_shape[0],
-        )
-
-
-class AnnDataRawAccessor(AnnDataAccessorSubset):
-    def __init__(
-        self, storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
-    ):
-        var_raw = storage_raw["var"]
-
-        if var_names is None:
-            var_names = _safer_read_index(var_raw)
-
-        if isinstance(ref_shape, int):
-            ref_shape = ref_shape, len(var_names)
-        elif isinstance(ref_shape, tuple) and len(ref_shape) < 2:
-            ref_shape = ref_shape[0], len(var_names)
-
-        if attrs_keys is None:
-            attrs_keys = {}
-            if isinstance(var_raw, ArrayTypes):
-                attrs_keys["var"] = list(var_raw.dtype.fields.keys())
-            else:
-                # for some reason list(var_raw.keys()) is very slow for zarr
-                # maybe also directly get keys from the underlying mapper
-                attrs_keys["var"] = list(var_raw)
-            if "varm" in storage_raw:
-                varm_keys_raw = list(storage_raw["varm"])
-                if len(varm_keys_raw) > 0:
-                    attrs_keys["varm"] = varm_keys_raw
-
-        super().__init__(
-            storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
-        )
-
-    @property
-    def raw(self):
-        raise AttributeError
-
-
-class AnnDataAccessor(_AnnDataAttrsMixin):
-    """Cloud-backed AnnData."""
-
-    def __init__(
-        self,
-        connection: OpenFile | None,
-        storage: StorageType,
-        filename: str,
-    ):
-        self._conn = connection
-        self.storage = storage
-
-        self._attrs_keys = registry.keys(self.storage)
-
-        self._name = filename
-
-        self._obs_names = _safer_read_index(self.storage["obs"])  # type: ignore
-        self._var_names = _safer_read_index(self.storage["var"])  # type: ignore
-
-        self._closed = False
-
-    def close(self):
-        """Closes the connection."""
-        if hasattr(self, "storage") and hasattr(self.storage, "close"):
-            self.storage.close()
-        if hasattr(self, "_conn") and hasattr(self._conn, "close"):
-            self._conn.close()
-        self._closed = True
-
-    @property
-    def closed(self):
-        return self._closed
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.close()
-
-    def __getitem__(self, index: Index) -> AnnDataAccessorSubset:
-        """Access a subset of the underlying AnnData object."""
-        oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)
-        new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]
-        return AnnDataAccessorSubset(
-            self.storage,
-            (oidx, vidx),
-            self._attrs_keys,
-            new_obs_names,
-            new_var_names,
-            self.shape,
-        )
-
-    def __repr__(self):
-        """Description of the AnnDataAccessor object."""
-        n_obs, n_vars = self.shape
-        descr = f"AnnDataAccessor object with n_obs × n_vars = {n_obs} × {n_vars}"
-        descr += f"\n constructed for the AnnData object {self._name}"
-        for attr, keys in self._attrs_keys.items():
-            descr += f"\n {attr}: {keys}"
-        return descr
-
-    @cached_property
-    def raw(self):
-        if "raw" not in self._attrs_keys:
-            return None
-        return AnnDataRawAccessor(
-            self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
-        )
+        SOMAType = soma.Collection
+    return SOMAType.open(filepath_str, mode=mode, context=ctx)
 
 
 @dataclass
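The `_track_writes_factory` helper added in the hunk above rewrites an instance's class at runtime: it builds a subclass of the object's own class whose `close` (and, if present, `__exit__`) first delegates to the original method and then runs a `finalize` callback exactly once, and it reassigns the instance's `__class__` to that subclass. Below is a condensed, self-contained sketch of the same pattern; `Store` and the finalize callback are hypothetical stand-ins, not part of lamindb.

```python
# Sketch of the close-tracking pattern: dynamically subclass the object's class
# and reassign the instance's __class__ so a finalize callback runs once on close().
# `Store` and the lambda below are hypothetical stand-ins.
from typing import Any, Callable


class Store:
    """A stand-in for a writable storage handle."""

    def close(self) -> None:
        print("store closed")


def track_close(obj: Any, finalize: Callable) -> Any:
    closed = False
    tracked_class = obj.__class__

    def close(self, *args, **kwargs):
        nonlocal closed
        tracked_class.close(self, *args, **kwargs)  # delegate to the original close
        if not closed:
            finalize()  # runs only once, even if close() is called repeatedly
            closed = True

    # dynamically created subclass, reassigned onto the existing instance
    obj.__class__ = type(tracked_class.__name__ + "Track", (tracked_class,), {"close": close})
    return obj


store = track_close(Store(), lambda: print("finalize: sync the modified store"))
store.close()  # prints "store closed", then the finalize message
store.close()  # prints "store closed" again, but finalize does not rerun
```

The remaining hunks of the same file adapt `backed_access` to the new helpers: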
@@ -739,7 +100,9 @@ class BackedAccessor:
 
 
 def backed_access(
-    artifact_or_filepath: Artifact |
+    artifact_or_filepath: Artifact | UPath,
+    mode: str = "r",
+    using_key: str | None = None,
 ) -> AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment:
     if isinstance(artifact_or_filepath, Artifact):
         filepath = filepath_from_artifact(artifact_or_filepath, using_key=using_key)
@@ -749,56 +112,23 @@ def backed_access(
     suffix = filepath.suffix
 
     if name == "soma" or suffix == ".tiledbsoma":
-
-
-
-            raise ImportError(
-                "Please install tiledbsoma: pip install tiledbsoma"
-            ) from e
-        filepath_str = filepath.as_posix()
-        if filepath.protocol == "s3":
-            from lamindb_setup.core._settings_storage import get_storage_region
-
-            region = get_storage_region(filepath_str)
-            tiledb_config = {"vfs.s3.region": region}
-            storage_options = filepath.storage_options
-            if "key" in storage_options:
-                tiledb_config["vfs.s3.aws_access_key_id"] = storage_options["key"]
-            if "secret" in storage_options:
-                tiledb_config["vfs.s3.aws_secret_access_key"] = storage_options[
-                    "secret"
-                ]
-            if "token" in storage_options:
-                tiledb_config["vfs.s3.aws_session_token"] = storage_options["token"]
-            ctx = soma.SOMATileDBContext(tiledb_config=tiledb_config)
-            # this is a strange bug
-            # for some reason iterdir futher gives incorrect results
-            # if cache is not invalidated
-            # instead of obs and ms it gives ms and ms in the list of names
-            filepath.fs.invalidate_cache()
-        else:
-            ctx = None
-
-        soma_objects = [obj.name for obj in filepath.iterdir()]
-        if "obs" in soma_objects and "ms" in soma_objects:
-            SOMAType = soma.Experiment
-        else:
-            SOMAType = soma.Collection
-        return SOMAType.open(filepath_str, context=ctx)
+        if mode not in {"r", "w"}:
+            raise ValueError("`mode` should be either 'r' or 'w' for tiledbsoma.")
+        return _open_tiledbsoma(filepath, mode=mode)  # type: ignore
     elif suffix in {".h5", ".hdf5", ".h5ad"}:
-        conn, storage = registry.open("h5py", filepath)
+        conn, storage = registry.open("h5py", filepath, mode=mode)
     elif suffix == ".zarr":
-        conn, storage = registry.open("zarr", filepath)
+        conn, storage = registry.open("zarr", filepath, mode=mode)
     else:
         raise ValueError(
             "object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix, not"
             f" {suffix}."
         )
 
-
+    is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
+    if is_anndata:
+        if mode != "r":
+            raise ValueError("Can only access `AnnData` with mode='r'.")
         return AnnDataAccessor(conn, storage, name)
     else:
-
-            return AnnDataAccessor(conn, storage, name)
-        else:
-            return BackedAccessor(conn, storage)
+        return BackedAccessor(conn, storage)
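Taken together, these two hunks thread the new `mode` argument through `backed_access`: tiledbsoma stores accept "r" or "w" and are opened via the new `_open_tiledbsoma` helper, h5/hdf5/h5ad and zarr handles receive the mode through `registry.open`, and `AnnData` access remains read-only. A minimal usage sketch follows, assuming a configured lamindb setup and the relevant backends installed; the paths are hypothetical and `backed_access` is an internal helper rather than a public API.

```python
# Sketch only: exercises the `mode` parameter introduced in 0.75.0.
# The S3 paths are hypothetical; s3fs/h5py (and tiledbsoma for the second call) are assumed.
from upath import UPath

from lamindb.core.storage._backed_access import backed_access

# AnnData files can only be opened read-only; mode="w" would raise a ValueError
adata = backed_access(UPath("s3://my-bucket/example.h5ad"), mode="r")

# tiledbsoma stores may be opened for reading or writing
experiment = backed_access(UPath("s3://my-bucket/example.tiledbsoma"), mode="w")
```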