lamindb 0.48a2__py3-none-any.whl → 0.48.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,23 +1,130 @@
1
+ import inspect
1
2
  from dataclasses import dataclass
2
3
  from functools import cached_property
3
- from typing import Dict, Mapping, Union
4
+ from itertools import chain
5
+ from pathlib import Path
6
+ from typing import Callable, Dict, Mapping, Union
4
7
 
5
8
  import h5py
9
+ import numpy as np
6
10
  import pandas as pd
7
11
  from anndata import AnnData
8
12
  from anndata._core.index import Index, _normalize_indices
9
13
  from anndata._core.sparse_dataset import SparseDataset
10
14
  from anndata._core.views import _resolve_idx
11
15
  from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5
12
- from anndata._io.specs.methods import read_indices
13
16
  from anndata._io.specs.registry import get_spec, read_elem, read_elem_partial
14
17
  from anndata.compat import _read_attr
15
18
  from fsspec.core import OpenFile
16
- from lamindb_setup.dev.upath import infer_filesystem
19
+ from lamindb_setup.dev.upath import UPath, infer_filesystem
17
20
  from lnschema_core import File
18
21
 
19
22
  from lamindb.dev.storage.file import filepath_from_file
20
23
 
24
+
25
def get_module_name(obj):
    """Return the top-level package name of the module that defines ``obj``.

    For example, an object defined in ``h5py._hl.dataset`` yields ``"h5py"``.

    ``inspect.getmodule`` returns ``None`` for instances of builtin types
    (tuples, slices, ints, ...) and for objects whose defining module is not
    present in ``sys.modules``. In those cases the name is taken from the
    ``__module__`` attribute of the object or of its type instead of failing
    with an ``AttributeError``.
    """
    module = inspect.getmodule(obj)
    if module is not None:
        qualified_name = module.__name__
    else:
        # Instances of builtin types expose no ``__module__`` themselves,
        # but their type always does.
        qualified_name = getattr(obj, "__module__", None) or type(obj).__module__
    # Keep only the part before the first dot, i.e. the top-level package.
    return qualified_name.partition(".")[0]
27
+
28
+
29
class Registry:
    """Dispatch storage-specific functions by the package of their arguments.

    Two tables are kept:

    * ``_openers`` maps a backend name (e.g. ``"h5py"``) to the function that
      opens a store with that backend; see ``register_open`` and ``open``.
    * ``_registry`` maps a function name to a ``{backend name: function}``
      dict; see ``register``. Accessing ``registry.<function name>`` returns a
      dispatcher that picks the implementation from the top-level package of
      the first argument that has a registered implementation.
    """

    def __init__(self):
        self._registry = {}
        self._openers = {}

    def register_open(self, module: str):
        """Return a decorator registering a function as opener for ``module``."""

        def wrapper(func: Callable):
            self._openers[module] = func
            return func

        return wrapper

    def open(self, module: str, *args, **kwargs):
        """Call the opener registered for ``module`` with the given arguments.

        Raises:
            ValueError: if no opener was registered for ``module``.
        """
        if module in self._openers:
            return self._openers[module](*args, **kwargs)
        else:
            raise ValueError(f"Module {module} not found, please install it.")

    def register(self, module: str):
        """Return a decorator registering a function for ``module``.

        The function is stored under its own ``__name__``, so several backends
        can provide an implementation of the same function name.
        """

        def wrapper(func: Callable):
            self._registry.setdefault(func.__name__, {})[module] = func
            return func

        return wrapper

    def __getattr__(self, func_name: str):
        """Return a dispatcher for the registered function ``func_name``.

        Only called when normal attribute lookup fails. Names that were never
        registered raise ``AttributeError`` so that ``hasattr`` and the
        protocol probes done by ``copy`` / ``pickle`` behave as usual.
        """
        # Go through __dict__ so a half-built instance (e.g. one being
        # reconstructed by copy) does not re-enter __getattr__.
        registered = self.__dict__.get("_registry", {})
        if func_name not in registered:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {func_name!r}"
            )
        func_registry = registered[func_name]

        def wrapper(*args, **kwargs):
            # The first argument whose package has an implementation decides.
            for arg in chain(args, kwargs.values()):
                arg_module = get_module_name(arg)
                if arg_module in func_registry:
                    return func_registry[arg_module](*args, **kwargs)
            raise ValueError(f"{func_name} is not registered for this module.")

        return wrapper
67
+
68
+
69
# Storage-specific functions should be registered on, and called through,
# this module-level registry instance.
registry = Registry()
71
+
72
+
73
@registry.register_open("h5py")
def open(filepath: Union[UPath, Path, str]):
    """Open ``filepath`` read-only with h5py.

    The path is resolved to a filesystem and a path string via
    ``infer_filesystem``; the file is opened in binary mode on that
    filesystem and handed to ``h5py.File``.

    Returns:
        A ``(conn, storage)`` pair: the open file object and the
        ``h5py.File`` built on top of it.

    Raises:
        Whatever ``h5py.File`` raises; the file object is closed first.
    """
    filesystem, path_in_fs = infer_filesystem(filepath)
    conn = filesystem.open(path_in_fs, mode="rb")
    try:
        return conn, h5py.File(conn, mode="r")
    except Exception as e:
        # Do not leave the underlying file object open if h5py rejects it.
        conn.close()
        raise e
83
+
84
+
85
@registry.register("h5py")
def read_dataframe(elem: Union[h5py.Dataset, h5py.Group]):
    """Read a dataframe element of an h5py-backed store.

    An ``h5py.Dataset`` goes through anndata's legacy h5ad dataframe reader;
    anything else is passed to ``read_elem``.
    """
    reader = read_dataframe_legacy_h5 if isinstance(elem, h5py.Dataset) else read_elem
    return reader(elem)
91
+
92
+
93
@registry.register("h5py")
def safer_read_partial(elem, indices):
    """Read the subset of an h5py element selected by ``indices``.

    Elements whose spec has an empty encoding type are indexed directly when
    they are an ``h5py.Dataset``; all other elements are delegated to
    ``read_elem_partial``.

    Raises:
        ValueError: if the element has an empty encoding type and is not an
            ``h5py.Dataset``.
    """
    if get_spec(elem).encoding_type != "":
        return read_elem_partial(elem, indices=indices)
    # The class is ``h5py.Dataset``; the former ``h5py.Datatset`` spelling
    # raised AttributeError before the check could run.
    if isinstance(elem, h5py.Dataset):
        return elem[indices]
    raise ValueError(
        "Can not get a subset of the element of type"
        f" {type(elem).__name__} with an empty spec."
    )
105
+
106
+
107
@registry.register("h5py")
def keys(storage: h5py.File):
    """Collect the keys of every top-level member of ``storage`` except ``X``.

    For ``obs`` / ``var`` stored as an ``h5py.Dataset`` the keys are the field
    names of its dtype; for every other member they are the member's own
    keys. Members without any key are left out.

    Returns:
        A dict mapping member name to its list of keys.
    """
    attrs_keys: Dict[str, list] = {}
    for attr in storage.keys():
        if attr != "X":
            member = storage[attr]
            stored_as_dataset = attr in ("obs", "var") and isinstance(
                member, h5py.Dataset
            )
            if stored_as_dataset:
                names = list(member.dtype.fields.keys())
            else:
                names = list(member.keys())
            if names:
                attrs_keys[attr] = names
    return attrs_keys
121
+
122
+
123
# Concrete array / group / file classes of the supported backends.
# They start as lists holding the h5py classes so that further backends can
# append their own classes before the lists are frozen into tuples.
ArrayTypes = [h5py.Dataset]
GroupTypes = [h5py.Group]
StorageTypes = [h5py.File]
126
+
127
+
21
128
  ZARR_INSTALLED = False
22
129
  try:
23
130
  import zarr
@@ -27,44 +134,30 @@ except ImportError:
27
134
  pass
28
135
 
29
136
  if ZARR_INSTALLED:
137
+ from anndata._io.zarr import read_dataframe_legacy as read_dataframe_legacy_zarr
30
138
 
31
- def _read_dataframe(elem: Union[zarr.Array, h5py.Dataset, zarr.Group, h5py.Group]):
32
- if isinstance(elem, zarr.Array):
33
- from anndata._io.zarr import (
34
- read_dataframe_legacy as read_dataframe_legacy_zarr,
35
- )
139
+ ArrayTypes.append(zarr.Array)
140
+ GroupTypes.append(zarr.Group)
141
+ StorageTypes.append(zarr.Group)
142
+
143
@registry.register_open("zarr")
def open(filepath: Union[UPath, Path, str]):  # noqa
    """Open ``filepath`` read-only with zarr.

    The path is resolved via ``infer_filesystem`` and wrapped in a mapper
    (created with ``check=True``) that is passed to ``zarr.open``.

    Returns:
        A ``(conn, storage)`` pair where ``conn`` is always ``None`` and
        ``storage`` is the object returned by ``zarr.open``.
    """
    filesystem, path_in_fs = infer_filesystem(filepath)
    store = filesystem.get_mapper(path_in_fs, check=True)
    # No separate file object is opened for zarr, hence no connection.
    return None, zarr.open(store, mode="r")
36
149
 
150
@registry.register("zarr")
def read_dataframe(elem: Union[zarr.Array, zarr.Group]):  # noqa
    """Read a dataframe element of a zarr-backed store.

    A ``zarr.Array`` goes through anndata's legacy zarr dataframe reader;
    anything else is passed to ``read_elem``.
    """
    reader = read_dataframe_legacy_zarr if isinstance(elem, zarr.Array) else read_elem
    return reader(elem)
42
156
 
43
- def _to_memory(elem):
44
- if isinstance(elem, (h5py.Dataset, zarr.Array, SparseDataset)):
45
- return elem[()]
46
- else:
47
- return elem
48
-
49
- def _try_backed_full(elem):
50
- # think what to do for compatibility with old var and obs
51
- if isinstance(elem, (h5py.Dataset, zarr.Array)):
52
- return elem
53
-
54
- if isinstance(elem, (h5py.Group, zarr.Group)):
55
- encoding_type = get_spec(elem).encoding_type
56
- if encoding_type in ("csr_matrix", "csc_matrix"):
57
- return SparseDataset(elem)
58
- if "h5sparse_format" in elem.attrs:
59
- return SparseDataset(elem)
60
-
61
- return read_elem(elem)
62
-
63
- def _safer_read_partial(elem, indices):
157
+ @registry.register("zarr")
158
+ def safer_read_partial(elem, indices): # noqa
64
159
  if get_spec(elem).encoding_type == "":
65
- if isinstance(elem, h5py.Datatset):
66
- return elem[indices]
67
- elif isinstance(elem, zarr.Array):
160
+ if isinstance(elem, zarr.Array):
68
161
  return elem.oindex[indices]
69
162
  else:
70
163
  raise ValueError(
@@ -74,302 +167,9 @@ if ZARR_INSTALLED:
74
167
  else:
75
168
  return read_elem_partial(elem, indices=indices)
76
169
 
77
- class _MapAccessor:
78
- def __init__(self, elem, name, indices=None):
79
- self.elem = elem
80
- self.indices = indices
81
- self.name = name
82
-
83
- def __getitem__(self, key):
84
- if self.indices is None:
85
- return _try_backed_full(self.elem[key])
86
- else:
87
- return _safer_read_partial(self.elem[key], indices=self.indices)
88
-
89
- def keys(self):
90
- return list(self.elem.keys())
91
-
92
- def __repr__(self):
93
- """Description of the _MapAccessor object."""
94
- descr = f"Accessor for the AnnData attribute {self.name}"
95
- descr += f"\n with keys: {self.keys()}"
96
- return descr
97
-
98
- class _AnnDataAttrsMixin:
99
- storage: Union[h5py.File, zarr.Group]
100
- _attrs_keys: Mapping[str, list]
101
-
102
- @cached_property
103
- def obs(self) -> pd.DataFrame:
104
- if "obs" not in self._attrs_keys:
105
- return None
106
- indices = getattr(self, "indices", None)
107
- if indices is not None:
108
- indices = (indices[0], slice(None))
109
- return _safer_read_partial(self.storage["obs"], indices=indices)
110
- else:
111
- return _read_dataframe(self.storage["obs"])
112
-
113
- @cached_property
114
- def var(self) -> pd.DataFrame:
115
- if "var" not in self._attrs_keys:
116
- return None
117
- indices = getattr(self, "indices", None)
118
- if indices is not None:
119
- indices = (indices[1], slice(None))
120
- return _safer_read_partial(self.storage["var"], indices=indices)
121
- else:
122
- return _read_dataframe(self.storage["var"])
123
-
124
- @cached_property
125
- def uns(self):
126
- if "uns" not in self._attrs_keys:
127
- return None
128
- return read_elem(self.storage["uns"])
129
-
130
- @cached_property
131
- def X(self):
132
- indices = getattr(self, "indices", None)
133
- if indices is not None:
134
- return _safer_read_partial(self.storage["X"], indices=indices)
135
- else:
136
- return _try_backed_full(self.storage["X"])
137
-
138
- @cached_property
139
- def obsm(self):
140
- if "obsm" not in self._attrs_keys:
141
- return None
142
- indices = getattr(self, "indices", None)
143
- if indices is not None:
144
- indices = (indices[0], slice(None))
145
- return _MapAccessor(self.storage["obsm"], "obsm", indices)
146
-
147
- @cached_property
148
- def varm(self):
149
- if "varm" not in self._attrs_keys:
150
- return None
151
- indices = getattr(self, "indices", None)
152
- if indices is not None:
153
- indices = (indices[1], slice(None))
154
- return _MapAccessor(self.storage["varm"], "varm", indices)
155
-
156
- @cached_property
157
- def obsp(self):
158
- if "obsp" not in self._attrs_keys:
159
- return None
160
- indices = getattr(self, "indices", None)
161
- if indices is not None:
162
- indices = (indices[0], indices[0])
163
- return _MapAccessor(self.storage["obsp"], "obsp", indices)
164
-
165
- @cached_property
166
- def varp(self):
167
- if "varp" not in self._attrs_keys:
168
- return None
169
- indices = getattr(self, "indices", None)
170
- if indices is not None:
171
- indices = (indices[1], indices[1])
172
- return _MapAccessor(self.storage["varp"], "varp", indices)
173
-
174
- @cached_property
175
- def layers(self):
176
- if "layers" not in self._attrs_keys:
177
- return None
178
- indices = getattr(self, "indices", None)
179
- return _MapAccessor(self.storage["layers"], "layers", indices)
180
-
181
- @property
182
- def obs_names(self):
183
- return self._obs_names
184
-
185
- @property
186
- def var_names(self):
187
- return self._var_names
188
-
189
- @cached_property
190
- def shape(self):
191
- return len(self._obs_names), len(self._var_names)
192
-
193
- def to_dict(self):
194
- prepare_adata = {}
195
-
196
- prepare_adata["X"] = _to_memory(self.X)
197
-
198
- if "uns" in self._attrs_keys:
199
- prepare_adata["uns"] = self.uns
200
-
201
- for attr in ("obs", "var"):
202
- if attr in self._attrs_keys:
203
- prepare_adata[attr] = getattr(self, attr)
204
-
205
- for attr in ("obsm", "varm", "obsp", "varp", "layers"):
206
- if attr in self._attrs_keys:
207
- prepare_adata[attr] = {}
208
- get_attr = getattr(self, attr)
209
- for key in self._attrs_keys[attr]:
210
- prepare_adata[attr][key] = _to_memory(get_attr[key])
211
-
212
- if "raw" in self._attrs_keys:
213
- prepare_adata["raw"] = self.raw.to_dict()
214
-
215
- return prepare_adata
216
-
217
- def to_memory(self):
218
- adata = AnnData(**self.to_dict())
219
- return adata
220
-
221
- class AnnDataAccessorSubset(_AnnDataAttrsMixin):
222
- def __init__(
223
- self, storage, indices, attrs_keys, obs_names, var_names, ref_shape
224
- ):
225
- self.storage = storage
226
- self.indices = indices
227
-
228
- self._attrs_keys = attrs_keys
229
- self._obs_names, self._var_names = obs_names, var_names
230
-
231
- self._ref_shape = ref_shape
232
-
233
- def __getitem__(self, index: Index):
234
- """Access a subset of the underlying AnnData object."""
235
- oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)
236
- new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]
237
- if self.indices is not None:
238
- oidx = _resolve_idx(self.indices[0], oidx, self._ref_shape[0])
239
- vidx = _resolve_idx(self.indices[1], vidx, self._ref_shape[1])
240
- return type(self)(
241
- self.storage,
242
- (oidx, vidx),
243
- self._attrs_keys,
244
- new_obs_names,
245
- new_var_names,
246
- self._ref_shape,
247
- )
248
-
249
- def __repr__(self):
250
- """Description of the object."""
251
- n_obs, n_vars = self.shape
252
- descr = (
253
- f"{type(self).__name__} object with n_obs × n_vars = {n_obs} × {n_vars}"
254
- )
255
- for attr, keys in self._attrs_keys.items():
256
- descr += f"\n {attr}: {keys}"
257
- return descr
258
-
259
- @cached_property
260
- def raw(self):
261
- if "raw" not in self._attrs_keys:
262
- return None
263
- prepare_indices = None
264
- if self.indices is not None:
265
- oidx = self.indices[0]
266
- if oidx != slice(None):
267
- prepare_indices = oidx, slice(None)
268
- return AnnDataRawAccessor(
269
- self.storage["raw"],
270
- prepare_indices,
271
- None,
272
- self._obs_names,
273
- None,
274
- self._ref_shape[0],
275
- )
276
-
277
- class AnnDataRawAccessor(AnnDataAccessorSubset):
278
- def __init__(
279
- self, storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
280
- ):
281
- var_raw = storage_raw["var"]
282
-
283
- if var_names is None:
284
- var_names = read_elem(var_raw[_read_attr(var_raw.attrs, "_index")])
285
-
286
- if isinstance(ref_shape, int):
287
- ref_shape = ref_shape, len(var_names)
288
- elif isinstance(ref_shape, tuple) and len(ref_shape) < 2:
289
- ref_shape = ref_shape[0], len(var_names)
290
-
291
- if attrs_keys is None:
292
- attrs_keys = {}
293
- if isinstance(var_raw, (h5py.Dataset, zarr.Array)):
294
- attrs_keys["var"] = list(var_raw.dtype.fields.keys())
295
- else:
296
- # for some reason list(var_raw.keys()) is very slow for zarr
297
- # maybe also directly get keys from the underlying mapper
298
- attrs_keys["var"] = [key for key in var_raw]
299
- if "varm" in storage_raw:
300
- varm_keys_raw = [key for key in storage_raw["varm"]]
301
- if len(varm_keys_raw) > 0:
302
- attrs_keys["varm"] = varm_keys_raw
303
-
304
- super().__init__(
305
- storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
306
- )
307
-
308
- @property
309
- def raw(self):
310
- raise AttributeError
311
-
312
- class AnnDataAccessor(_AnnDataAttrsMixin):
313
- """Cloud-backed AnnData."""
314
-
315
- def __init__(
316
- self,
317
- connection: Union[OpenFile, None],
318
- storage: Union[h5py.File, zarr.Group],
319
- filename: str,
320
- ):
321
- self._conn = connection
322
- self.storage = storage
323
-
324
- if isinstance(self.storage, h5py.File):
325
- self._attrs_keys = _keys_h5(self.storage)
326
- elif isinstance(self.storage, zarr.Group):
327
- self._attrs_keys = _keys_zarr(self.storage)
328
- else:
329
- raise ValueError("Unknown type of storage.")
330
-
331
- self._name = filename
332
-
333
- self._obs_names, self._var_names = read_indices(self.storage)
334
-
335
- def __del__(self):
336
- """Closes the connection."""
337
- if self._conn is not None:
338
- self.storage.close()
339
- self._conn.close()
340
-
341
- def __getitem__(self, index: Index) -> AnnDataAccessorSubset:
342
- """Access a subset of the underlying AnnData object."""
343
- oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)
344
- new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]
345
- return AnnDataAccessorSubset(
346
- self.storage,
347
- (oidx, vidx),
348
- self._attrs_keys,
349
- new_obs_names,
350
- new_var_names,
351
- self.shape,
352
- )
353
-
354
- def __repr__(self):
355
- """Description of the AnnDataAccessor object."""
356
- n_obs, n_vars = self.shape
357
- descr = f"AnnDataAccessor object with n_obs × n_vars = {n_obs} × {n_vars}"
358
- descr += f"\n constructed for the AnnData object {self._name}"
359
- for attr, keys in self._attrs_keys.items():
360
- descr += f"\n {attr}: {keys}"
361
- return descr
362
-
363
- @cached_property
364
- def raw(self):
365
- if "raw" not in self._attrs_keys:
366
- return None
367
- return AnnDataRawAccessor(
368
- self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
369
- )
370
-
371
170
  # this is needed because accessing zarr.Group.keys() directly is very slow
372
- def _keys_zarr(storage: zarr.Group):
171
+ @registry.register("zarr")
172
+ def keys(storage: zarr.Group): # noqa
373
173
  paths = storage._store.keys()
374
174
 
375
175
  attrs_keys: Dict[str, list] = {}
@@ -408,58 +208,384 @@ if ZARR_INSTALLED:
408
208
 
409
209
  return {attr: keys for attr, keys in attrs_keys.items() if len(keys) > 0}
410
210
 
411
- def _keys_h5(storage: h5py.File):
412
- attrs_keys: Dict[str, list] = {}
413
- for attr in storage.keys():
414
- if attr == "X":
415
- continue
416
- attr_obj = storage[attr]
417
- if attr in ("obs", "var") and isinstance(attr_obj, h5py.Dataset):
418
- keys = list(attr_obj.dtype.fields.keys())
419
- else:
420
- keys = list(attr_obj.keys())
421
- if len(keys) > 0:
422
- attrs_keys[attr] = keys
423
- return attrs_keys
424
-
425
- @dataclass
426
- class BackedAccessor:
427
- """h5py.File or zarr.Group accessor."""
428
-
429
- connection: OpenFile
430
- """The connection."""
431
- storage: Union[h5py.File, zarr.Group]
432
- """The storage access."""
433
-
434
- def backed_access(file_or_filepath: File) -> Union[AnnDataAccessor, BackedAccessor]:
435
- if isinstance(file_or_filepath, File):
436
- filepath = filepath_from_file(file_or_filepath)
437
- name = File.key
438
- else:
439
- filepath = file_or_filepath
440
- name = filepath.name
441
- fs, file_path_str = infer_filesystem(filepath)
442
211
 
443
- if filepath.suffix in (".h5", ".hdf5", ".h5ad"):
444
- conn = fs.open(file_path_str, mode="rb")
445
- try:
446
- storage = h5py.File(conn, mode="r")
447
- except Exception as e:
448
- conn.close()
449
- raise e
450
- elif filepath.suffix in (".zarr", ".zrad"):
451
- conn = None
452
- storage = zarr.open(fs.get_mapper(file_path_str, check=True), mode="r")
212
# Freeze the per-backend class lists into tuples so they can be passed
# directly to isinstance().
ArrayTypes = tuple(ArrayTypes)  # type: ignore
GroupTypes = tuple(GroupTypes)  # type: ignore
StorageTypes = tuple(StorageTypes)  # type: ignore

# Array classes plus anndata's SparseDataset.
ArrayOrSparseTypes = ArrayTypes + (SparseDataset,)  # type: ignore


# Typing aliases built from the tuples above; subscripting Union with a tuple
# is the same as listing its members.
ArrayType = Union[ArrayTypes]  # type: ignore
GroupType = Union[GroupTypes]  # type: ignore
StorageType = Union[StorageTypes]  # type: ignore
222
+
223
+
224
def _to_memory(elem):
    """Return ``elem[()]`` for backed array or sparse types, else ``elem`` itself."""
    return elem[()] if isinstance(elem, ArrayOrSparseTypes) else elem
229
+
230
+
231
def _try_backed_full(elem):
    """Return a backed view of ``elem`` where possible, else read it.

    * array types are returned unchanged;
    * group types whose encoding type is ``csr_matrix`` / ``csc_matrix``, or
      whose attrs contain ``h5sparse_format``, are wrapped in ``SparseDataset``;
    * everything else is passed to ``read_elem``.
    """
    # think what to do for compatibility with old var and obs
    if isinstance(elem, ArrayTypes):
        return elem

    if isinstance(elem, GroupTypes):
        looks_sparse = (
            get_spec(elem).encoding_type in ("csr_matrix", "csc_matrix")
            or "h5sparse_format" in elem.attrs
        )
        if looks_sparse:
            return SparseDataset(elem)

    return read_elem(elem)
244
+
245
+
246
def _safer_read_index(elem):
    """Read the index (row labels) of a stored ``obs`` / ``var`` element.

    * Group types: the name of the index is read from the ``_index``
      attribute and that member is passed to ``read_elem``.
    * Array types: the index is the ``index`` field of the array, or the
      ``_index`` field if there is no ``index`` field. Values that are
      ``bytes`` are decoded as UTF-8.

    Raises:
        ValueError: if an array element has neither field (including arrays
            whose dtype has no fields at all), if the index found is empty,
            or if ``elem`` is neither a group nor an array type.
    """
    if isinstance(elem, GroupTypes):
        return read_elem(elem[_read_attr(elem.attrs, "_index")])

    if isinstance(elem, ArrayTypes):
        # dtype.names is None for non-structured dtypes; treat that as
        # "no fields" so the lookup ends in the ValueError below rather
        # than a TypeError from ``in None``.
        field_names = elem.dtype.names or ()
        indices = None
        for index_name in ("index", "_index"):
            if index_name in field_names:
                indices = elem[index_name]
                break
        # An empty index is reported the same way as a missing one.
        if indices is None or len(indices) == 0:
            raise ValueError("Indices not found.")
        if isinstance(indices[0], bytes):
            indices = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)(indices)
        return indices

    raise ValueError(f"Unknown elem type {type(elem)} when reading indices.")
458
263
 
459
- if filepath.suffix in (".h5ad", ".zrad"):
460
- return AnnDataAccessor(conn, storage, name)
264
+
265
+ class _MapAccessor:
266
+ def __init__(self, elem, name, indices=None):
267
+ self.elem = elem
268
+ self.indices = indices
269
+ self.name = name
270
+
271
+ def __getitem__(self, key):
272
+ if self.indices is None:
273
+ return _try_backed_full(self.elem[key])
274
+ else:
275
+ return registry.safer_read_partial(self.elem[key], indices=self.indices)
276
+
277
+ def keys(self):
278
+ return list(self.elem.keys())
279
+
280
+ def __repr__(self):
281
+ """Description of the _MapAccessor object."""
282
+ descr = f"Accessor for the AnnData attribute {self.name}"
283
+ descr += f"\n with keys: {self.keys()}"
284
+ return descr
285
+
286
+
287
class _AnnDataAttrsMixin:
    """Cached readers for the attributes of a backed AnnData store.

    Classes using the mixin provide ``storage``, ``_attrs_keys``,
    ``_obs_names``, ``_var_names`` and a ``raw`` attribute. If the instance
    has an ``indices`` attribute that is not ``None`` it is used as an
    ``(obs selection, var selection)`` pair restricting what is read.
    Attributes missing from ``_attrs_keys`` are reported as ``None``.
    """

    storage: StorageType
    _attrs_keys: Mapping[str, list]

    @cached_property
    def obs(self) -> pd.DataFrame:
        """The ``obs`` dataframe, restricted to the obs selection if any."""
        if "obs" not in self._attrs_keys:
            return None
        selection = getattr(self, "indices", None)
        if selection is None:
            return registry.read_dataframe(self.storage["obs"])  # type: ignore
        rows = (selection[0], slice(None))
        return registry.safer_read_partial(self.storage["obs"], indices=rows)  # type: ignore # noqa

    @cached_property
    def var(self) -> pd.DataFrame:
        """The ``var`` dataframe, restricted to the var selection if any."""
        if "var" not in self._attrs_keys:
            return None
        selection = getattr(self, "indices", None)
        if selection is None:
            return registry.read_dataframe(self.storage["var"])  # type: ignore
        rows = (selection[1], slice(None))
        return registry.safer_read_partial(self.storage["var"], indices=rows)  # type: ignore # noqa

    @cached_property
    def uns(self):
        """The ``uns`` element as returned by ``read_elem``."""
        if "uns" not in self._attrs_keys:
            return None
        return read_elem(self.storage["uns"])

    @cached_property
    def X(self):
        """``X``: the selected part if there is a selection, else backed in full."""
        selection = getattr(self, "indices", None)
        if selection is None:
            return _try_backed_full(self.storage["X"])
        return registry.safer_read_partial(self.storage["X"], indices=selection)

    @cached_property
    def obsm(self):
        """Accessor for ``obsm``; members are selected along obs."""
        if "obsm" not in self._attrs_keys:
            return None
        selection = getattr(self, "indices", None)
        if selection is not None:
            selection = (selection[0], slice(None))
        return _MapAccessor(self.storage["obsm"], "obsm", selection)

    @cached_property
    def varm(self):
        """Accessor for ``varm``; members are selected along var."""
        if "varm" not in self._attrs_keys:
            return None
        selection = getattr(self, "indices", None)
        if selection is not None:
            selection = (selection[1], slice(None))
        return _MapAccessor(self.storage["varm"], "varm", selection)

    @cached_property
    def obsp(self):
        """Accessor for ``obsp``; the obs selection is applied on both axes."""
        if "obsp" not in self._attrs_keys:
            return None
        selection = getattr(self, "indices", None)
        if selection is not None:
            selection = (selection[0], selection[0])
        return _MapAccessor(self.storage["obsp"], "obsp", selection)

    @cached_property
    def varp(self):
        """Accessor for ``varp``; the var selection is applied on both axes."""
        if "varp" not in self._attrs_keys:
            return None
        selection = getattr(self, "indices", None)
        if selection is not None:
            selection = (selection[1], selection[1])
        return _MapAccessor(self.storage["varp"], "varp", selection)

    @cached_property
    def layers(self):
        """Accessor for ``layers``; the full selection is passed through."""
        if "layers" not in self._attrs_keys:
            return None
        selection = getattr(self, "indices", None)
        return _MapAccessor(self.storage["layers"], "layers", selection)

    @property
    def obs_names(self):
        """The observation names held by this accessor."""
        return self._obs_names

    @property
    def var_names(self):
        """The variable names held by this accessor."""
        return self._var_names

    @cached_property
    def shape(self):
        """``(number of obs names, number of var names)``."""
        return len(self._obs_names), len(self._var_names)

    def to_dict(self):
        """Gather the available attributes into a dict, via ``_to_memory``.

        Keys are added in the order ``X``, ``uns``, ``obs``, ``var``, the
        mapping attributes, ``raw``; attributes absent from ``_attrs_keys``
        (other than ``X``) are left out.
        """
        present = self._attrs_keys
        prepare_adata = {"X": _to_memory(self.X)}

        if "uns" in present:
            prepare_adata["uns"] = self.uns

        for attr in ("obs", "var"):
            if attr in present:
                prepare_adata[attr] = getattr(self, attr)

        for attr in ("obsm", "varm", "obsp", "varp", "layers"):
            if attr in present:
                accessor = getattr(self, attr)
                prepare_adata[attr] = {
                    key: _to_memory(accessor[key]) for key in present[attr]
                }

        if "raw" in present:
            prepare_adata["raw"] = self.raw.to_dict()

        return prepare_adata

    def to_memory(self):
        """Build an ``AnnData`` object from ``to_dict()``."""
        return AnnData(**self.to_dict())
409
+
410
+
411
class AnnDataAccessorSubset(_AnnDataAttrsMixin):
    """A lazily evaluated subset of a backed AnnData store.

    Args:
        storage: the opened store the subset reads from.
        indices: ``(obs selection, var selection)`` into the store, or ``None``.
        attrs_keys: mapping of attribute name to the keys available in it.
        obs_names: observation names of this subset.
        var_names: variable names of this subset.
        ref_shape: shape that ``indices`` refer to; used to resolve a further
            selection against the current one.
    """

    def __init__(self, storage, indices, attrs_keys, obs_names, var_names, ref_shape):
        self.storage = storage
        self.indices = indices

        self._attrs_keys = attrs_keys
        self._obs_names, self._var_names = obs_names, var_names

        self._ref_shape = ref_shape

    def __getitem__(self, index: Index):
        """Access a subset of the underlying AnnData object."""
        oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)
        new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]
        if self.indices is not None:
            # Express the new selection in terms of the reference shape.
            oidx = _resolve_idx(self.indices[0], oidx, self._ref_shape[0])
            vidx = _resolve_idx(self.indices[1], vidx, self._ref_shape[1])
        return type(self)(
            self.storage,
            (oidx, vidx),
            self._attrs_keys,
            new_obs_names,
            new_var_names,
            self._ref_shape,
        )

    def __repr__(self):
        """Description of the object."""
        n_obs, n_vars = self.shape
        descr = f"{type(self).__name__} object with n_obs × n_vars = {n_obs} × {n_vars}"
        for attr, keys in self._attrs_keys.items():
            descr += f"\n {attr}: {keys}"
        return descr

    @cached_property
    def raw(self):
        """Accessor for ``raw`` restricted to the obs selection, or ``None``.

        The var selection is not carried over; only the obs part of
        ``indices`` is passed on, and only if it is not the full slice.
        """
        if "raw" not in self._attrs_keys:
            return None
        prepare_indices = None
        if self.indices is not None:
            oidx = self.indices[0]
            # Compare against slice(None) only for real slices: with a NumPy
            # array the comparison is element-wise and its truth value is
            # ambiguous, which made ``if oidx != slice(None)`` raise.
            if not isinstance(oidx, slice) or oidx != slice(None):
                prepare_indices = oidx, slice(None)
        return AnnDataRawAccessor(
            self.storage["raw"],
            prepare_indices,
            None,
            self._obs_names,
            None,
            self._ref_shape[0],
        )
462
+
463
+
464
class AnnDataRawAccessor(AnnDataAccessorSubset):
    """Accessor for the ``raw`` group of a backed AnnData store.

    Args:
        storage_raw: the ``raw`` group; must contain ``var``.
        indices: ``(obs selection, var selection)`` or ``None``.
        attrs_keys: attribute keys; derived from ``storage_raw`` when ``None``.
        obs_names: observation names.
        var_names: variable names; read from ``storage_raw["var"]`` when ``None``.
        ref_shape: reference shape; an int or a 1-tuple is completed with the
            number of variable names.
    """

    def __init__(
        self, storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
    ):
        var_raw = storage_raw["var"]

        if var_names is None:
            var_names = _safer_read_index(var_raw)

        if isinstance(ref_shape, int):
            ref_shape = ref_shape, len(var_names)
        elif isinstance(ref_shape, tuple) and len(ref_shape) < 2:
            ref_shape = ref_shape[0], len(var_names)

        if attrs_keys is None:
            attrs_keys = {}
            # Use the shared ArrayTypes tuple instead of naming zarr.Array:
            # the zarr import is optional, so ``zarr`` may be undefined here.
            if isinstance(var_raw, ArrayTypes):
                attrs_keys["var"] = list(var_raw.dtype.fields.keys())
            else:
                # for some reason list(var_raw.keys()) is very slow for zarr
                # maybe also directly get keys from the underlying mapper
                attrs_keys["var"] = [key for key in var_raw]
            if "varm" in storage_raw:
                varm_keys_raw = [key for key in storage_raw["varm"]]
                if len(varm_keys_raw) > 0:
                    attrs_keys["varm"] = varm_keys_raw

        super().__init__(
            storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
        )

    @property
    def raw(self):
        """Always raises ``AttributeError``; overrides the inherited ``raw``."""
        raise AttributeError
498
+
499
+
500
class AnnDataAccessor(_AnnDataAttrsMixin):
    """Cloud-backed AnnData."""

    def __init__(
        self,
        connection: Union[OpenFile, None],
        storage: StorageType,
        filename: str,
    ):
        """Wrap an opened store.

        Args:
            connection: the open file object behind ``storage``, or ``None``.
            storage: the opened store.
            filename: name shown in the repr.
        """
        self._conn = connection
        self.storage = storage

        # Keys of every attribute present, via the backend-specific reader.
        self._attrs_keys = registry.keys(self.storage)

        self._name = filename

        obs_elem, var_elem = self.storage["obs"], self.storage["var"]  # type: ignore
        self._obs_names = _safer_read_index(obs_elem)
        self._var_names = _safer_read_index(var_elem)

    def __del__(self):
        """Closes the connection."""
        if self._conn is None:
            return
        self.storage.close()
        self._conn.close()

    def __getitem__(self, index: Index) -> AnnDataAccessorSubset:
        """Access a subset of the underlying AnnData object."""
        obs_sel, var_sel = _normalize_indices(index, self._obs_names, self._var_names)
        subset_obs_names = self._obs_names[obs_sel]
        subset_var_names = self._var_names[var_sel]
        return AnnDataAccessorSubset(
            self.storage,
            (obs_sel, var_sel),
            self._attrs_keys,
            subset_obs_names,
            subset_var_names,
            self.shape,
        )

    def __repr__(self):
        """Description of the AnnDataAccessor object."""
        n_obs, n_vars = self.shape
        parts = [
            f"AnnDataAccessor object with n_obs × n_vars = {n_obs} × {n_vars}",
            f"\n constructed for the AnnData object {self._name}",
        ]
        parts.extend(f"\n {attr}: {keys}" for attr, keys in self._attrs_keys.items())
        return "".join(parts)

    @cached_property
    def raw(self):
        """Accessor for the ``raw`` group, or ``None`` if there is none."""
        if "raw" in self._attrs_keys:
            return AnnDataRawAccessor(
                self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
            )
        return None
554
+
555
+
556
@dataclass
class BackedAccessor:
    """h5py.File or zarr.Group accessor.

    Plain pair of the objects obtained when opening a store that is not
    wrapped in an ``AnnDataAccessor``.
    """

    # The open file object behind ``storage``.
    # NOTE(review): the zarr opener yields ``None`` here — confirm whether
    # the annotation should allow it.
    connection: OpenFile
    """The connection."""
    # The opened h5py / zarr storage object.
    storage: StorageType
    """The storage access."""
564
+
565
+
566
def backed_access(
    file_or_filepath: Union[File, Path]
) -> Union[AnnDataAccessor, BackedAccessor]:
    """Open a stored file for backed access.

    A ``File`` record is first resolved to its path with
    ``filepath_from_file``. The backend is chosen from the suffix:
    ``.h5`` / ``.hdf5`` / ``.h5ad`` open with h5py, ``.zarr`` / ``.zrad``
    with zarr.

    Returns:
        An ``AnnDataAccessor`` for ``.h5ad`` / ``.zrad`` files and for stores
        whose spec has encoding type ``"anndata"``; otherwise a
        ``BackedAccessor``.

    Raises:
        ValueError: for any other suffix, or (from the registry) when no
            opener is registered for the chosen backend.
    """
    filepath = (
        filepath_from_file(file_or_filepath)
        if isinstance(file_or_filepath, File)
        else file_or_filepath
    )
    name = filepath.name
    suffix = filepath.suffix

    if suffix in (".h5", ".hdf5", ".h5ad"):
        backend = "h5py"
    elif suffix in (".zarr", ".zrad"):
        backend = "zarr"
    else:
        raise ValueError(
            "file should have .h5, .hdf5, .h5ad, .zarr or .zrad suffix, not"
            f" {filepath.suffix}."
        )
    conn, storage = registry.open(backend, filepath)

    # The AnnData-specific suffixes skip the spec lookup entirely.
    holds_anndata = (
        suffix in (".h5ad", ".zrad") or get_spec(storage).encoding_type == "anndata"
    )
    if holds_anndata:
        return AnnDataAccessor(conn, storage, name)
    return BackedAccessor(conn, storage)