lamindb 0.76.8__py3-none-any.whl → 0.76.9__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (61)
  1. lamindb/__init__.py +113 -113
  2. lamindb/_artifact.py +1205 -1205
  3. lamindb/_can_validate.py +579 -579
  4. lamindb/_collection.py +389 -387
  5. lamindb/_curate.py +1601 -1601
  6. lamindb/_feature.py +155 -155
  7. lamindb/_feature_set.py +242 -242
  8. lamindb/_filter.py +23 -23
  9. lamindb/_finish.py +256 -256
  10. lamindb/_from_values.py +382 -382
  11. lamindb/_is_versioned.py +40 -40
  12. lamindb/_parents.py +476 -476
  13. lamindb/_query_manager.py +125 -125
  14. lamindb/_query_set.py +362 -362
  15. lamindb/_record.py +649 -649
  16. lamindb/_run.py +57 -57
  17. lamindb/_save.py +308 -308
  18. lamindb/_storage.py +14 -14
  19. lamindb/_transform.py +127 -127
  20. lamindb/_ulabel.py +56 -56
  21. lamindb/_utils.py +9 -9
  22. lamindb/_view.py +72 -72
  23. lamindb/core/__init__.py +94 -94
  24. lamindb/core/_context.py +574 -574
  25. lamindb/core/_data.py +438 -438
  26. lamindb/core/_feature_manager.py +867 -867
  27. lamindb/core/_label_manager.py +253 -253
  28. lamindb/core/_mapped_collection.py +631 -597
  29. lamindb/core/_settings.py +187 -187
  30. lamindb/core/_sync_git.py +138 -138
  31. lamindb/core/_track_environment.py +27 -27
  32. lamindb/core/datasets/__init__.py +59 -59
  33. lamindb/core/datasets/_core.py +581 -571
  34. lamindb/core/datasets/_fake.py +36 -36
  35. lamindb/core/exceptions.py +90 -90
  36. lamindb/core/fields.py +12 -12
  37. lamindb/core/loaders.py +164 -164
  38. lamindb/core/schema.py +56 -56
  39. lamindb/core/storage/__init__.py +25 -25
  40. lamindb/core/storage/_anndata_accessor.py +740 -740
  41. lamindb/core/storage/_anndata_sizes.py +41 -41
  42. lamindb/core/storage/_backed_access.py +98 -98
  43. lamindb/core/storage/_tiledbsoma.py +204 -204
  44. lamindb/core/storage/_valid_suffixes.py +21 -21
  45. lamindb/core/storage/_zarr.py +110 -110
  46. lamindb/core/storage/objects.py +62 -62
  47. lamindb/core/storage/paths.py +172 -172
  48. lamindb/core/subsettings/__init__.py +12 -12
  49. lamindb/core/subsettings/_creation_settings.py +38 -38
  50. lamindb/core/subsettings/_transform_settings.py +21 -21
  51. lamindb/core/types.py +19 -19
  52. lamindb/core/versioning.py +158 -158
  53. lamindb/integrations/__init__.py +12 -12
  54. lamindb/integrations/_vitessce.py +107 -107
  55. lamindb/setup/__init__.py +14 -14
  56. lamindb/setup/core/__init__.py +4 -4
  57. {lamindb-0.76.8.dist-info → lamindb-0.76.9.dist-info}/LICENSE +201 -201
  58. {lamindb-0.76.8.dist-info → lamindb-0.76.9.dist-info}/METADATA +4 -4
  59. lamindb-0.76.9.dist-info/RECORD +60 -0
  60. {lamindb-0.76.8.dist-info → lamindb-0.76.9.dist-info}/WHEEL +1 -1
  61. lamindb-0.76.8.dist-info/RECORD +0 -60
lamindb/core/storage/_anndata_accessor.py
@@ -1,740 +1,740 @@
from __future__ import annotations

import inspect
from functools import cached_property
from itertools import chain
from typing import TYPE_CHECKING, Callable, Literal, Mapping, Union

import h5py
import numpy as np
import pandas as pd
from anndata import AnnData
from anndata import __version__ as anndata_version
from anndata._core.index import _normalize_indices
from anndata._core.views import _resolve_idx
from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5
from anndata._io.specs.registry import get_spec, read_elem, read_elem_partial
from anndata.compat import _read_attr
from fsspec.implementations.local import LocalFileSystem
from lamin_utils import logger
from lamindb_setup.core.upath import UPath, create_mapper, infer_filesystem
from packaging import version

if TYPE_CHECKING:
    from pathlib import Path

    from fsspec.core import OpenFile
    from lamindb_setup.core.types import UPathStr


anndata_version_parse = version.parse(anndata_version)

if anndata_version_parse < version.parse("0.9.0"):
    from anndata._core.index import Index
else:
    from anndata.compat import Index

if anndata_version_parse < version.parse("0.10.0"):
    if anndata_version_parse < version.parse("0.9.1"):
        logger.warning(
            "Full backed capabilities are not available for this version of anndata,"
            " please install anndata>=0.9.1."
        )

    from anndata._core.sparse_dataset import SparseDataset

    # try csr for groups with no encoding_type
    class CSRDataset(SparseDataset):
        @property
        def format_str(self) -> str:
            return "csr"

    def sparse_dataset(group):
        return SparseDataset(group)

else:
    from anndata._core.sparse_dataset import (
        BaseCompressedSparseDataset as SparseDataset,
    )
    from anndata._core.sparse_dataset import (  # type: ignore
        CSRDataset,
        sparse_dataset,
    )

    def _check_group_format(*args):
        pass

    CSRDataset._check_group_format = _check_group_format


# zarr and CSRDataset have problems with full selection
def _subset_sparse(sparse_ds: CSRDataset | SparseDataset, indices):
    has_arrays = isinstance(indices[0], np.ndarray) or isinstance(
        indices[1], np.ndarray
    )
    if not has_arrays and indices == (slice(None), slice(None)):
        return sparse_ds.to_memory()
    else:
        return sparse_ds[indices]


def get_module_name(obj):
    return inspect.getmodule(obj).__name__.partition(".")[0]


def _records_to_df(obj):
    if isinstance(obj, pd.DataFrame):
        return obj

    if hasattr(obj, "dtype") and obj.dtype.names is not None:
        formats = []
        for name, (dt, _) in obj.dtype.fields.items():
            if dt.char == "S":
                new_dt = str(dt).replace("S", "U")
            else:
                new_dt = dt
            formats.append((name, new_dt))
        df = pd.DataFrame(obj.astype(formats, copy=False))
        for index_name in ("index", "_index"):
            if index_name in df.columns:
                return df.set_index(index_name)
        return df
    else:
        return obj


class AccessRegistry:
    def __init__(self):
        self._registry = {}
        self._openers = {}

    def register_open(self, module: str):
        def wrapper(func: Callable):
            self._openers[module] = func
            return func

        return wrapper

    def open(self, module: str, *args, **kwargs):
        if module in self._openers:
            return self._openers[module](*args, **kwargs)
        else:
            raise ValueError(f"Module {module} not found, please install it.")

    def register(self, module: str):
        def wrapper(func: Callable):
            func_name = func.__name__
            if func_name not in self._registry:
                self._registry[func_name] = {}
            self._registry[func_name][module] = func
            return func

        return wrapper

    def __getattr__(self, func_name: str):
        def wrapper(*args, **kwargs):
            func_registry = self._registry[func_name]
            for arg in chain(args, kwargs.values()):
                arg_module = get_module_name(arg)
                if arg_module in func_registry:
                    return func_registry[arg_module](*args, **kwargs)
            raise ValueError(f"{func_name} is not registered for this module.")

        return wrapper


# storage specific functions should be registered and called through the registry
registry = AccessRegistry()
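
# --- Editor's sketch (not part of the released file): how the registry
# dispatches. `register` stores implementations per top-level module name;
# attribute access returns a wrapper that picks the implementation whose
# module matches one of the call arguments. The names `demo` and `describe`
# are hypothetical, for illustration only.
def _registry_dispatch_demo():
    demo = AccessRegistry()

    @demo.register("numpy")
    def describe(arr):
        # chosen because the argument's top-level module is "numpy"
        return f"numpy array with shape {arr.shape}"

    # get_module_name(np.zeros(3)) == "numpy", so __getattr__ dispatches
    # to the function registered above
    return demo.describe(np.zeros(3))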


@registry.register_open("h5py")
def open(filepath: UPathStr, mode: str = "r"):
    fs, file_path_str = infer_filesystem(filepath)
    if isinstance(fs, LocalFileSystem):
        assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!"  # noqa: S101
        return None, h5py.File(file_path_str, mode=mode)
    if mode == "r":
        conn_mode = "rb"
    elif mode == "w":
        conn_mode = "wb"
    elif mode == "a":
        conn_mode = "ab"
    else:
        raise ValueError(f"Unknown mode {mode}! Should be 'r', 'w' or 'a'.")
    conn = fs.open(file_path_str, mode=conn_mode)
    try:
        storage = h5py.File(conn, mode=mode)
    except Exception as e:
        conn.close()
        raise e
    return conn, storage


@registry.register("h5py")
def read_dataframe(elem: h5py.Dataset | h5py.Group):
    if isinstance(elem, h5py.Dataset):
        return read_dataframe_legacy_h5(elem)
    else:
        return read_elem(elem)


@registry.register("h5py")
def safer_read_partial(elem, indices):
    is_dataset = isinstance(elem, h5py.Dataset)
    indices_inverse: list | None = None
    encoding_type = get_spec(elem).encoding_type
    # h5py selection for datasets requires sorted indices
    if is_dataset or encoding_type == "dataframe":
        indices_increasing = []
        indices_inverse = []
        for indices_dim in indices:
            # should be integer or bool
            # ignore bool or increasing unique integers
            if (
                isinstance(indices_dim, np.ndarray)
                and indices_dim.dtype != "bool"
                and not np.all(np.diff(indices_dim) > 0)
            ):
                idx_unique, idx_inverse = np.unique(indices_dim, return_inverse=True)
                indices_increasing.append(idx_unique)
                indices_inverse.append(idx_inverse)
            else:
                indices_increasing.append(indices_dim)
                indices_inverse.append(None)
        indices = tuple(indices_increasing)
        if all(idx is None for idx in indices_inverse):
            indices_inverse = None
    result = None
    if encoding_type == "":
        if is_dataset:
            dims = len(elem.shape)
            if dims == 2:
                result = elem[indices]
            elif dims == 1:
                if indices[0] == slice(None):
                    result = elem[indices[1]]
                elif indices[1] == slice(None):
                    result = elem[indices[0]]
        elif isinstance(elem, h5py.Group):
            try:
                ds = CSRDataset(elem)
                result = _subset_sparse(ds, indices)
            except Exception as e:
                logger.debug(
                    f"Encountered an exception while attempting to subset a sparse dataset by indices.\n{e}"
                )
        if result is None:
            raise ValueError(
                "Can not get a subset of the element of type"
                f" {type(elem).__name__} with an empty spec."
            )
    else:
        result = read_elem_partial(elem, indices=indices)
    if indices_inverse is None:
        return result
    else:
        if indices_inverse[0] is None:
            if len(result.shape) == 2:
                return result[:, indices_inverse[1]]
            else:
                return result[indices_inverse[1]]
        elif indices_inverse[1] is None:
            if isinstance(result, pd.DataFrame):
                return result.iloc[indices_inverse[0]]
            else:
                return result[indices_inverse[0]]
        else:
            return result[tuple(indices_inverse)]


@registry.register("h5py")
def keys(storage: h5py.File):
    attrs_keys: dict[str, list] = {}
    for attr in storage.keys():
        if attr == "X":
            continue
        attr_obj = storage[attr]
        if attr in ("obs", "var") and isinstance(attr_obj, h5py.Dataset):
            keys = list(attr_obj.dtype.fields.keys())
        else:
            keys = list(attr_obj.keys())
        if len(keys) > 0:
            attrs_keys[attr] = keys
    return attrs_keys
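
# --- Editor's sketch (not part of the released file): the sorted-index
# trick used by `safer_read_partial` above. h5py fancy selection requires
# increasing indices, so unsorted indices are first sorted with np.unique
# and the requested order is then restored via the inverse permutation.
# The values below are hypothetical.
def _sorted_selection_demo():
    data = np.arange(10) * 10           # stand-in for an h5py dataset
    requested = np.array([7, 2, 2, 5])  # unsorted, with a duplicate
    idx_unique, idx_inverse = np.unique(requested, return_inverse=True)
    # read with increasing indices, then restore the requested order
    result = data[idx_unique][idx_inverse]
    assert (result == data[requested]).all()
    return result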


ArrayTypes = [h5py.Dataset]
GroupTypes = [h5py.Group]
StorageTypes = [h5py.File]


ZARR_INSTALLED = False
try:
    import zarr

    ZARR_INSTALLED = True
except ImportError:
    pass

if ZARR_INSTALLED:
    from anndata._io.zarr import read_dataframe_legacy as read_dataframe_legacy_zarr

    ArrayTypes.append(zarr.Array)
    GroupTypes.append(zarr.Group)
    StorageTypes.append(zarr.Group)

    @registry.register_open("zarr")
    def open(filepath: UPathStr, mode: Literal["r", "r+", "a", "w", "w-"] = "r"):
        assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!"  # noqa: S101

        fs, file_path_str = infer_filesystem(filepath)
        conn = None
        if isinstance(fs, LocalFileSystem):
            # this is faster than through an fsspec mapper for local
            open_obj = file_path_str
        else:
            open_obj = create_mapper(fs, file_path_str, check=True)
        storage = zarr.open(open_obj, mode=mode)
        return conn, storage

    @registry.register("zarr")
    def read_dataframe(elem: Union[zarr.Array, zarr.Group]):  # noqa
        if isinstance(elem, zarr.Array):
            return read_dataframe_legacy_zarr(elem)
        else:
            return read_elem(elem)

    @registry.register("zarr")
    def safer_read_partial(elem, indices):
        encoding_type = get_spec(elem).encoding_type
        if encoding_type == "":
            if isinstance(elem, zarr.Array):
                dims = len(elem.shape)
                if dims == 2:
                    return elem.oindex[indices]
                elif dims == 1:
                    if indices[0] == slice(None):
                        return elem.oindex[indices[1]]
                    elif indices[1] == slice(None):
                        return elem.oindex[indices[0]]
            elif isinstance(elem, zarr.Group):
                try:
                    ds = CSRDataset(elem)
                    return _subset_sparse(ds, indices)
                except Exception as e:
                    logger.debug(
                        f"Encountered an exception while attempting to subset a sparse dataset by indices.\n{e}"
                    )
            raise ValueError(
                "Can not get a subset of the element of type"
                f" {type(elem).__name__} with an empty spec."
            )
        else:
            if encoding_type in ("csr_matrix", "csc_matrix"):
                ds = sparse_dataset(elem)
                return _subset_sparse(ds, indices)
            else:
                return read_elem_partial(elem, indices=indices)

    # this is needed because accessing zarr.Group.keys() directly is very slow
    @registry.register("zarr")
    def keys(storage: zarr.Group):
        paths = storage._store.keys()

        attrs_keys: dict[str, list] = {}
        obs_var_arrays = []

        for path in paths:
            if path in (".zattrs", ".zgroup"):
                continue
            parts = path.split("/")
            if len(parts) < 2:
                continue
            attr = parts[0]
            key = parts[1]

            if attr == "X":
                continue

            if attr in ("obs", "var"):
                if attr in obs_var_arrays:
                    continue
                if key == ".zarray":
                    attrs_keys.pop(attr, None)
                    obs_var_arrays.append(attr)

            if attr not in attrs_keys:
                attrs_keys[attr] = []

            if key in (".zattrs", ".zgroup", ".zarray"):
                continue
            attr_keys = attrs_keys[attr]
            if key not in attr_keys:
                attr_keys.append(key)

        for attr in obs_var_arrays:
            attrs_keys[attr] = list(storage[attr].dtype.fields.keys())

        return {attr: keys for attr, keys in attrs_keys.items() if len(keys) > 0}
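
# --- Editor's sketch (not part of the released file): what the zarr `keys`
# helper above computes. Raw store paths (hypothetical below) are split on
# "/" and grouped by their first component, skipping "X" and zarr
# bookkeeping entries (".zattrs", ".zgroup", ".zarray").
def _zarr_keys_demo():
    paths = ["obs/_index/.zarray", "obs/cell_type/.zarray", "X/.zarray"]
    attrs_keys: dict[str, list] = {}
    for path in paths:
        attr, key = path.split("/")[:2]
        if attr == "X" or key.startswith(".z"):
            continue
        attrs_keys.setdefault(attr, []).append(key)
    return attrs_keys  # {"obs": ["_index", "cell_type"]}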


ArrayTypes = tuple(ArrayTypes)  # type: ignore
GroupTypes = tuple(GroupTypes)  # type: ignore
StorageTypes = tuple(StorageTypes)  # type: ignore


ArrayType = Union[ArrayTypes]  # type: ignore
GroupType = Union[GroupTypes]  # type: ignore
StorageType = Union[StorageTypes]  # type: ignore


def _to_memory(elem):
    if isinstance(elem, ArrayTypes):
        return elem[()]
    elif isinstance(elem, SparseDataset):
        return elem.to_memory()
    else:
        return elem


def _try_backed_full(elem):
    # think what to do for compatibility with old var and obs
    if isinstance(elem, ArrayTypes):
        return elem

    if isinstance(elem, GroupTypes):
        encoding_type = get_spec(elem).encoding_type
        if encoding_type in ("csr_matrix", "csc_matrix"):
            return sparse_dataset(elem)
        if "h5sparse_format" in elem.attrs:
            return sparse_dataset(elem)
        if encoding_type == "" and "indptr" in elem:
            return CSRDataset(elem)

    return read_elem(elem)


def _safer_read_index(elem):
    if isinstance(elem, GroupTypes):
        return pd.Index(read_elem(elem[_read_attr(elem.attrs, "_index")]))
    elif isinstance(elem, ArrayTypes):
        indices = None
        for index_name in ("index", "_index"):
            if index_name in elem.dtype.names:
                indices = elem[index_name]
                break
        if indices is not None and len(indices) > 0:
            if isinstance(indices[0], bytes):
                indices = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)(indices)
            return pd.Index(indices)
        else:
            raise ValueError("Indices not found.")
    else:
        raise ValueError(f"Unknown elem type {type(elem)} when reading indices.")


class _MapAccessor:
    def __init__(self, elem, name, indices=None):
        self.elem = elem
        self.indices = indices
        self.name = name

    def __getitem__(self, key):
        if self.indices is None:
            return _try_backed_full(self.elem[key])
        else:
            return registry.safer_read_partial(self.elem[key], indices=self.indices)

    def keys(self):
        return list(self.elem.keys())

    def __repr__(self):
        """Description of the _MapAccessor object."""
        descr = f"Accessor for the AnnData attribute {self.name}"
        descr += f"\n with keys: {self.keys()}"
        return descr


class _AnnDataAttrsMixin:
    storage: StorageType
    _attrs_keys: Mapping[str, list]

    @cached_property
    def obs(self) -> pd.DataFrame:
        if "obs" not in self._attrs_keys:
            return None
        indices = getattr(self, "indices", None)
        if indices is not None:
            indices = (indices[0], slice(None))
            obj = registry.safer_read_partial(self.storage["obs"], indices=indices)  # type: ignore
            return _records_to_df(obj)
        else:
            return registry.read_dataframe(self.storage["obs"])  # type: ignore

    @cached_property
    def var(self) -> pd.DataFrame:
        if "var" not in self._attrs_keys:
            return None
        indices = getattr(self, "indices", None)
        if indices is not None:
            indices = (indices[1], slice(None))
            obj = registry.safer_read_partial(self.storage["var"], indices=indices)  # type: ignore
            return _records_to_df(obj)
        else:
            return registry.read_dataframe(self.storage["var"])  # type: ignore

    @cached_property
    def uns(self):
        if "uns" not in self._attrs_keys:
            return None
        return read_elem(self.storage["uns"])

    @cached_property
    def X(self):
        indices = getattr(self, "indices", None)
        if indices is not None:
            return registry.safer_read_partial(self.storage["X"], indices=indices)
        else:
            return _try_backed_full(self.storage["X"])

    @cached_property
    def obsm(self):
        if "obsm" not in self._attrs_keys:
            return None
        indices = getattr(self, "indices", None)
        if indices is not None:
            indices = (indices[0], slice(None))
        return _MapAccessor(self.storage["obsm"], "obsm", indices)

    @cached_property
    def varm(self):
        if "varm" not in self._attrs_keys:
            return None
        indices = getattr(self, "indices", None)
        if indices is not None:
            indices = (indices[1], slice(None))
        return _MapAccessor(self.storage["varm"], "varm", indices)

    @cached_property
    def obsp(self):
        if "obsp" not in self._attrs_keys:
            return None
        indices = getattr(self, "indices", None)
        if indices is not None:
            indices = (indices[0], indices[0])
        return _MapAccessor(self.storage["obsp"], "obsp", indices)

    @cached_property
    def varp(self):
        if "varp" not in self._attrs_keys:
            return None
        indices = getattr(self, "indices", None)
        if indices is not None:
            indices = (indices[1], indices[1])
        return _MapAccessor(self.storage["varp"], "varp", indices)

    @cached_property
    def layers(self):
        if "layers" not in self._attrs_keys:
            return None
        indices = getattr(self, "indices", None)
        return _MapAccessor(self.storage["layers"], "layers", indices)

    @property
    def obs_names(self):
        return self._obs_names

    @property
    def var_names(self):
        return self._var_names

    @cached_property
    def shape(self):
        return len(self._obs_names), len(self._var_names)

    def to_dict(self):
        prepare_adata = {}

        prepare_adata["X"] = _to_memory(self.X)

        if "uns" in self._attrs_keys:
            prepare_adata["uns"] = self.uns

        for attr in ("obs", "var"):
            if attr in self._attrs_keys:
                prepare_adata[attr] = getattr(self, attr)

        for attr in ("obsm", "varm", "obsp", "varp", "layers"):
            if attr in self._attrs_keys:
                prepare_adata[attr] = {}
                get_attr = getattr(self, attr)
                for key in self._attrs_keys[attr]:
                    prepare_adata[attr][key] = _to_memory(get_attr[key])

        if "raw" in self._attrs_keys:
            prepare_adata["raw"] = self.raw.to_dict()

        return prepare_adata

    def to_memory(self):
        adata = AnnData(**self.to_dict())
        return adata


class AnnDataAccessorSubset(_AnnDataAttrsMixin):
    def __init__(self, storage, indices, attrs_keys, obs_names, var_names, ref_shape):
        self.storage = storage
        self.indices = indices

        self._attrs_keys = attrs_keys
        self._obs_names, self._var_names = obs_names, var_names

        self._ref_shape = ref_shape

    def __getitem__(self, index: Index):
        """Access a subset of the underlying AnnData object."""
        oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)
        new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]
        if self.indices is not None:
            oidx = _resolve_idx(self.indices[0], oidx, self._ref_shape[0])
            vidx = _resolve_idx(self.indices[1], vidx, self._ref_shape[1])
        return type(self)(
            self.storage,
            (oidx, vidx),
            self._attrs_keys,
            new_obs_names,
            new_var_names,
            self._ref_shape,
        )

    def __repr__(self):
        """Description of the object."""
        n_obs, n_vars = self.shape
        descr = f"{type(self).__name__} object with n_obs × n_vars = {n_obs} × {n_vars}"
        for attr, keys in self._attrs_keys.items():
            descr += f"\n {attr}: {keys}"
        return descr

    @cached_property
    def raw(self):
        if "raw" not in self._attrs_keys:
            return None
        prepare_indices = None
        if self.indices is not None:
            oidx = self.indices[0]
            if isinstance(oidx, np.ndarray) or oidx != slice(None):
                prepare_indices = oidx, slice(None)
        return AnnDataRawAccessor(
            self.storage["raw"],
            prepare_indices,
            None,
            self._obs_names,
            None,
            self._ref_shape[0],
        )


class AnnDataRawAccessor(AnnDataAccessorSubset):
    def __init__(
        self, storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
    ):
        var_raw = storage_raw["var"]

        if var_names is None:
            var_names = _safer_read_index(var_raw)

        if isinstance(ref_shape, int):
            ref_shape = ref_shape, len(var_names)
        elif isinstance(ref_shape, tuple) and len(ref_shape) < 2:
            ref_shape = ref_shape[0], len(var_names)

        if attrs_keys is None:
            attrs_keys = {}
            if isinstance(var_raw, ArrayTypes):
                attrs_keys["var"] = list(var_raw.dtype.fields.keys())
            else:
                # for some reason list(var_raw.keys()) is very slow for zarr
                # maybe also directly get keys from the underlying mapper
                attrs_keys["var"] = list(var_raw)
            if "varm" in storage_raw:
                varm_keys_raw = list(storage_raw["varm"])
                if len(varm_keys_raw) > 0:
                    attrs_keys["varm"] = varm_keys_raw

        super().__init__(
            storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
        )

    @property
    def raw(self):
        raise AttributeError


class AnnDataAccessor(_AnnDataAttrsMixin):
    """Cloud-backed AnnData."""

    def __init__(
        self,
        connection: OpenFile | None,
        storage: StorageType,
        filename: str,
    ):
        self._conn = connection
        self.storage = storage

        self._attrs_keys = registry.keys(self.storage)

        self._name = filename

        self._obs_names = _safer_read_index(self.storage["obs"])  # type: ignore
        self._var_names = _safer_read_index(self.storage["var"])  # type: ignore

        self._closed = False

    def close(self):
        """Closes the connection."""
        if hasattr(self, "storage") and hasattr(self.storage, "close"):
            self.storage.close()
        if hasattr(self, "_conn") and hasattr(self._conn, "close"):
            self._conn.close()
        self._closed = True

    @property
    def closed(self):
        return self._closed

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __getitem__(self, index: Index) -> AnnDataAccessorSubset:
        """Access a subset of the underlying AnnData object."""
        oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)
        new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]
        return AnnDataAccessorSubset(
            self.storage,
            (oidx, vidx),
            self._attrs_keys,
            new_obs_names,
            new_var_names,
            self.shape,
        )

    def __repr__(self):
        """Description of the AnnDataAccessor object."""
        n_obs, n_vars = self.shape
        descr = f"AnnDataAccessor object with n_obs × n_vars = {n_obs} × {n_vars}"
        descr += f"\n constructed for the AnnData object {self._name}"
        for attr, keys in self._attrs_keys.items():
            descr += f"\n {attr}: {keys}"
        return descr

    @cached_property
    def raw(self):
        if "raw" not in self._attrs_keys:
            return None
        return AnnDataRawAccessor(
            self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
        )
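
A minimal usage sketch of the accessor defined above (editor's addition, not part of the diff; the file path is hypothetical and assumes a local .h5ad file):

    # open via the registry, which returns a (connection, storage) pair
    conn, storage = registry.open("h5py", "example.h5ad", mode="r")
    with AnnDataAccessor(conn, storage, "example.h5ad") as adata:
        print(adata)                 # summary of attributes and their keys
        sub = adata[:100]            # lazy subset; nothing is fully loaded yet
        obs_df = sub.obs             # reads only the first 100 rows of obs
        in_memory = sub.to_memory()  # materializes a regular AnnData object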