lamindb 0.74.3__py3-none-any.whl → 0.75.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,731 +1,92 @@
 from __future__ import annotations
 
-import inspect
 from dataclasses import dataclass
-from functools import cached_property
-from itertools import chain
-from typing import TYPE_CHECKING, Callable, Mapping, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal
 
-import h5py
-import numpy as np
-import pandas as pd
-from anndata import AnnData
-from anndata import __version__ as anndata_version
-from anndata._core.index import Index, _normalize_indices
-from anndata._core.views import _resolve_idx
-from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5
-from anndata._io.specs.registry import get_spec, read_elem, read_elem_partial
-from anndata.compat import _read_attr
-from fsspec.implementations.local import LocalFileSystem
-from lamin_utils import logger
-from lamindb_setup.core.upath import UPath, create_mapper, infer_filesystem
+from anndata._io.specs.registry import get_spec
 
 from lnschema_core import Artifact
-from packaging import version
 
-from lamindb.core.storage.paths import filepath_from_artifact
+from ._anndata_accessor import AnnDataAccessor, StorageType, registry
+from .paths import filepath_from_artifact
 
 if TYPE_CHECKING:
-    from pathlib import Path
-
     from fsspec.core import OpenFile
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
+    from upath import UPath
 
-anndata_version_parse = version.parse(anndata_version)
-
-if anndata_version_parse < version.parse("0.10.0"):
-    if anndata_version_parse < version.parse("0.9.1"):
-        logger.warning(
-            "Full backed capabilities are not available for this version of anndata,"
-            " please install anndata>=0.9.1."
-        )
-
-    from anndata._core.sparse_dataset import SparseDataset
-
-    # try csr for groups with no encoding_type
-    class CSRDataset(SparseDataset):
-        @property
-        def format_str(self) -> str:
-            return "csr"
-
-    def sparse_dataset(group):
-        return SparseDataset(group)
-
-else:
-    from anndata._core.sparse_dataset import (
-        BaseCompressedSparseDataset as SparseDataset,
-    )
-    from anndata._core.sparse_dataset import (  # type: ignore
-        CSRDataset,
-        sparse_dataset,
-    )
-
-    def _check_group_format(*args):
-        pass
 
-    CSRDataset._check_group_format = _check_group_format
+# this dynamically creates a subclass of a context manager class
+# and reassigns it to an instance of the superclass
+# so that the instance calls finalize on close or exit
+def _track_writes_factory(obj: Any, finalize: Callable):
+    closed: bool = False
 
+    tracked_class = obj.__class__
+    type_dict = {"__doc__": tracked_class.__doc__}
+    if hasattr(tracked_class, "__slots__"):
+        type_dict["__slots__"] = ()
+    if hasattr(tracked_class, "__exit__"):
 
-# zarr and CSRDataset have problems with full selection
-def _subset_sparse(sparse_ds: CSRDataset | SparseDataset, indices):
-    has_arrays = isinstance(indices[0], np.ndarray) or isinstance(
-        indices[1], np.ndarray
-    )
-    if not has_arrays and indices == (slice(None), slice(None)):
-        return sparse_ds.to_memory()
-    else:
-        return sparse_ds[indices]
-
-
-def get_module_name(obj):
-    return inspect.getmodule(obj).__name__.partition(".")[0]
-
-
-def _records_to_df(obj):
-    if isinstance(obj, pd.DataFrame):
-        return obj
-
-    if hasattr(obj, "dtype") and obj.dtype.names is not None:
-        formats = []
-        for name, (dt, _) in obj.dtype.fields.items():
-            if dt.char == "S":
-                new_dt = str(dt).replace("S", "U")
-            else:
-                new_dt = dt
-            formats.append((name, new_dt))
-        df = pd.DataFrame(obj.astype(formats, copy=False))
-        for index_name in ("index", "_index"):
-            if index_name in df.columns:
-                return df.set_index(index_name)
-        return df
-    else:
-        return obj
-
+        def __exit__(self, exc_type, exc_val, exc_tb):
+            nonlocal closed
+            tracked_class.__exit__(self, exc_type, exc_val, exc_tb)
+            if not closed:
+                finalize()
+                closed = True
 
-class AccessRecord:
-    def __init__(self):
-        self._registry = {}
-        self._openers = {}
+        type_dict["__exit__"] = __exit__
+    if hasattr(tracked_class, "close"):
 
-    def register_open(self, module: str):
-        def wrapper(func: Callable):
-            self._openers[module] = func
-            return func
+        def close(self, *args, **kwargs):
+            nonlocal closed
+            tracked_class.close(self, *args, **kwargs)
+            if not closed:
+                finalize()
+                closed = True
 
-        return wrapper
+        type_dict["close"] = close
 
-    def open(self, module: str, *args, **kwargs):
-        if module in self._openers:
-            return self._openers[module](*args, **kwargs)
-        else:
-            raise ValueError(f"Module {module} not found, please install it.")
+    Track = type(tracked_class.__name__ + "Track", (tracked_class,), type_dict)
+    obj.__class__ = Track
+    return obj
 
-    def register(self, module: str):
-        def wrapper(func: Callable):
-            func_name = func.__name__
-            if func_name not in self._registry:
-                self._registry[func_name] = {}
-            self._registry[func_name][module] = func
-            return func
 
-        return wrapper
-
-    def __getattr__(self, func_name: str):
-        def wrapper(*args, **kwargs):
-            func_registry = self._registry[func_name]
-            for arg in chain(args, kwargs.values()):
-                arg_module = get_module_name(arg)
-                if arg_module in func_registry:
-                    return func_registry[arg_module](*args, **kwargs)
-            raise ValueError(f"{func_name} is not registered for this module.")
-
-        return wrapper
-
-
-# storage specific functions should be registered and called through the registry
-registry = AccessRecord()
-
-
-@registry.register_open("h5py")
-def open(filepath: UPath | Path | str):
-    fs, file_path_str = infer_filesystem(filepath)
-    if isinstance(fs, LocalFileSystem):
-        return None, h5py.File(file_path_str, mode="r")
-    conn = fs.open(file_path_str, mode="rb")
+def _open_tiledbsoma(
+    filepath: UPath, mode: Literal["r", "w"] = "r"
+) -> SOMACollection | SOMAExperiment:
     try:
-        storage = h5py.File(conn, mode="r")
-    except Exception as e:
-        conn.close()
-        raise e
-    return conn, storage
-
-
-@registry.register("h5py")
-def read_dataframe(elem: h5py.Dataset | h5py.Group):
-    if isinstance(elem, h5py.Dataset):
-        return read_dataframe_legacy_h5(elem)
-    else:
-        return read_elem(elem)
-
-
-@registry.register("h5py")
-def safer_read_partial(elem, indices):
-    is_dataset = isinstance(elem, h5py.Dataset)
-    indices_inverse: list | None = None
-    encoding_type = get_spec(elem).encoding_type
-    # h5py selection for datasets requires sorted indices
-    if is_dataset or encoding_type == "dataframe":
-        indices_increasing = []
-        indices_inverse = []
-        for indices_dim in indices:
-            # should be integer or bool
-            # ignore bool or increasing unique integers
-            if (
-                isinstance(indices_dim, np.ndarray)
-                and indices_dim.dtype != "bool"
-                and not np.all(np.diff(indices_dim) > 0)
-            ):
-                idx_unique, idx_inverse = np.unique(indices_dim, return_inverse=True)
-                indices_increasing.append(idx_unique)
-                indices_inverse.append(idx_inverse)
-            else:
-                indices_increasing.append(indices_dim)
-                indices_inverse.append(None)
-        indices = tuple(indices_increasing)
-        if all(idx is None for idx in indices_inverse):
-            indices_inverse = None
-    result = None
-    if encoding_type == "":
-        if is_dataset:
-            dims = len(elem.shape)
-            if dims == 2:
-                result = elem[indices]
-            elif dims == 1:
-                if indices[0] == slice(None):
-                    result = elem[indices[1]]
-                elif indices[1] == slice(None):
-                    result = elem[indices[0]]
-        elif isinstance(elem, h5py.Group):
-            try:
-                ds = CSRDataset(elem)
-                result = _subset_sparse(ds, indices)
-            except Exception as e:
-                logger.debug(
-                    f"Encountered an exception while attempting to subset a sparse dataset by indices.\n{e}"
-                )
-        if result is None:
-            raise ValueError(
-                "Can not get a subset of the element of type"
-                f" {type(elem).__name__} with an empty spec."
-            )
-    else:
-        result = read_elem_partial(elem, indices=indices)
-    if indices_inverse is None:
-        return result
-    else:
-        if indices_inverse[0] is None:
-            if len(result.shape) == 2:
-                return result[:, indices_inverse[1]]
-            else:
-                return result[indices_inverse[1]]
-        elif indices_inverse[1] is None:
-            if isinstance(result, pd.DataFrame):
-                return result.iloc[indices_inverse[0]]
-            else:
-                return result[indices_inverse[0]]
-        else:
-            return result[tuple(indices_inverse)]
-
-
-@registry.register("h5py")
-def keys(storage: h5py.File):
-    attrs_keys: dict[str, list] = {}
-    for attr in storage.keys():
-        if attr == "X":
-            continue
-        attr_obj = storage[attr]
-        if attr in ("obs", "var") and isinstance(attr_obj, h5py.Dataset):
-            keys = list(attr_obj.dtype.fields.keys())
-        else:
-            keys = list(attr_obj.keys())
-        if len(keys) > 0:
-            attrs_keys[attr] = keys
-    return attrs_keys
-
-
-ArrayTypes = [h5py.Dataset]
-GroupTypes = [h5py.Group]
-StorageTypes = [h5py.File]
-
-
-ZARR_INSTALLED = False
-try:
-    import zarr
-
-    ZARR_INSTALLED = True
-except ImportError:
-    pass
-
-if ZARR_INSTALLED:
-    from anndata._io.zarr import read_dataframe_legacy as read_dataframe_legacy_zarr
-
-    ArrayTypes.append(zarr.Array)
-    GroupTypes.append(zarr.Group)
-    StorageTypes.append(zarr.Group)
-
-    @registry.register_open("zarr")
-    def open(filepath: Union[UPath, Path, str]):  # noqa
-        fs, file_path_str = infer_filesystem(filepath)
-        conn = None
-        if isinstance(fs, LocalFileSystem):
-            # this is faster than through an fsspec mapper for local
-            open_obj = file_path_str
-        else:
-            open_obj = create_mapper(fs, file_path_str, check=True)
-        storage = zarr.open(open_obj, mode="r")
-        return conn, storage
-
-    @registry.register("zarr")
-    def read_dataframe(elem: Union[zarr.Array, zarr.Group]):  # noqa
-        if isinstance(elem, zarr.Array):
-            return read_dataframe_legacy_zarr(elem)
-        else:
-            return read_elem(elem)
-
-    @registry.register("zarr")
-    def safer_read_partial(elem, indices):  # noqa
-        encoding_type = get_spec(elem).encoding_type
-        if encoding_type == "":
-            if isinstance(elem, zarr.Array):
-                dims = len(elem.shape)
-                if dims == 2:
-                    return elem.oindex[indices]
-                elif dims == 1:
-                    if indices[0] == slice(None):
-                        return elem.oindex[indices[1]]
-                    elif indices[1] == slice(None):
-                        return elem.oindex[indices[0]]
-            elif isinstance(elem, zarr.Group):
-                try:
-                    ds = CSRDataset(elem)
-                    return _subset_sparse(ds, indices)
-                except Exception as e:
-                    logger.debug(
-                        f"Encountered an exception while attempting to subset a sparse dataset by indices.\n{e}"
-                    )
-            raise ValueError(
-                "Can not get a subset of the element of type"
-                f" {type(elem).__name__} with an empty spec."
-            )
-        else:
-            if encoding_type in ("csr_matrix", "csc_matrix"):
-                ds = sparse_dataset(elem)
-                return _subset_sparse(ds, indices)
-            else:
-                return read_elem_partial(elem, indices=indices)
-
-    # this is needed because accessing zarr.Group.keys() directly is very slow
-    @registry.register("zarr")
-    def keys(storage: zarr.Group):  # noqa
-        paths = storage._store.keys()
-
-        attrs_keys: dict[str, list] = {}
-        obs_var_arrays = []
-
-        for path in paths:
-            if path in (".zattrs", ".zgroup"):
-                continue
-            parts = path.split("/")
-            if len(parts) < 2:
-                continue
-            attr = parts[0]
-            key = parts[1]
-
-            if attr == "X":
-                continue
-
-            if attr in ("obs", "var"):
-                if attr in obs_var_arrays:
-                    continue
-                if key == ".zarray":
-                    attrs_keys.pop(attr, None)
-                    obs_var_arrays.append(attr)
-
-            if attr not in attrs_keys:
-                attrs_keys[attr] = []
-
-            if key in (".zattrs", ".zgroup", ".zarray"):
-                continue
-            attr_keys = attrs_keys[attr]
-            if key not in attr_keys:
-                attr_keys.append(key)
-
-        for attr in obs_var_arrays:
-            attrs_keys[attr] = list(storage[attr].dtype.fields.keys())
-
-        return {attr: keys for attr, keys in attrs_keys.items() if len(keys) > 0}
-
-
-ArrayTypes = tuple(ArrayTypes)  # type: ignore
-GroupTypes = tuple(GroupTypes)  # type: ignore
-StorageTypes = tuple(StorageTypes)  # type: ignore
-
-
-ArrayType = Union[ArrayTypes]  # type: ignore
-GroupType = Union[GroupTypes]  # type: ignore
-StorageType = Union[StorageTypes]  # type: ignore
-
-
-def _to_memory(elem):
-    if isinstance(elem, ArrayTypes):
-        return elem[()]
-    elif isinstance(elem, SparseDataset):
-        return elem.to_memory()
+        import tiledbsoma as soma
+    except ImportError as e:
+        raise ImportError("Please install tiledbsoma: pip install tiledbsoma") from e
+    filepath_str = filepath.as_posix()
+    if filepath.protocol == "s3":
+        from lamindb_setup.core._settings_storage import get_storage_region
+
+        region = get_storage_region(filepath_str)
+        tiledb_config = {"vfs.s3.region": region}
+        storage_options = filepath.storage_options
+        if "key" in storage_options:
+            tiledb_config["vfs.s3.aws_access_key_id"] = storage_options["key"]
+        if "secret" in storage_options:
+            tiledb_config["vfs.s3.aws_secret_access_key"] = storage_options["secret"]
+        if "token" in storage_options:
+            tiledb_config["vfs.s3.aws_session_token"] = storage_options["token"]
+        ctx = soma.SOMATileDBContext(tiledb_config=tiledb_config)
+        # this is a strange bug
+        # for some reason iterdir futher gives incorrect results
+        # if cache is not invalidated
+        # instead of obs and ms it gives ms and ms in the list of names
+        filepath.fs.invalidate_cache()
     else:
-        return elem
-
-
-def _try_backed_full(elem):
-    # think what to do for compatibility with old var and obs
-    if isinstance(elem, ArrayTypes):
-        return elem
-
-    if isinstance(elem, GroupTypes):
-        encoding_type = get_spec(elem).encoding_type
-        if encoding_type in ("csr_matrix", "csc_matrix"):
-            return sparse_dataset(elem)
-        if "h5sparse_format" in elem.attrs:
-            return sparse_dataset(elem)
-        if encoding_type == "" and "indptr" in elem:
-            return CSRDataset(elem)
+        ctx = None
 
-    return read_elem(elem)
-
-
-def _safer_read_index(elem):
-    if isinstance(elem, GroupTypes):
-        return pd.Index(read_elem(elem[_read_attr(elem.attrs, "_index")]))
-    elif isinstance(elem, ArrayTypes):
-        indices = None
-        for index_name in ("index", "_index"):
-            if index_name in elem.dtype.names:
-                indices = elem[index_name]
-                break
-        if indices is not None and len(indices) > 0:
-            if isinstance(indices[0], bytes):
-                indices = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)(indices)
-            return pd.Index(indices)
-        else:
-            raise ValueError("Indices not found.")
+    soma_objects = [obj.name for obj in filepath.iterdir()]
+    if "obs" in soma_objects and "ms" in soma_objects:
+        SOMAType = soma.Experiment
     else:
-        raise ValueError(f"Unknown elem type {type(elem)} when reading indices.")
-
-
-class _MapAccessor:
-    def __init__(self, elem, name, indices=None):
-        self.elem = elem
-        self.indices = indices
-        self.name = name
-
-    def __getitem__(self, key):
-        if self.indices is None:
-            return _try_backed_full(self.elem[key])
-        else:
-            return registry.safer_read_partial(self.elem[key], indices=self.indices)
-
-    def keys(self):
-        return list(self.elem.keys())
-
-    def __repr__(self):
-        """Description of the _MapAccessor object."""
-        descr = f"Accessor for the AnnData attribute {self.name}"
-        descr += f"\n  with keys: {self.keys()}"
-        return descr
-
-
-class _AnnDataAttrsMixin:
-    storage: StorageType
-    _attrs_keys: Mapping[str, list]
-
-    @cached_property
-    def obs(self) -> pd.DataFrame:
-        if "obs" not in self._attrs_keys:
-            return None
-        indices = getattr(self, "indices", None)
-        if indices is not None:
-            indices = (indices[0], slice(None))
-            obj = registry.safer_read_partial(self.storage["obs"], indices=indices)  # type: ignore
-            return _records_to_df(obj)
-        else:
-            return registry.read_dataframe(self.storage["obs"])  # type: ignore
-
-    @cached_property
-    def var(self) -> pd.DataFrame:
-        if "var" not in self._attrs_keys:
-            return None
-        indices = getattr(self, "indices", None)
-        if indices is not None:
-            indices = (indices[1], slice(None))
-            obj = registry.safer_read_partial(self.storage["var"], indices=indices)  # type: ignore
-            return _records_to_df(obj)
-        else:
-            return registry.read_dataframe(self.storage["var"])  # type: ignore
-
-    @cached_property
-    def uns(self):
-        if "uns" not in self._attrs_keys:
-            return None
-        return read_elem(self.storage["uns"])
-
-    @cached_property
-    def X(self):
-        indices = getattr(self, "indices", None)
-        if indices is not None:
-            return registry.safer_read_partial(self.storage["X"], indices=indices)
-        else:
-            return _try_backed_full(self.storage["X"])
-
-    @cached_property
-    def obsm(self):
-        if "obsm" not in self._attrs_keys:
-            return None
-        indices = getattr(self, "indices", None)
-        if indices is not None:
-            indices = (indices[0], slice(None))
-        return _MapAccessor(self.storage["obsm"], "obsm", indices)
-
-    @cached_property
-    def varm(self):
-        if "varm" not in self._attrs_keys:
-            return None
-        indices = getattr(self, "indices", None)
-        if indices is not None:
-            indices = (indices[1], slice(None))
-        return _MapAccessor(self.storage["varm"], "varm", indices)
-
-    @cached_property
-    def obsp(self):
-        if "obsp" not in self._attrs_keys:
-            return None
-        indices = getattr(self, "indices", None)
-        if indices is not None:
-            indices = (indices[0], indices[0])
-        return _MapAccessor(self.storage["obsp"], "obsp", indices)
-
-    @cached_property
-    def varp(self):
-        if "varp" not in self._attrs_keys:
-            return None
-        indices = getattr(self, "indices", None)
-        if indices is not None:
-            indices = (indices[1], indices[1])
-        return _MapAccessor(self.storage["varp"], "varp", indices)
-
-    @cached_property
-    def layers(self):
-        if "layers" not in self._attrs_keys:
-            return None
-        indices = getattr(self, "indices", None)
-        return _MapAccessor(self.storage["layers"], "layers", indices)
-
-    @property
-    def obs_names(self):
-        return self._obs_names
-
-    @property
-    def var_names(self):
-        return self._var_names
-
-    @cached_property
-    def shape(self):
-        return len(self._obs_names), len(self._var_names)
-
-    def to_dict(self):
-        prepare_adata = {}
-
-        prepare_adata["X"] = _to_memory(self.X)
-
-        if "uns" in self._attrs_keys:
-            prepare_adata["uns"] = self.uns
-
-        for attr in ("obs", "var"):
-            if attr in self._attrs_keys:
-                prepare_adata[attr] = getattr(self, attr)
-
-        for attr in ("obsm", "varm", "obsp", "varp", "layers"):
-            if attr in self._attrs_keys:
-                prepare_adata[attr] = {}
-                get_attr = getattr(self, attr)
-                for key in self._attrs_keys[attr]:
-                    prepare_adata[attr][key] = _to_memory(get_attr[key])
-
-        if "raw" in self._attrs_keys:
-            prepare_adata["raw"] = self.raw.to_dict()
-
-        return prepare_adata
-
-    def to_memory(self):
-        adata = AnnData(**self.to_dict())
-        return adata
-
-
-class AnnDataAccessorSubset(_AnnDataAttrsMixin):
-    def __init__(self, storage, indices, attrs_keys, obs_names, var_names, ref_shape):
-        self.storage = storage
-        self.indices = indices
-
-        self._attrs_keys = attrs_keys
-        self._obs_names, self._var_names = obs_names, var_names
-
-        self._ref_shape = ref_shape
-
-    def __getitem__(self, index: Index):
-        """Access a subset of the underlying AnnData object."""
-        oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)
-        new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]
-        if self.indices is not None:
-            oidx = _resolve_idx(self.indices[0], oidx, self._ref_shape[0])
-            vidx = _resolve_idx(self.indices[1], vidx, self._ref_shape[1])
-        return type(self)(
-            self.storage,
-            (oidx, vidx),
-            self._attrs_keys,
-            new_obs_names,
-            new_var_names,
-            self._ref_shape,
-        )
-
-    def __repr__(self):
-        """Description of the object."""
-        n_obs, n_vars = self.shape
-        descr = f"{type(self).__name__} object with n_obs × n_vars = {n_obs} × {n_vars}"
-        for attr, keys in self._attrs_keys.items():
-            descr += f"\n  {attr}: {keys}"
-        return descr
-
-    @cached_property
-    def raw(self):
-        if "raw" not in self._attrs_keys:
-            return None
-        prepare_indices = None
-        if self.indices is not None:
-            oidx = self.indices[0]
-            if isinstance(oidx, np.ndarray) or oidx != slice(None):
-                prepare_indices = oidx, slice(None)
-        return AnnDataRawAccessor(
-            self.storage["raw"],
-            prepare_indices,
-            None,
-            self._obs_names,
-            None,
-            self._ref_shape[0],
-        )
-
-
-class AnnDataRawAccessor(AnnDataAccessorSubset):
-    def __init__(
-        self, storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
-    ):
-        var_raw = storage_raw["var"]
-
-        if var_names is None:
-            var_names = _safer_read_index(var_raw)
-
-        if isinstance(ref_shape, int):
-            ref_shape = ref_shape, len(var_names)
-        elif isinstance(ref_shape, tuple) and len(ref_shape) < 2:
-            ref_shape = ref_shape[0], len(var_names)
-
-        if attrs_keys is None:
-            attrs_keys = {}
-            if isinstance(var_raw, ArrayTypes):
-                attrs_keys["var"] = list(var_raw.dtype.fields.keys())
-            else:
-                # for some reason list(var_raw.keys()) is very slow for zarr
-                # maybe also directly get keys from the underlying mapper
-                attrs_keys["var"] = list(var_raw)
-            if "varm" in storage_raw:
-                varm_keys_raw = list(storage_raw["varm"])
-                if len(varm_keys_raw) > 0:
-                    attrs_keys["varm"] = varm_keys_raw
-
-        super().__init__(
-            storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape
-        )
-
-    @property
-    def raw(self):
-        raise AttributeError
-
-
-class AnnDataAccessor(_AnnDataAttrsMixin):
-    """Cloud-backed AnnData."""
-
-    def __init__(
-        self,
-        connection: OpenFile | None,
-        storage: StorageType,
-        filename: str,
-    ):
-        self._conn = connection
-        self.storage = storage
-
-        self._attrs_keys = registry.keys(self.storage)
-
-        self._name = filename
-
-        self._obs_names = _safer_read_index(self.storage["obs"])  # type: ignore
-        self._var_names = _safer_read_index(self.storage["var"])  # type: ignore
-
-        self._closed = False
-
-    def close(self):
-        """Closes the connection."""
-        if hasattr(self, "storage") and hasattr(self.storage, "close"):
-            self.storage.close()
-        if hasattr(self, "_conn") and hasattr(self._conn, "close"):
-            self._conn.close()
-        self._closed = True
-
-    @property
-    def closed(self):
-        return self._closed
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.close()
-
-    def __getitem__(self, index: Index) -> AnnDataAccessorSubset:
-        """Access a subset of the underlying AnnData object."""
-        oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names)
-        new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx]
-        return AnnDataAccessorSubset(
-            self.storage,
-            (oidx, vidx),
-            self._attrs_keys,
-            new_obs_names,
-            new_var_names,
-            self.shape,
-        )
-
-    def __repr__(self):
-        """Description of the AnnDataAccessor object."""
-        n_obs, n_vars = self.shape
-        descr = f"AnnDataAccessor object with n_obs × n_vars = {n_obs} × {n_vars}"
-        descr += f"\n  constructed for the AnnData object {self._name}"
-        for attr, keys in self._attrs_keys.items():
-            descr += f"\n    {attr}: {keys}"
-        return descr
-
-    @cached_property
-    def raw(self):
-        if "raw" not in self._attrs_keys:
-            return None
-        return AnnDataRawAccessor(
-            self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
-        )
+        SOMAType = soma.Collection
+    return SOMAType.open(filepath_str, mode=mode, context=ctx)
 
 
 @dataclass
@@ -739,7 +100,9 @@ class BackedAccessor:
 
 
 def backed_access(
-    artifact_or_filepath: Artifact | Path, using_key: str | None = None
+    artifact_or_filepath: Artifact | UPath,
+    mode: str = "r",
+    using_key: str | None = None,
 ) -> AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment:
     if isinstance(artifact_or_filepath, Artifact):
         filepath = filepath_from_artifact(artifact_or_filepath, using_key=using_key)
@@ -749,56 +112,23 @@ def backed_access(
     suffix = filepath.suffix
 
     if name == "soma" or suffix == ".tiledbsoma":
-        try:
-            import tiledbsoma as soma
-        except ImportError as e:
-            raise ImportError(
-                "Please install tiledbsoma: pip install tiledbsoma"
-            ) from e
-        filepath_str = filepath.as_posix()
-        if filepath.protocol == "s3":
-            from lamindb_setup.core._settings_storage import get_storage_region
-
-            region = get_storage_region(filepath_str)
-            tiledb_config = {"vfs.s3.region": region}
-            storage_options = filepath.storage_options
-            if "key" in storage_options:
-                tiledb_config["vfs.s3.aws_access_key_id"] = storage_options["key"]
-            if "secret" in storage_options:
-                tiledb_config["vfs.s3.aws_secret_access_key"] = storage_options[
-                    "secret"
-                ]
-            if "token" in storage_options:
-                tiledb_config["vfs.s3.aws_session_token"] = storage_options["token"]
-            ctx = soma.SOMATileDBContext(tiledb_config=tiledb_config)
-            # this is a strange bug
-            # for some reason iterdir futher gives incorrect results
-            # if cache is not invalidated
-            # instead of obs and ms it gives ms and ms in the list of names
-            filepath.fs.invalidate_cache()
-        else:
-            ctx = None
-
-        soma_objects = [obj.name for obj in filepath.iterdir()]
-        if "obs" in soma_objects and "ms" in soma_objects:
-            SOMAType = soma.Experiment
-        else:
-            SOMAType = soma.Collection
-        return SOMAType.open(filepath_str, context=ctx)
+        if mode not in {"r", "w"}:
+            raise ValueError("`mode` should be either 'r' or 'w' for tiledbsoma.")
+        return _open_tiledbsoma(filepath, mode=mode)  # type: ignore
     elif suffix in {".h5", ".hdf5", ".h5ad"}:
-        conn, storage = registry.open("h5py", filepath)
+        conn, storage = registry.open("h5py", filepath, mode=mode)
    elif suffix == ".zarr":
-        conn, storage = registry.open("zarr", filepath)
+        conn, storage = registry.open("zarr", filepath, mode=mode)
    else:
         raise ValueError(
             "object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix, not"
             f" {suffix}."
         )
 
-    if suffix == ".h5ad":
+    is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
+    if is_anndata:
+        if mode != "r":
+            raise ValueError("Can only access `AnnData` with mode='r'.")
         return AnnDataAccessor(conn, storage, name)
     else:
-        if get_spec(storage).encoding_type == "anndata":
-            return AnnDataAccessor(conn, storage, name)
-        else:
-            return BackedAccessor(conn, storage)
+        return BackedAccessor(conn, storage)
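
Note: the `_track_writes_factory` helper added in this release dynamically creates a subclass of an object's class and reassigns the instance to it, so that a `finalize` callback runs exactly once when the object is closed or exits a `with` block. The sketch below is a minimal, self-contained illustration of that pattern: the factory body is copied from the 0.75.1 code shown in the diff above, while the `Handle` class and the usage at the bottom are hypothetical, for illustration only.

from typing import Any, Callable


def _track_writes_factory(obj: Any, finalize: Callable):
    # body copied from the 0.75.1 source in the diff above
    closed: bool = False

    tracked_class = obj.__class__
    type_dict = {"__doc__": tracked_class.__doc__}
    if hasattr(tracked_class, "__slots__"):
        type_dict["__slots__"] = ()
    if hasattr(tracked_class, "__exit__"):

        def __exit__(self, exc_type, exc_val, exc_tb):
            nonlocal closed
            tracked_class.__exit__(self, exc_type, exc_val, exc_tb)
            if not closed:
                finalize()
                closed = True

        type_dict["__exit__"] = __exit__
    if hasattr(tracked_class, "close"):

        def close(self, *args, **kwargs):
            nonlocal closed
            tracked_class.close(self, *args, **kwargs)
            if not closed:
                finalize()
                closed = True

        type_dict["close"] = close

    Track = type(tracked_class.__name__ + "Track", (tracked_class,), type_dict)
    obj.__class__ = Track
    return obj


class Handle:
    """A toy closeable context manager (hypothetical, for illustration)."""

    def close(self):
        pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()


h = _track_writes_factory(Handle(), lambda: print("finalized"))
with h:
    pass  # prints "finalized" exactly once on exit
h.close()  # already finalized; the callback does not fire again

Elsewhere in the release, `backed_access` gains a `mode` parameter: per the diff, `mode="w"` is accepted only for tiledbsoma stores (routed through the new `_open_tiledbsoma` helper), while `AnnData` objects (`.h5ad`, or storage with anndata encoding) still raise unless opened with `mode="r"`.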