lamindb 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- lamindb/__init__.py +33 -26
- lamindb/_finish.py +9 -1
- lamindb/_tracked.py +26 -3
- lamindb/_view.py +2 -3
- lamindb/base/__init__.py +1 -1
- lamindb/base/ids.py +1 -10
- lamindb/base/users.py +1 -4
- lamindb/core/__init__.py +7 -65
- lamindb/core/_compat.py +60 -0
- lamindb/core/_context.py +50 -22
- lamindb/core/_mapped_collection.py +4 -2
- lamindb/core/_settings.py +6 -6
- lamindb/core/_sync_git.py +1 -1
- lamindb/core/_track_environment.py +2 -1
- lamindb/core/datasets/_small.py +3 -3
- lamindb/core/loaders.py +43 -20
- lamindb/core/storage/_anndata_accessor.py +8 -3
- lamindb/core/storage/_backed_access.py +14 -7
- lamindb/core/storage/_pyarrow_dataset.py +24 -9
- lamindb/core/storage/_tiledbsoma.py +8 -6
- lamindb/core/storage/_zarr.py +104 -25
- lamindb/core/storage/objects.py +63 -28
- lamindb/core/storage/paths.py +16 -13
- lamindb/core/types.py +10 -0
- lamindb/curators/__init__.py +176 -149
- lamindb/errors.py +1 -1
- lamindb/integrations/_vitessce.py +4 -4
- lamindb/migrations/0089_subsequent_runs.py +159 -0
- lamindb/migrations/0090_runproject_project_runs.py +73 -0
- lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
- lamindb/models/__init__.py +79 -0
- lamindb/{core → models}/_describe.py +3 -3
- lamindb/{core → models}/_django.py +8 -5
- lamindb/{core → models}/_feature_manager.py +103 -87
- lamindb/{_from_values.py → models/_from_values.py} +5 -2
- lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
- lamindb/{core → models}/_label_manager.py +10 -17
- lamindb/{core/relations.py → models/_relations.py} +8 -1
- lamindb/models/artifact.py +2602 -0
- lamindb/{_can_curate.py → models/can_curate.py} +349 -180
- lamindb/models/collection.py +683 -0
- lamindb/models/core.py +135 -0
- lamindb/models/feature.py +643 -0
- lamindb/models/flextable.py +163 -0
- lamindb/{_parents.py → models/has_parents.py} +55 -49
- lamindb/models/project.py +384 -0
- lamindb/{_query_manager.py → models/query_manager.py} +10 -8
- lamindb/{_query_set.py → models/query_set.py} +64 -32
- lamindb/models/record.py +1762 -0
- lamindb/models/run.py +563 -0
- lamindb/{_save.py → models/save.py} +18 -8
- lamindb/models/schema.py +732 -0
- lamindb/models/transform.py +360 -0
- lamindb/models/ulabel.py +249 -0
- {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/METADATA +6 -6
- lamindb-1.2.0.dist-info/RECORD +95 -0
- lamindb/_artifact.py +0 -1361
- lamindb/_collection.py +0 -440
- lamindb/_feature.py +0 -316
- lamindb/_is_versioned.py +0 -40
- lamindb/_record.py +0 -1065
- lamindb/_run.py +0 -60
- lamindb/_schema.py +0 -347
- lamindb/_storage.py +0 -15
- lamindb/_transform.py +0 -170
- lamindb/_ulabel.py +0 -56
- lamindb/_utils.py +0 -9
- lamindb/base/validation.py +0 -63
- lamindb/core/_data.py +0 -491
- lamindb/core/fields.py +0 -12
- lamindb/models.py +0 -4435
- lamindb-1.1.0.dist-info/RECORD +0 -95
- {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/LICENSE +0 -0
- {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/WHEEL +0 -0
lamindb/core/loaders.py
CHANGED
```diff
@@ -20,26 +20,30 @@ from __future__ import annotations
 import builtins
 import re
 from pathlib import Path
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
-import anndata as ad
 import pandas as pd
+from anndata import read_h5ad
 from lamin_utils import logger
 from lamindb_setup.core.upath import (
     create_path,
     infer_filesystem,
 )
 
-from ._settings import settings
+from ..core._settings import settings
 
 if TYPE_CHECKING:
+    from anndata import AnnData
     from lamindb_setup.core.types import UPathStr
+    from mudata import MuData
+
+    from lamindb.core.types import ScverseDataStructures
 
 try:
-    from .storage._zarr import
+    from ..core.storage._zarr import load_zarr
 except ImportError:
 
-    def
+    def load_zarr(storepath):  # type: ignore
         raise ImportError("Please install zarr: pip install zarr<=2.18.4")
 
 
@@ -47,7 +51,7 @@ is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
 
 
 # tested in lamin-usecases
-def load_fcs(*args, **kwargs) ->
+def load_fcs(*args, **kwargs) -> AnnData:
     """Load an `.fcs` file to `AnnData`."""
     try:
         import readfcs
@@ -62,16 +66,16 @@ def load_tsv(path: UPathStr, **kwargs) -> pd.DataFrame:
     return pd.read_csv(path_sanitized, sep="\t", **kwargs)
 
 
-def load_h5ad(filepath, **kwargs) ->
+def load_h5ad(filepath, **kwargs) -> AnnData:
     """Load an `.h5ad` file to `AnnData`."""
     fs, filepath = infer_filesystem(filepath)
-
-    with fs.open(filepath, mode="rb") as file:
-        adata =
+    compression = kwargs.pop("compression", "infer")
+    with fs.open(filepath, mode="rb", compression=compression) as file:
+        adata = read_h5ad(file, backed=False, **kwargs)
     return adata
 
 
-def load_h5mu(filepath: UPathStr, **kwargs):
+def load_h5mu(filepath: UPathStr, **kwargs) -> MuData:
     """Load an `.h5mu` file to `MuData`."""
     import mudata as md
 
@@ -100,7 +104,7 @@ def load_html(path: UPathStr) -> None | UPathStr:
     return path
 
 
-def load_json(path: UPathStr) -> dict:
+def load_json(path: UPathStr) -> dict[str, Any] | list[Any]:
     """Load `.json` to `dict`."""
     import json
 
@@ -109,7 +113,7 @@ def load_json(path: UPathStr) -> dict:
     return data
 
 
-def load_yaml(path: UPathStr) -> dict:
+def load_yaml(path: UPathStr) -> dict[str, Any] | list[Any]:
     """Load `.yaml` to `dict`."""
     import yaml  # type: ignore
 
@@ -148,11 +152,15 @@ def load_rds(path: UPathStr) -> UPathStr:
 
 FILE_LOADERS = {
     ".csv": pd.read_csv,
+    ".csv.gz": pd.read_csv,
     ".tsv": load_tsv,
+    ".tsv.gz": load_tsv,
     ".h5ad": load_h5ad,
+    ".h5ad.gz": load_h5ad,
     ".parquet": pd.read_parquet,
+    ".parquet.gz": pd.read_parquet,  # this doesn't work for externally gzipped files, REMOVE LATER
     ".fcs": load_fcs,
-    ".zarr":
+    ".zarr": load_zarr,
     ".html": load_html,
     ".json": load_json,
     ".yaml": load_yaml,
@@ -168,17 +176,32 @@ SUPPORTED_SUFFIXES = [sfx for sfx in FILE_LOADERS.keys() if sfx != ".rds"]
 """Suffixes with defined artifact loaders."""
 
 
-def load_to_memory(
+def load_to_memory(
+    filepath: UPathStr, **kwargs
+) -> (
+    pd.DataFrame | ScverseDataStructures | dict[str, Any] | list[Any] | UPathStr | None
+):
     """Load a file into memory.
 
     Returns the filepath if no in-memory form is found.
+    May return None in interactive sessions for images.
     """
     filepath = create_path(filepath)
 
-
+    # infer the correct suffix when .gz is present
+    suffixes = filepath.suffixes
+    suffix = (
+        "".join(suffixes[-2:])
+        if len(suffixes) > 1 and ".gz" in suffixes
+        else filepath.suffix
+    )
 
-    loader = FILE_LOADERS.get(
+    loader = FILE_LOADERS.get(suffix, None)
     if loader is None:
-
-
-
+        raise NotImplementedError(
+            f"There is no loader for {suffix} files. Use .cache() to get the path."
+        )
+
+    filepath = settings._storage_settings.cloud_to_local(filepath, print_progress=True)
+
+    return loader(filepath, **kwargs)
```
lamindb/core/storage/_anndata_accessor.py
CHANGED
```diff
@@ -16,6 +16,7 @@ from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5
 from anndata._io.specs.registry import get_spec, read_elem, read_elem_partial
 from anndata.compat import _read_attr
 from fsspec.implementations.local import LocalFileSystem
+from fsspec.utils import infer_compression
 from lamin_utils import logger
 from lamindb_setup.core.upath import create_mapper, infer_filesystem
 from packaging import version
@@ -152,9 +153,13 @@ registry = AccessRegistry()
 
 
 @registry.register_open("h5py")
-def open(filepath: UPathStr, mode: str = "r"):
+def open(filepath: UPathStr, mode: str = "r", compression: str | None = "infer"):
     fs, file_path_str = infer_filesystem(filepath)
-
+    # we don't open compressed files directly because we need fsspec to uncompress on .open
+    compression = (
+        infer_compression(file_path_str) if compression == "infer" else compression
+    )
+    if isinstance(fs, LocalFileSystem) and compression is None:
         assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!"  # noqa: S101
         return None, h5py.File(file_path_str, mode=mode)
     if mode == "r":
@@ -165,7 +170,7 @@ def open(filepath: UPathStr, mode: str = "r"):
         conn_mode = "ab"
     else:
         raise ValueError(f"Unknown mode {mode}! Should be 'r', 'w' or 'a'.")
-    conn = fs.open(file_path_str, mode=conn_mode)
+    conn = fs.open(file_path_str, mode=conn_mode, compression=compression)
     try:
         storage = h5py.File(conn, mode=mode)
     except Exception as e:
```
lamindb/core/storage/_backed_access.py
CHANGED
```diff
@@ -5,8 +5,6 @@ from typing import TYPE_CHECKING, Any, Callable
 
 from anndata._io.specs.registry import get_spec
 
-from lamindb.models import Artifact
-
 from ._anndata_accessor import AnnDataAccessor, StorageType, registry
 from ._pyarrow_dataset import _is_pyarrow_dataset, _open_pyarrow_dataset
 from ._tiledbsoma import _open_tiledbsoma
@@ -19,6 +17,8 @@ if TYPE_CHECKING:
     from tiledbsoma import Experiment as SOMAExperiment
     from upath import UPath
 
+    from lamindb.models.artifact import Artifact
+
 
 # this dynamically creates a subclass of a context manager class
 # and reassigns it to an instance of the superclass
@@ -70,9 +70,12 @@ def backed_access(
     artifact_or_filepath: Artifact | UPath,
     mode: str = "r",
     using_key: str | None = None,
+    **kwargs,
 ) -> (
     AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment | PyArrowDataset
 ):
+    from lamindb.models import Artifact
+
     if isinstance(artifact_or_filepath, Artifact):
         objectpath, _ = filepath_from_artifact(
             artifact_or_filepath, using_key=using_key
@@ -80,18 +83,22 @@ def backed_access(
     else:
         objectpath = artifact_or_filepath
     name = objectpath.name
-
+    # ignore .gz, only check the real suffix
+    suffixes = objectpath.suffixes
+    suffix = (
+        suffixes[-2] if len(suffixes) > 1 and ".gz" in suffixes else objectpath.suffix
+    )
 
     if name == "soma" or suffix == ".tiledbsoma":
         if mode not in {"r", "w"}:
             raise ValueError("`mode` should be either 'r' or 'w' for tiledbsoma.")
-        return _open_tiledbsoma(objectpath, mode=mode)  # type: ignore
+        return _open_tiledbsoma(objectpath, mode=mode, **kwargs)  # type: ignore
     elif suffix in {".h5", ".hdf5", ".h5ad"}:
-        conn, storage = registry.open("h5py", objectpath, mode=mode)
+        conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs)
     elif suffix == ".zarr":
-        conn, storage = registry.open("zarr", objectpath, mode=mode)
+        conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs)
     elif _is_pyarrow_dataset(objectpath):
-        return _open_pyarrow_dataset(objectpath)
+        return _open_pyarrow_dataset(objectpath, **kwargs)
    else:
        raise ValueError(
            "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "
```
lamindb/core/storage/_pyarrow_dataset.py
CHANGED
```diff
@@ -18,15 +18,30 @@ def _is_pyarrow_dataset(paths: UPath | list[UPath]) -> bool:
     # we don't check here that the filesystem is the same
     # but this is a requirement for pyarrow.dataset.dataset
     if isinstance(paths, list):
-
-    elif paths.
-
+        path_list = paths
+    elif paths.is_dir():
+        path_list = [path for path in paths.rglob("*") if path.suffix != ""]
     else:
-
-
-
-
-
+        path_list = [paths]
+    suffix = None
+    for path in path_list:
+        path_suffixes = path.suffixes
+        # this doesn't work for externally gzipped files, REMOVE LATER
+        path_suffix = (
+            path_suffixes[-2]
+            if len(path_suffixes) > 1 and ".gz" in path_suffixes
+            else path.suffix
+        )
+        if path_suffix not in PYARROW_SUFFIXES:
+            return False
+        elif suffix is None:
+            suffix = path_suffix
+        elif path_suffix != suffix:
+            return False
+    return True
+
+
+def _open_pyarrow_dataset(paths: UPath | list[UPath], **kwargs) -> PyArrowDataset:
     if isinstance(paths, list):
         path0 = paths[0]
         if isinstance(path0, LocalPathClasses):
@@ -38,4 +53,4 @@ def _open_pyarrow_dataset(paths: UPath | list[UPath]) -> PyArrowDataset:
     else:
         paths_str, filesystem = paths.path, paths.fs
 
-    return pyarrow.dataset.dataset(paths_str, filesystem=filesystem)
+    return pyarrow.dataset.dataset(paths_str, filesystem=filesystem, **kwargs)
```
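`_is_pyarrow_dataset` now accepts a single file, a directory, or a list of paths, and reports a dataset only when every file shares one pyarrow-supported suffix. The acceptance rule in isolation (the `PYARROW_SUFFIXES` value below is a stand-in; lamindb's actual constant may differ):

```python
from pathlib import PurePosixPath

# stand-in for lamindb's PYARROW_SUFFIXES constant
PYARROW_SUFFIXES = {".parquet", ".csv", ".feather"}


def is_homogeneous_dataset(names: list[str]) -> bool:
    """True only if every file has the same pyarrow-supported suffix (.gz-aware)."""
    suffix = None
    for name in names:
        suffixes = PurePosixPath(name).suffixes
        s = (
            suffixes[-2]
            if len(suffixes) > 1 and ".gz" in suffixes
            else PurePosixPath(name).suffix
        )
        if s not in PYARROW_SUFFIXES:
            return False
        if suffix is None:
            suffix = s
        elif s != suffix:
            return False
    return True


assert is_homogeneous_dataset(["a.parquet", "b.parquet.gz"])
assert not is_homogeneous_dataset(["a.parquet", "b.csv"])  # mixed suffixes
```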
lamindb/core/storage/_tiledbsoma.py
CHANGED
```diff
@@ -12,8 +12,6 @@ from lamindb_setup.core._settings_storage import get_storage_region
 from lamindb_setup.core.upath import LocalPathClasses, create_path
 from packaging import version
 
-from lamindb.models import Artifact, Run
-
 if TYPE_CHECKING:
     from lamindb_setup.core.types import UPathStr
     from tiledbsoma import Collection as SOMACollection
@@ -21,12 +19,15 @@ if TYPE_CHECKING:
     from tiledbsoma import Measurement as SOMAMeasurement
     from upath import UPath
 
+    from lamindb.models.artifact import Artifact
+    from lamindb.models.run import Run
+
 
 def _load_h5ad_zarr(objpath: UPath):
-    from lamindb.core.loaders import
+    from lamindb.core.loaders import load_h5ad, load_zarr
 
     if objpath.is_dir():
-        adata =
+        adata = load_zarr(objpath, expected_type="anndata")
     else:
         # read only local in backed for now
         # in principle possible to read remote in backed also
@@ -134,9 +135,10 @@ def save_tiledbsoma_experiment(
     except ImportError as e:
         raise ImportError("Please install tiledbsoma: pip install tiledbsoma") from e
 
-    from lamindb.core._data import get_run
     from lamindb.core.storage.paths import auto_storage_key_from_artifact_uid
-    from lamindb.
+    from lamindb.models import Artifact
+    from lamindb.models._is_versioned import create_uid
+    from lamindb.models.artifact import get_run
 
     run = get_run(run)
```
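Note that `_load_h5ad_zarr` now delegates to the new `load_zarr` (defined in `_zarr.py` below) with `expected_type="anndata"`, failing fast when the store holds a different scverse structure. Usage in isolation (the store path is hypothetical):

```python
from lamindb.core.storage._zarr import load_zarr

# raises ValueError if the store turns out to be MuData, SpatialData, or unknown
adata = load_zarr("s3://bucket/pbmc.anndata.zarr", expected_type="anndata")
```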
lamindb/core/storage/_zarr.py
CHANGED
```diff
@@ -1,55 +1,134 @@
 from __future__ import annotations
 
 import warnings
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 import scipy.sparse as sparse
 import zarr
 from anndata import __version__ as anndata_version
 from anndata._io.specs import write_elem
-from anndata._io.specs.registry import get_spec
 from fsspec.implementations.local import LocalFileSystem
-from
+from lamin_utils import logger
+from lamindb_setup.core.upath import S3FSMap, create_mapper, infer_filesystem
 from packaging import version
 
+from lamindb.core._compat import with_package
+
 from ._anndata_sizes import _size_elem, _size_raw, size_adata
 
 if version.parse(anndata_version) < version.parse("0.11.0"):
-    from anndata._io import read_zarr
+    from anndata._io import read_zarr as read_anndata_zarr
 else:
-    from anndata.io import read_zarr
+    from anndata.io import read_zarr as read_anndata_zarr
 
 
 if TYPE_CHECKING:
     from anndata import AnnData
+    from fsspec import FSMap
     from lamindb_setup.core.types import UPathStr
 
+    from lamindb.core.types import ScverseDataStructures
 
-def zarr_is_adata(storepath: UPathStr) -> bool:
-    fs, storepath_str = infer_filesystem(storepath)
-    if isinstance(fs, LocalFileSystem):
-        # this is faster than through an fsspec mapper for local
-        open_obj = storepath_str
-    else:
-        open_obj = create_mapper(fs, storepath_str, check=True)
-    storage = zarr.open(open_obj, mode="r")
-    return get_spec(storage).encoding_type == "anndata"
 
-
-
+def create_zarr_open_obj(
+    storepath: UPathStr, *, check: bool = True
+) -> str | S3FSMap | FSMap:
+    """Creates the correct object that can be used to open a zarr file depending on local or remote location."""
     fs, storepath_str = infer_filesystem(storepath)
+
     if isinstance(fs, LocalFileSystem):
-        # this is faster than through an fsspec mapper for local
         open_obj = storepath_str
     else:
-        open_obj = create_mapper(fs, storepath_str, check=
-
-    return
+        open_obj = create_mapper(fs, storepath_str, check=check)
+
+    return open_obj
+
+
+def _identify_zarr_type_from_storage(
+    storage: zarr.Group,
+) -> Literal["anndata", "mudata", "spatialdata", "unknown"]:
+    """Internal helper to identify the zarr type from an open storage object."""
+    try:
+        if storage.attrs.get("encoding-type", "") == "anndata":
+            return "anndata"
+        elif storage.attrs.get("encoding-type", "") == "MuData":
+            return "mudata"
+        elif "spatialdata_attrs" in storage.attrs:
+            return "spatialdata"
+    except Exception as error:
+        logger.warning(f"an exception occurred {error}")
+    return "unknown"
+
+
+def identify_zarr_type(
+    storepath: UPathStr, *, check: bool = True
+) -> Literal["anndata", "mudata", "spatialdata", "unknown"]:
+    """Identify whether a zarr store is AnnData, MuData, SpatialData, or of unknown type."""
+    # we can add these cheap suffix-based checks later
+    # also need to check whether the .spatialdata.zarr suffix
+    # actually becomes a "standard"; currently we don't recognize it
+    # unlike ".anndata.zarr" in VALID_SUFFIXES
+    # suffixes = UPath(storepath).suffixes
+    # if ".spatialdata" in suffixes:
+    #     return "spatialdata"
+    # elif ".anndata" in suffixes:
+    #     return "anndata"
+
+    open_obj = create_zarr_open_obj(storepath, check=check)
+    try:
+        storage = zarr.open(open_obj, mode="r")
+        return _identify_zarr_type_from_storage(storage)
+    except Exception as error:
+        logger.warning(
+            f"an exception occurred while trying to open the zarr store\n {error}"
+        )
+        return "unknown"
+
+
+def load_zarr(
+    storepath: UPathStr,
+    expected_type: Literal["anndata", "mudata", "spatialdata"] | None = None,
+) -> ScverseDataStructures:
+    """Loads a zarr store and returns the corresponding scverse data structure.
+
+    Args:
+        storepath: Path to the zarr store
+        expected_type: If provided, ensures the zarr store is of this type ("anndata", "mudata", "spatialdata")
+            and raises ValueError if it's not
+    """
+    open_obj = create_zarr_open_obj(storepath, check=True)
+
+    # open the storage once
+    try:
+        storage = zarr.open(open_obj, mode="r")
+    except Exception as error:
+        raise ValueError(f"Could not open zarr store: {error}") from None
+
+    actual_type = _identify_zarr_type_from_storage(storage)
+    if expected_type is not None and actual_type != expected_type:
+        raise ValueError(
+            f"Expected zarr store of type '{expected_type}', but found '{actual_type}'"
+        )
+
+    match actual_type:
+        case "anndata":
+            scverse_obj = read_anndata_zarr(open_obj)
+        case "mudata":
+            scverse_obj = with_package("mudata", lambda mod: mod.read_zarr(open_obj))
+        case "spatialdata":
+            scverse_obj = with_package(
+                "spatialdata", lambda mod: mod.read_zarr(open_obj)
+            )
+        case "unknown" | _:
+            raise ValueError(
+                "Unable to determine zarr store format and therefore cannot load Artifact."
+            )
+    return scverse_obj
 
 
 def write_adata_zarr(
     adata: AnnData, storepath: UPathStr, callback=None, chunks=None, **dataset_kwargs
-):
+) -> None:
     fs, storepath_str = infer_filesystem(storepath)
     store = create_mapper(fs, storepath_str, create=True)
 
@@ -65,7 +144,7 @@ def write_adata_zarr(
     adata_size = None
     cumulative_val = 0
 
-    def
+    def _report_progress(key_write: str | None = None):
         nonlocal adata_size
         nonlocal cumulative_val
 
@@ -91,9 +170,9 @@ def write_adata_zarr(
 
     def _write_elem_cb(f, k, elem, dataset_kwargs):
         write_elem(f, k, elem, dataset_kwargs=dataset_kwargs)
-
+        _report_progress(k)
 
-
+    _report_progress(None)
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore", category=UserWarning, module="zarr")
 
@@ -114,4 +193,4 @@ def write_adata_zarr(
     )
     _write_elem_cb(f, "raw", adata.raw, dataset_kwargs=dataset_kwargs)
     # todo: fix size less than total at the end
-
+    _report_progress(None)
```
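The type detection rests on metadata that the scverse writers leave at the store root: anndata writes `encoding-type: "anndata"`, mudata writes `encoding-type: "MuData"`, and spatialdata stores a `spatialdata_attrs` key. A self-contained way to see this with a locally written store (requires anndata and zarr installed):

```python
import anndata as ad
import numpy as np
import zarr

# write a tiny AnnData store, then inspect the root attrs
# the same way _identify_zarr_type_from_storage does
adata = ad.AnnData(X=np.ones((3, 2), dtype="float32"))
adata.write_zarr("tiny.anndata.zarr")

root = zarr.open("tiny.anndata.zarr", mode="r")
print(root.attrs.get("encoding-type"))  # -> "anndata"
```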
lamindb/core/storage/objects.py
CHANGED
```diff
@@ -1,62 +1,97 @@
 from __future__ import annotations
 
 from pathlib import PurePosixPath
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, TypeAlias
 
 from anndata import AnnData
 from pandas import DataFrame
 
+from lamindb.core._compat import (
+    with_package_obj,
+)
+from lamindb.core.types import ScverseDataStructures
+
 if TYPE_CHECKING:
     from lamindb_setup.core.types import UPathStr
 
-
-def _mudata_is_installed():
-    try:
-        import mudata  # noqa: F401
-    except ImportError:
-        return False
-    return True
+SupportedDataTypes: TypeAlias = DataFrame | ScverseDataStructures
 
 
-def infer_suffix(dmem,
+def infer_suffix(dmem: SupportedDataTypes, format: str | None = None):
     """Infer LaminDB storage file suffix from a data object."""
     if isinstance(dmem, AnnData):
-        if
-            if
+        if format is not None:
+            if format not in {"h5ad", "zarr", "anndata.zarr"}:
                 raise ValueError(
                     "Error when specifying AnnData storage format, it should be"
-                    f" 'h5ad', 'zarr', not '{
+                    f" 'h5ad', 'zarr', not '{format}'. Check 'format'"
                     " or the suffix of 'key'."
                 )
-            return "." +
+            return "." + format
         return ".h5ad"
-
+
+    if isinstance(dmem, DataFrame):
         return ".parquet"
-    else:
-        if _mudata_is_installed():
-            from mudata import MuData
 
-
-
+    if with_package_obj(
+        dmem,
+        "MuData",
+        "mudata",
+        lambda obj: True,  # Just checking type, not calling any method
+    )[0]:
+        return ".h5mu"
+
+    has_spatialdata, spatialdata_suffix = with_package_obj(
+        dmem,
+        "SpatialData",
+        "spatialdata",
+        lambda obj: "."
+        + (
+            format
+            if format is not None and format in {"spatialdata.zarr", "zarr"}
+            else "zarr"
+            if format is None
+            else (_ for _ in ()).throw(
+                ValueError(
+                    "Error when specifying SpatialData storage format, it should be"
+                    f" 'zarr', 'spatialdata.zarr', not '{format}'. Check 'format'"
+                    " or the suffix of 'key'."
+                )
+            )
+        ),
+    )
+    if has_spatialdata:
+        return spatialdata_suffix
+    else:
         raise NotImplementedError
 
 
-def write_to_disk(dmem, filepath: UPathStr):
+def write_to_disk(dmem: SupportedDataTypes, filepath: UPathStr) -> None:
+    """Writes the passed in-memory data to disk at the specified path."""
     if isinstance(dmem, AnnData):
         suffix = PurePosixPath(filepath).suffix
         if suffix == ".h5ad":
             dmem.write_h5ad(filepath)
+            return
         elif suffix == ".zarr":
             dmem.write_zarr(filepath)
+            return
         else:
             raise NotImplementedError
-
+
+    if isinstance(dmem, DataFrame):
         dmem.to_parquet(filepath)
-
-    if _mudata_is_installed():
-        from mudata import MuData
+        return
 
-
-
-
-
+    if with_package_obj(dmem, "MuData", "mudata", lambda obj: obj.write(filepath))[0]:
+        return
+
+    if with_package_obj(
+        dmem,
+        "SpatialData",
+        "spatialdata",
+        lambda obj: obj.write(filepath, overwrite=True),
+    )[0]:
+        return
+
+    raise NotImplementedError
```