lamindb 1.0.5__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +14 -5
- lamindb/_artifact.py +150 -53
- lamindb/_can_curate.py +27 -8
- lamindb/_collection.py +85 -51
- lamindb/_feature.py +177 -41
- lamindb/_finish.py +12 -6
- lamindb/_from_values.py +83 -98
- lamindb/_parents.py +4 -4
- lamindb/_query_set.py +59 -17
- lamindb/_record.py +171 -53
- lamindb/_run.py +4 -4
- lamindb/_save.py +33 -10
- lamindb/_schema.py +135 -38
- lamindb/_storage.py +1 -1
- lamindb/_tracked.py +106 -0
- lamindb/_transform.py +21 -8
- lamindb/_ulabel.py +5 -14
- lamindb/base/validation.py +2 -6
- lamindb/core/__init__.py +13 -14
- lamindb/core/_context.py +7 -7
- lamindb/core/_data.py +29 -25
- lamindb/core/_describe.py +1 -1
- lamindb/core/_django.py +1 -1
- lamindb/core/_feature_manager.py +53 -43
- lamindb/core/_label_manager.py +4 -4
- lamindb/core/_mapped_collection.py +20 -7
- lamindb/core/datasets/__init__.py +6 -1
- lamindb/core/datasets/_core.py +12 -11
- lamindb/core/datasets/_small.py +66 -20
- lamindb/core/exceptions.py +1 -90
- lamindb/core/loaders.py +6 -12
- lamindb/core/relations.py +6 -4
- lamindb/core/storage/_anndata_accessor.py +41 -0
- lamindb/core/storage/_backed_access.py +2 -2
- lamindb/core/storage/_pyarrow_dataset.py +25 -15
- lamindb/core/storage/_tiledbsoma.py +56 -12
- lamindb/core/storage/paths.py +27 -21
- lamindb/core/subsettings/_creation_settings.py +4 -16
- lamindb/curators/__init__.py +2168 -833
- lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
- lamindb/errors.py +96 -0
- lamindb/integrations/_vitessce.py +3 -3
- lamindb/migrations/0069_squashed.py +76 -75
- lamindb/migrations/0075_lamindbv1_part5.py +4 -5
- lamindb/migrations/0082_alter_feature_dtype.py +21 -0
- lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
- lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
- lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
- lamindb/migrations/0086_various.py +95 -0
- lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
- lamindb/migrations/0088_schema_components.py +273 -0
- lamindb/migrations/0088_squashed.py +4372 -0
- lamindb/models.py +420 -153
- {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/METADATA +9 -7
- lamindb-1.1.0.dist-info/RECORD +95 -0
- lamindb/curators/_spatial.py +0 -528
- lamindb/migrations/0052_squashed.py +0 -1261
- lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
- lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
- lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
- lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
- lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
- lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
- lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
- lamindb/migrations/0060_alter_artifact__actions.py +0 -22
- lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
- lamindb/migrations/0062_add_is_latest_field.py +0 -32
- lamindb/migrations/0063_populate_latest_field.py +0 -45
- lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
- lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
- lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
- lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
- lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
- lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
- lamindb-1.0.5.dist-info/RECORD +0 -102
- {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
- {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
lamindb/core/datasets/_small.py
CHANGED

```diff
@@ -8,20 +8,25 @@ import pandas as pd
 
 
 def small_dataset1(
-
+    otype: Literal["DataFrame", "AnnData"],
+    gene_symbols_in_index: bool = False,
     with_typo: bool = False,
-) ->
+) -> pd.DataFrame | ad.AnnData:
     # define the data in the dataset
     # it's a mix of numerical measurements and observation-level metadata
     ifng = "IFNJ" if with_typo else "IFNG"
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD14"]
+    else:
+        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000170458"]
     dataset_dict = {
-
-
-
-        "cell_medium": ["DMSO", ifng, "DMSO"],
+        var_ids[0]: [1, 2, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [5, 6, 7],
+        "cell_medium": pd.Categorical(["DMSO", ifng, "DMSO"]),
         "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
-        "cell_type_by_expert": ["B cell", "T cell", "T cell"],
-        "cell_type_by_model": ["B cell", "T cell", "T cell"],
+        "cell_type_by_expert": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
     }
     # define the dataset-level metadata
     metadata = {
@@ -32,8 +37,10 @@ def small_dataset1(
     }
     # the dataset as DataFrame
     dataset_df = pd.DataFrame(dataset_dict, index=["sample1", "sample2", "sample3"])
-    if
-
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
     else:
         dataset_ad = ad.AnnData(
             dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
@@ -42,14 +49,19 @@
 
 
 def small_dataset2(
-
-
+    otype: Literal["DataFrame", "AnnData"],
+    gene_symbols_in_index: bool = False,
+) -> pd.DataFrame | ad.AnnData:
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD38"]
+    else:
+        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
     dataset_dict = {
-
-
-
-        "cell_medium": ["DMSO", "IFNG", "IFNG"],
-        "cell_type_by_model": ["B cell", "T cell", "T cell"],
+        var_ids[0]: [2, 3, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [4, 2, 3],
+        "cell_medium": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
     }
     metadata = {
         "temperature": 22.6,
@@ -61,11 +73,13 @@
         index=["sample4", "sample5", "sample6"],
     )
     ad.AnnData(
-        dataset_df[
+        dataset_df[var_ids],
         obs=dataset_df[["cell_medium", "cell_type_by_model"]],
     )
-    if
-
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
     else:
         dataset_ad = ad.AnnData(
             dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
@@ -73,6 +87,38 @@
     return dataset_ad
 
 
+def small_dataset3_cellxgene(
+    otype: Literal["DataFrame", "AnnData"] = "AnnData",
+) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
+    # TODO: consider other ids for other organisms
+    # "ENSMUSG00002076988"
+    var_ids = ["invalid_ensembl_id", "ENSG00000000419", "ENSG00000139618"]
+    dataset_dict = {
+        var_ids[0]: [2, 3, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [4, 2, 3],
+        "disease_ontology_term_id": ["MONDO:0004975", "MONDO:0004980", "MONDO:0004980"],
+        "organism": ["human", "human", "human"],
+        "sex": ["female", "male", "unknown"],
+        "tissue": ["lungg", "lungg", "heart"],
+        "donor": ["-1", "1", "2"],
+    }
+    dataset_df = pd.DataFrame(
+        dataset_dict,
+        index=["barcode1", "barcode2", "barcode3"],
+    )
+    dataset_df["tissue"] = dataset_df["tissue"].astype("category")
+    ad.AnnData(
+        dataset_df[var_ids],
+        obs=dataset_df[[key for key in dataset_dict if key not in var_ids]],
+    )
+    if otype == "DataFrame":
+        return dataset_df
+    else:
+        dataset_ad = ad.AnnData(dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:])
+        return dataset_ad
+
+
 def anndata_with_obs() -> ad.AnnData:
     """Create a mini anndata with cell_type, disease and tissue."""
     import anndata as ad
```
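To make the new helper API concrete, here is a usage sketch based on the signatures in the diff above; the assertions are illustrative, not part of lamindb.

```python
# Usage sketch for the reworked helper (signature taken from the diff above);
# the assertions are illustrative, not part of lamindb.
from lamindb.core.datasets import small_dataset1

# DataFrame flavor: dataset-level metadata now travels in `df.attrs`
df = small_dataset1(otype="DataFrame", gene_symbols_in_index=True)
assert "CD8A" in df.columns  # gene symbols instead of Ensembl ids
assert df["cell_medium"].dtype == "category"  # columns are now categorical

# AnnData flavor: the same measurements, with metadata in `adata.uns`
adata = small_dataset1(otype="AnnData")
assert adata.n_obs == 3
```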
lamindb/core/exceptions.py
CHANGED

```diff
@@ -1,90 +1 @@
-
-
-.. autosummary::
-   :toctree: .
-
-   InvalidArgument
-   DoesNotExist
-   ValidationError
-   NotebookNotSaved
-   MissingContextUID
-   UpdateContext
-   IntegrityError
-   RecordNameChangeIntegrityError
-
-"""
-
-# inheriting from SystemExit has the sole purpose of suppressing
-# the traceback - this isn't optimal but the current best solution
-# https://laminlabs.slack.com/archives/C04A0RMA0SC/p1726856875597489
-
-
-class InvalidArgument(SystemExit):
-    """Invalid method or function argument."""
-
-    pass
-
-
-class TrackNotCalled(SystemExit):
-    """`ln.track()` wasn't called."""
-
-    pass
-
-
-class NotebookNotSaved(SystemExit):
-    """Notebook wasn't saved."""
-
-    pass
-
-
-class ValidationError(SystemExit):
-    """Validation error: not mapped in registry."""
-
-    pass
-
-
-# inspired by Django's DoesNotExist
-# equivalent to SQLAlchemy's NoResultFound
-class DoesNotExist(SystemExit):
-    """No record found."""
-
-    pass
-
-
-class InconsistentKey(Exception):
-    """Inconsistent transform or artifact `key`."""
-
-    pass
-
-
-class RecordNameChangeIntegrityError(SystemExit):
-    """Custom exception for name change errors."""
-
-    pass
-
-
-# -------------------------------------------------------------------------------------
-# run context
-# -------------------------------------------------------------------------------------
-
-
-class IntegrityError(Exception):
-    """Integrity error.
-
-    For instance, it's not allowed to delete artifacts outside managed storage
-    locations.
-    """
-
-    pass
-
-
-class MissingContextUID(SystemExit):
-    """User didn't define transform settings."""
-
-    pass
-
-
-class UpdateContext(SystemExit):
-    """Transform settings require update."""
-
-    pass
+from ..errors import *  # noqa: F403 backward compat
```
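The module body is now a single star re-export from the new top-level `lamindb/errors.py` (the +96-line file in the listing above), so old import paths should keep resolving to the same classes. A quick compatibility sketch, assuming `ValidationError` is among the re-exported names:

```python
# Compatibility sketch: both import paths should resolve to the same class,
# assuming ValidationError is among the names re-exported by lamindb/errors.py.
from lamindb.core.exceptions import ValidationError as old_path
from lamindb.errors import ValidationError as new_path

assert old_path is new_path
```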
lamindb/core/loaders.py
CHANGED

```diff
@@ -109,19 +109,13 @@ def load_json(path: UPathStr) -> dict:
     return data
 
 
-def load_yaml(path: UPathStr) -> dict
+def load_yaml(path: UPathStr) -> dict:
     """Load `.yaml` to `dict`."""
-
-
-
-
-
-        return data
-    except ImportError:
-        logger.warning(
-            "Please install PyYAML (`pip install PyYAML`) to load `.yaml` files."
-        )
-        return path
+    import yaml  # type: ignore
+
+    with open(path) as f:
+        data = yaml.safe_load(f)
+    return data
 
 
 def load_image(path: UPathStr) -> None | UPathStr:
```
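`load_yaml` previously fell back to returning the path when PyYAML was missing; it now imports `yaml` unconditionally and always returns a parsed `dict`. A quick sketch (the file name is made up):

```python
# Quick sketch of the fixed loader (file name made up): it now always
# returns the parsed dict, and raises ImportError if PyYAML is missing.
from pathlib import Path

from lamindb.core.loaders import load_yaml

path = Path("params.yaml")
path.write_text("learning_rate: 0.01\nlayers: [64, 32]\n")

params = load_yaml(path)
assert params == {"learning_rate": 0.01, "layers": [64, 32]}
```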
lamindb/core/relations.py
CHANGED

```diff
@@ -8,7 +8,7 @@ from lamindb_setup._connect_instance import (
 )
 from lamindb_setup.core._settings_store import instance_settings_file
 
-from lamindb.models import LinkORM, Record, Schema
+from lamindb.models import LinkORM, Record, Registry, Schema
 
 
 def get_schema_modules(instance: str | None) -> set[str]:
@@ -35,9 +35,11 @@ def get_schema_modules(instance: str | None) -> set[str]:
     return shared_schema_modules
 
 
+# this function here should likely be renamed
+# it maps the __get_name_with_module__() onto the actual model
 def dict_module_name_to_model_name(
-    registry:
-) -> dict[str,
+    registry: Registry, instance: str | None = None
+) -> dict[str, Registry]:
     schema_modules = get_schema_modules(instance)
     d: dict = {
         i.related_model.__get_name_with_module__(): i.related_model
@@ -92,7 +94,7 @@ def get_related_name(features_type: type[Record]) -> str:
             f"Can't create feature sets from {features_type.__name__} because it's not"
             " related to it!\nYou need to create a link model between Schema and"
             " your Record in your custom module.\nTo do so, add a"
-            " line:\
+            " line:\n_feature_sets = models.ManyToMany(Schema,"
             " related_name='mythings')\n"
         )
     return candidates[0]
```
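The clarified error message in `get_related_name` spells out how to link a custom registry to `Schema`. A rough sketch of that pattern; `MyThing` and `mythings` are placeholders taken from the message itself, and note that Django's actual field class is `ManyToManyField` (the message abbreviates it as `ManyToMany`):

```python
# Rough sketch of the link the error message asks for (placeholder names);
# Django's real field class is ManyToManyField.
from django.db import models

from lamindb.models import Record, Schema


class MyThing(Record):
    name = models.CharField(max_length=64)
    # the many-to-many link that lets feature sets be created from MyThing
    _feature_sets = models.ManyToManyField(Schema, related_name="mythings")
```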
lamindb/core/storage/_anndata_accessor.py
CHANGED

```diff
@@ -19,6 +19,7 @@ from fsspec.implementations.local import LocalFileSystem
 from lamin_utils import logger
 from lamindb_setup.core.upath import create_mapper, infer_filesystem
 from packaging import version
+from upath import UPath
 
 if TYPE_CHECKING:
     from collections.abc import Mapping
@@ -741,3 +742,43 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
         return AnnDataRawAccessor(
             self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
         )
+
+
+# get the number of observations in an anndata object or file fast and safely
+def _anndata_n_observations(object: UPathStr | AnnData) -> int | None:
+    if isinstance(object, AnnData):
+        return object.n_obs
+
+    try:
+        objectpath = UPath(object)
+        suffix = objectpath.suffix
+        conn_module = {".h5ad": "h5py", ".zarr": "zarr"}.get(suffix, suffix[1:])
+        conn, storage = registry.open(conn_module, objectpath, mode="r")
+    except Exception as e:
+        logger.warning(f"Could not open {object} to read n_observations: {e}")
+        return None
+
+    n_observations: int | None = None
+    try:
+        obs = storage["obs"]
+        if isinstance(obs, GroupTypes):  # type: ignore
+            if "_index" in obs.attrs:
+                elem_key = _read_attr(obs.attrs, "_index")
+            else:
+                elem_key = next(iter(obs))
+            elem = obs[elem_key]
+            if isinstance(elem, ArrayTypes):  # type: ignore
+                n_observations = elem.shape[0]
+            else:
+                # assume standard obs group
+                n_observations = elem["codes"].shape[0]
+        else:
+            n_observations = obs.shape[0]
+    except Exception as e:
+        logger.warning(f"Could not read n_observations from anndata {object}: {e}")
+    finally:
+        if hasattr(storage, "close"):
+            storage.close()
+        if hasattr(conn, "close"):
+            conn.close()
+    return n_observations
```
lamindb/core/storage/_backed_access.py
CHANGED

```diff
@@ -94,8 +94,8 @@ def backed_access(
         return _open_pyarrow_dataset(objectpath)
     else:
         raise ValueError(
-            "object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix
-            f" {suffix}."
+            "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "
+            f"or be compatible with pyarrow.dataset.dataset, instead of being {suffix} object."
         )
 
     is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
```
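In practice this error surfaces through `Artifact.open()`, which dispatches to `backed_access`; a behavior sketch with a hypothetical artifact key:

```python
# Behavior sketch (hypothetical artifact key): opening an artifact with an
# unsupported suffix now raises the clearer ValueError rewritten above.
import lamindb as ln

artifact = ln.Artifact.get(key="data/table.xyz")  # hypothetical
try:
    artifact.open()
except ValueError as e:
    print(e)  # names the supported suffixes and the offending ".xyz"
```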
lamindb/core/storage/_pyarrow_dataset.py
CHANGED

```diff
@@ -6,26 +6,36 @@ import pyarrow.dataset
 from lamindb_setup.core.upath import LocalPathClasses
 
 if TYPE_CHECKING:
+    from pyarrow.dataset import Dataset as PyArrowDataset
     from upath import UPath
 
 
-PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather")
+PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather", ".ipc")
 
 
-def _is_pyarrow_dataset(
-    # it is assumed here that
-
-
+def _is_pyarrow_dataset(paths: UPath | list[UPath]) -> bool:
+    # it is assumed here that the paths exist
+    # we don't check here that the filesystem is the same
+    # but this is a requirement for pyarrow.dataset.dataset
+    if isinstance(paths, list):
+        suffixes = {path.suffix for path in paths}
+    elif paths.is_file():
+        suffixes = {paths.suffix}
     else:
-
-
-
-
-
-
-
-
+        suffixes = {path.suffix for path in paths.rglob("*") if path.suffix != ""}
+    return len(suffixes) == 1 and suffixes.pop() in PYARROW_SUFFIXES
+
+
+def _open_pyarrow_dataset(paths: UPath | list[UPath]) -> PyArrowDataset:
+    if isinstance(paths, list):
+        path0 = paths[0]
+        if isinstance(path0, LocalPathClasses):
+            paths_str, filesystem = [path.as_posix() for path in paths], None
+        else:
+            paths_str, filesystem = [path.path for path in paths], path0.fs
+    elif isinstance(paths, LocalPathClasses):
+        paths_str, filesystem = paths.as_posix(), None
     else:
-
+        paths_str, filesystem = paths.path, paths.fs
 
-    return pyarrow.dataset.dataset(
+    return pyarrow.dataset.dataset(paths_str, filesystem=filesystem)
```
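The rewritten helpers now accept a list of paths (for example, a sharded dataset), provided all shards share one suffix and one filesystem. A sketch with made-up local parquet shards:

```python
# Sketch of the new list-of-paths support (internal API; shard paths are
# made up): all shards must share one pyarrow suffix and one filesystem.
from upath import UPath

from lamindb.core.storage._pyarrow_dataset import (
    _is_pyarrow_dataset,
    _open_pyarrow_dataset,
)

shards = [UPath("shards/part-0.parquet"), UPath("shards/part-1.parquet")]
if _is_pyarrow_dataset(shards):
    ds = _open_pyarrow_dataset(shards)
    print(ds.count_rows())
```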
lamindb/core/storage/_tiledbsoma.py
CHANGED

```diff
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from typing import TYPE_CHECKING, Literal
+from urllib.parse import urlparse
 
 import pandas as pd
 import pyarrow as pa
@@ -17,6 +18,7 @@ if TYPE_CHECKING:
     from lamindb_setup.core.types import UPathStr
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
+    from tiledbsoma import Measurement as SOMAMeasurement
     from upath import UPath
 
 
@@ -36,9 +38,21 @@ def _load_h5ad_zarr(objpath: UPath):
 
 
 def _tiledb_config_s3(storepath: UPath) -> dict:
-    region = get_storage_region(storepath)
-    tiledb_config = {"vfs.s3.region": region}
     storage_options = storepath.storage_options
+    tiledb_config = {}
+
+    endpoint_url = storage_options.get("endpoint_url", None)
+    if endpoint_url is not None:
+        tiledb_config["vfs.s3.region"] = ""
+        tiledb_config["vfs.s3.use_virtual_addressing"] = "false"
+        parsed = urlparse(endpoint_url)
+        tiledb_config["vfs.s3.scheme"] = parsed.scheme
+        tiledb_config["vfs.s3.endpoint_override"] = (
+            parsed._replace(scheme="").geturl().lstrip("/")
+        )
+    else:
+        tiledb_config["vfs.s3.region"] = get_storage_region(storepath)
+
     if "key" in storage_options:
         tiledb_config["vfs.s3.aws_access_key_id"] = storage_options["key"]
     if "secret" in storage_options:
@@ -51,7 +65,7 @@ def _tiledb_config_s3(storepath: UPath) -> dict:
 
 def _open_tiledbsoma(
     storepath: UPath, mode: Literal["r", "w"] = "r"
-) -> SOMACollection | SOMAExperiment:
+) -> SOMACollection | SOMAExperiment | SOMAMeasurement:
     try:
         import tiledbsoma as soma
     except ImportError as e:
@@ -71,6 +85,8 @@ def _open_tiledbsoma(
     soma_objects = [obj.name for obj in storepath.iterdir()]
     if "obs" in soma_objects and "ms" in soma_objects:
         SOMAType = soma.Experiment
+    elif "var" in soma_objects:
+        SOMAType = soma.Measurement
     else:
         SOMAType = soma.Collection
     return SOMAType.open(storepath_str, mode=mode, context=ctx)
@@ -134,17 +150,17 @@ def save_tiledbsoma_experiment(
     )
     storepath = setup_settings.storage.root / storage_key
 
-    if storepath.protocol == "s3":
+    if storepath.protocol == "s3":  # type: ignore
         ctx = soma.SOMATileDBContext(tiledb_config=_tiledb_config_s3(storepath))
     else:
         ctx = None
 
-
+    storepath_str = storepath.as_posix()
 
     add_run_uid = True
     run_uid_dtype = "category"
     if appending:
-        with soma.Experiment.open(
+        with soma.Experiment.open(storepath_str, mode="r", context=ctx) as store:
            obs_schema = store["obs"].schema
            add_run_uid = "lamin_run_uid" in obs_schema.names
        # this is needed to enable backwards compatibility with tiledbsoma stores
@@ -175,7 +191,7 @@ def save_tiledbsoma_experiment(
     registration_mapping = kwargs.get("registration_mapping", None)
     if registration_mapping is None and (appending or len(adata_objects) > 1):
         registration_mapping = soma_io.register_anndatas(
-            experiment_uri=
+            experiment_uri=storepath_str if appending else None,
             adatas=adata_objects,
             measurement_name=measurement_name,
             obs_field_name=obs_id_name,
@@ -195,19 +211,19 @@ def save_tiledbsoma_experiment(
         assert len(adata_objects) == 1  # noqa: S101
         n_observations = adata_objects[0].n_obs
 
-    logger.important(f"Writing the tiledbsoma store to {
+    logger.important(f"Writing the tiledbsoma store to {storepath_str}")
     for adata_obj in adata_objects:
-        if resize_experiment and soma.Experiment.exists(
+        if resize_experiment and soma.Experiment.exists(storepath_str, context=ctx):
             # can only happen if registration_mapping is not None
             soma_io.resize_experiment(
-
+                storepath_str,
                 nobs=n_observations,
                 nvars=registration_mapping.get_var_shapes(),
                 context=ctx,
             )
             resize_experiment = False
         soma_io.from_anndata(
-
+            storepath_str,
             adata_obj,
             measurement_name,
             context=ctx,
@@ -217,7 +233,7 @@ def save_tiledbsoma_experiment(
         **kwargs,
     )
 
-    artifact = Artifact(
+    artifact = Artifact(  # type: ignore
         storepath,
         key=key,
         description=description,
@@ -229,3 +245,31 @@ def save_tiledbsoma_experiment(
     artifact.otype = "tiledbsoma"
 
     return artifact.save()
+
+
+# this is less defensive than _anndata_n_observations
+# this doesn't really catches errors
+# assumes that the tiledbsoma object is well-formed
+def _soma_store_n_observations(obj) -> int:
+    if obj.soma_type in {"SOMADataFrame", "SOMASparseNDArray", "SOMADenseNDArray"}:
+        return obj.non_empty_domain()[0][1] + 1
+    elif obj.soma_type == "SOMAExperiment":
+        return _soma_store_n_observations(obj["obs"])
+    elif obj.soma_type == "SOMAMeasurement":
+        keys = obj.keys()
+        for slot in ("X", "obsm", "obsp"):
+            if slot in keys:
+                return _soma_store_n_observations(next(iter(obj[slot].values())))
+    elif obj.soma_type == "SOMACollection":
+        n_obs = 0
+        for value in obj.values():
+            n_obs += _soma_store_n_observations(value)
+        return n_obs
+    raise ValueError(
+        "Could not infer the number of observations from the tiledbsoma object."
+    )
+
+
+def _soma_n_observations(objectpath: UPath) -> int:
+    with _open_tiledbsoma(objectpath, mode="r") as store:
+        return _soma_store_n_observations(store)
```
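The endpoint handling added to `_tiledb_config_s3` can be checked standalone: for a custom S3-compatible endpoint the URL is split into a scheme and a host override. For example, with a local MinIO-style endpoint:

```python
# Standalone check of the endpoint parsing used in _tiledb_config_s3 above;
# the endpoint URL is an example (e.g. a local MinIO deployment).
from urllib.parse import urlparse

endpoint_url = "http://localhost:9000"
parsed = urlparse(endpoint_url)

assert parsed.scheme == "http"  # -> tiledb "vfs.s3.scheme"
assert parsed._replace(scheme="").geturl().lstrip("/") == "localhost:9000"
# -> tiledb "vfs.s3.endpoint_override"
```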
lamindb/core/storage/paths.py
CHANGED

```diff
@@ -4,7 +4,6 @@ import shutil
 from typing import TYPE_CHECKING
 
 import fsspec
-from lamin_utils import logger
 from lamindb_setup.core import StorageSettings
 from lamindb_setup.core.upath import (
     LocalPathClasses,
@@ -42,25 +41,27 @@ def auto_storage_key_from_artifact_uid(uid: str, suffix: str, is_dir: bool) -> s
     return storage_key
 
 
-def
-
-    # from the equality checks below
-    # and for fsspec.utils.get_protocol
-    path_str = str(path)
-    root_str = str(root)
-    root_protocol = fsspec.utils.get_protocol(root_str)
-    # check that the protocols are the same first
-    if fsspec.utils.get_protocol(path_str) != root_protocol:
-        return False
-    if root_protocol in {"http", "https"}:
-        # in this case it is a base url, not a file
-        # so formally does not exist
+def _safely_resolve(upath: UPath) -> UPath:
+    if upath.protocol in {"http", "https"}:
         resolve_kwargs = {"follow_redirects": False}
     else:
         resolve_kwargs = {}
-    return (
-
-
+    return upath.resolve(**resolve_kwargs)
+
+
+def check_path_is_child_of_root(path: UPathStr, root: UPathStr) -> bool:
+    if fsspec.utils.get_protocol(str(path)) != fsspec.utils.get_protocol(str(root)):
+        return False
+    path_upath = _safely_resolve(UPath(path))
+    root_upath = _safely_resolve(UPath(root))
+    if path_upath.protocol == "s3":
+        endpoint_path = path_upath.storage_options.get("endpoint_url", "")
+        endpoint_root = root_upath.storage_options.get("endpoint_url", "")
+        if endpoint_path != endpoint_root:
+            return False
+    # str is needed to eliminate UPath storage_options
+    # which affect equality checks
+    return UPath(str(root_upath)) in UPath(str(path_upath)).parents
 
 
 # returns filepath and root of the storage
@@ -169,10 +170,15 @@ def store_file_or_folder(
 
 
 def delete_storage_using_key(
-    artifact: Artifact,
-
+    artifact: Artifact,
+    storage_key: str,
+    raise_file_not_found_error: bool = True,
+    using_key: str | None = None,
+) -> None | str:
     filepath, _ = attempt_accessing_path(artifact, storage_key, using_key=using_key)
-    delete_storage(
+    return delete_storage(
+        filepath, raise_file_not_found_error=raise_file_not_found_error
+    )
 
 
 def delete_storage(
@@ -191,5 +197,5 @@ def delete_storage(
     elif raise_file_not_found_error:
         raise FileNotFoundError(f"{storagepath} is not an existing path!")
     else:
-
+        return "did-not-delete"
     return None
```
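A semantics sketch for the rewritten check, using made-up local paths; the same parents-membership test applies to `s3://` URLs, with the extra `endpoint_url` comparison shown above:

```python
# Semantics sketch for the rewritten check; the example paths are made up.
from lamindb.core.storage.paths import check_path_is_child_of_root

assert check_path_is_child_of_root("/data/store/a/file.parquet", "/data/store")
assert not check_path_is_child_of_root("/data/other/file.parquet", "/data/store")
# different protocols never match:
assert not check_path_is_child_of_root("s3://bucket/key", "/data/store")
```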
lamindb/core/subsettings/_creation_settings.py
CHANGED

```diff
@@ -1,13 +1,8 @@
-from typing import Literal
-
-
 class CreationSettings:
-
-
-    ] = "warn_return_existing"
-    """Behavior if file hash exists (default `"warn_return_existing"`).
+    search_names: bool = True
+    """Switch off to speed up creating records (default `True`).
 
-
+    If `True`, search for alternative names and avoids duplicates.
 
     FAQ: :doc:`/faq/idempotency`
     """
@@ -18,15 +13,8 @@ class CreationSettings:
 
     It speeds up file creation by about a factor 100.
     """
-    search_names: bool = True
-    """To speed up creating records (default `True`).
-
-    If `True`, search for alternative names.
-
-    FAQ: :doc:`/faq/idempotency`
-    """
     artifact_silence_missing_run_warning: bool = False
-    """Silence warning about missing run & transform during artifact creation."""
+    """Silence warning about missing run & transform during artifact creation (default `False`)."""
     _artifact_use_virtual_keys: bool = True
    """Treat `key` parameter in :class:`~lamindb.Artifact` as virtual.
 
```
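These flags are reachable at runtime through the settings object; a sketch of the typical toggle, assuming the usual `ln.settings.creation` accessor:

```python
# Sketch of toggling the setting (assuming the usual accessor at
# ln.settings.creation): switching off search_names skips the
# duplicate-avoiding name search during bulk record creation.
import lamindb as ln

ln.settings.creation.search_names = False
try:
    labels = [ln.ULabel(name=f"batch{i}") for i in range(1000)]
    ln.save(labels)
finally:
    ln.settings.creation.search_names = True  # restore the default
```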