lamindb 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +14 -5
- lamindb/_artifact.py +174 -57
- lamindb/_can_curate.py +27 -8
- lamindb/_collection.py +85 -51
- lamindb/_feature.py +177 -41
- lamindb/_finish.py +222 -81
- lamindb/_from_values.py +83 -98
- lamindb/_parents.py +4 -4
- lamindb/_query_set.py +59 -17
- lamindb/_record.py +171 -53
- lamindb/_run.py +4 -4
- lamindb/_save.py +33 -10
- lamindb/_schema.py +135 -38
- lamindb/_storage.py +1 -1
- lamindb/_tracked.py +106 -0
- lamindb/_transform.py +21 -8
- lamindb/_ulabel.py +5 -14
- lamindb/base/validation.py +2 -6
- lamindb/core/__init__.py +13 -14
- lamindb/core/_context.py +39 -36
- lamindb/core/_data.py +29 -25
- lamindb/core/_describe.py +1 -1
- lamindb/core/_django.py +1 -1
- lamindb/core/_feature_manager.py +54 -44
- lamindb/core/_label_manager.py +4 -4
- lamindb/core/_mapped_collection.py +20 -7
- lamindb/core/datasets/__init__.py +6 -1
- lamindb/core/datasets/_core.py +12 -11
- lamindb/core/datasets/_small.py +66 -20
- lamindb/core/exceptions.py +1 -90
- lamindb/core/loaders.py +7 -13
- lamindb/core/relations.py +6 -4
- lamindb/core/storage/_anndata_accessor.py +41 -0
- lamindb/core/storage/_backed_access.py +2 -2
- lamindb/core/storage/_pyarrow_dataset.py +25 -15
- lamindb/core/storage/_tiledbsoma.py +56 -12
- lamindb/core/storage/paths.py +41 -22
- lamindb/core/subsettings/_creation_settings.py +4 -16
- lamindb/curators/__init__.py +2168 -833
- lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
- lamindb/errors.py +96 -0
- lamindb/integrations/_vitessce.py +3 -3
- lamindb/migrations/0069_squashed.py +76 -75
- lamindb/migrations/0075_lamindbv1_part5.py +4 -5
- lamindb/migrations/0082_alter_feature_dtype.py +21 -0
- lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
- lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
- lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
- lamindb/migrations/0086_various.py +95 -0
- lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
- lamindb/migrations/0088_schema_components.py +273 -0
- lamindb/migrations/0088_squashed.py +4372 -0
- lamindb/models.py +423 -156
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/METADATA +10 -7
- lamindb-1.1.0.dist-info/RECORD +95 -0
- lamindb/curators/_spatial.py +0 -528
- lamindb/migrations/0052_squashed.py +0 -1261
- lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
- lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
- lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
- lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
- lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
- lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
- lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
- lamindb/migrations/0060_alter_artifact__actions.py +0 -22
- lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
- lamindb/migrations/0062_add_is_latest_field.py +0 -32
- lamindb/migrations/0063_populate_latest_field.py +0 -45
- lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
- lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
- lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
- lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
- lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
- lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
- lamindb-1.0.4.dist-info/RECORD +0 -102
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
lamindb/__init__.py
CHANGED
@@ -20,7 +20,7 @@ Registries.
    User
    Storage
    Feature
-   FeatureSet
+   Schema
    Param
    Collection
    Project
@@ -33,7 +33,6 @@ Key functionality.
    :toctree: .

    connect
-   Curator
    view
    save

@@ -44,23 +43,32 @@ Modules and settings.

    integrations
    context
+   curators
    settings
+   errors
    setup
    UPath
    base
    core

+Backward compatibility.
+
+.. autosummary::
+   :toctree: .
+
+   FeatureSet
+
 """

 # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
-__version__ = "1.0.4"
+__version__ = "1.1.0"

 from lamindb_setup._check_setup import InstanceNotSetupError as _InstanceNotSetupError
 from lamindb_setup._check_setup import _check_instance_setup
 from lamindb_setup._connect_instance import connect
 from lamindb_setup.core.upath import UPath

-from . import base, setup
+from . import base, errors, setup


 def __getattr__(name):
@@ -86,10 +94,11 @@ if _check_instance_setup(from_module="lamindb"):
         integrations,
     )
     from ._save import save
+    from ._tracked import tracked
     from ._view import view
     from .core._context import context
     from .core._settings import settings
-    from .curators import Curator
+    from .curators import CatManager as Curator
     from .models import (
         Artifact,
         Collection,
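To make the re-exported names above concrete, here is a minimal, hedged sketch of how the 1.1.0 top-level namespace could be used. It assumes a connected LaminDB instance; the `my_step`/`input_key` names are illustrative, and the `@ln.tracked()` call pattern beyond the import shown above is an assumption rather than a documented signature.

```python
import lamindb as ln  # assumes `lamin connect <instance>` has been run

# exceptions now live in `lamindb.errors` instead of `lamindb.core.exceptions`
from lamindb import errors

# `Schema` is the new name of the former `FeatureSet` registry; the old name
# stays importable per the "Backward compatibility" autosummary above
print(ln.Schema, ln.FeatureSet)

# `ln.Curator` is now an alias for the categorical curator (`CatManager`)
print(ln.Curator)


# hypothetical pipeline step: the newly exported `tracked` decorator registers
# the function call as a run (exact decorator arguments are an assumption)
@ln.tracked()
def my_step(input_key: str) -> None:
    ...
```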
lamindb/_artifact.py
CHANGED
@@ -23,6 +23,8 @@ from lamindb_setup.core.upath import (
     get_stat_file_cloud,
 )

+from lamindb._record import _get_record_kwargs
+from lamindb.errors import FieldValidationError
 from lamindb.models import Artifact, FeatureManager, ParamManager, Run, Storage

 from ._parents import view_lineage
@@ -32,10 +34,9 @@ from .core._data import (
     describe,
     get_run,
     save_schema_links,
-    save_feature_sets,
+    save_staged_feature_sets,
 )
 from .core._settings import settings
-from .core.exceptions import IntegrityError, InvalidArgument
 from .core.loaders import load_to_memory
 from .core.storage import (
     LocalPathClasses,
@@ -44,7 +45,9 @@ from .core.storage import (
     infer_suffix,
     write_to_disk,
 )
+from .core.storage._anndata_accessor import _anndata_n_observations
 from .core.storage._pyarrow_dataset import PYARROW_SUFFIXES
+from .core.storage._tiledbsoma import _soma_n_observations
 from .core.storage.objects import _mudata_is_installed
 from .core.storage.paths import (
     AUTO_KEY_PREFIX,
@@ -58,13 +61,14 @@ from .core.versioning import (
     create_uid,
     message_update_key_in_version_family,
 )
+from .errors import IntegrityError, InvalidArgument

 try:
     from .core.storage._zarr import zarr_is_adata
 except ImportError:

     def zarr_is_adata(storepath):  # type: ignore
-        raise ImportError("Please install zarr: pip install zarr")
+        raise ImportError("Please install zarr: pip install zarr<=2.18.4")


 if TYPE_CHECKING:
@@ -73,6 +77,7 @@ if TYPE_CHECKING:
     from pyarrow.dataset import Dataset as PyArrowDataset
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
+    from tiledbsoma import Measurement as SOMAMeasurement

     from lamindb.core.storage._backed_access import AnnDataAccessor, BackedAccessor

@@ -83,6 +88,7 @@ def process_pathlike(
     using_key: str | None,
     skip_existence_check: bool = False,
 ) -> tuple[Storage, bool]:
+    """Determines the appropriate storage for a given path and whether to use an existing storage key."""
     if not skip_existence_check:
         try:  # check if file exists
             if not filepath.exists():
@@ -112,6 +118,10 @@ def process_pathlike(
                 hf_path.path_in_repo = ""
                 new_root = "hf://" + hf_path.unresolve()
             else:
+                if filepath.protocol == "s3":
+                    # check that endpoint_url didn't propagate here
+                    # as a part of the path string
+                    assert "?" not in filepath.path  # noqa: S101
                 new_root = list(filepath.parents)[-1]
             # do not register remote storage locations on hub if the current instance
             # is not managed on the hub
@@ -192,8 +202,7 @@ def process_data(
         use_existing_storage_key = False
     else:
         raise NotImplementedError(
-            f"Do not know how to create a artifact object from {data}, pass a path"
-            " instead!"
+            f"Do not know how to create a artifact object from {data}, pass a path instead!"
         )
     return memory_rep, path, suffix, storage, use_existing_storage_key

@@ -205,6 +214,7 @@ def get_stat_or_artifact(
     is_replace: bool = False,
     instance: str | None = None,
 ) -> tuple[int, str | None, str | None, int | None, Artifact | None] | Artifact:
+    """Retrieves file statistics or an existing artifact based on the path, hash, and key."""
     n_files = None
     if settings.creation.artifact_skip_size_hash:
         return None, None, None, n_files, None
@@ -248,29 +258,14 @@
         )
         previous_artifact_version = result[0]
     if artifact_with_same_hash_exists:
-
-
-
-
-
-            )
-
-
-            logger.warning(
-                "creating new Artifact object despite existing artifact with same hash:"
-                f" {result[0]}"
-            )
-            return size, hash, hash_type, n_files, None
-        else:
-            if result[0]._branch_code == -1:
-                raise FileExistsError(
-                    f"You're trying to re-create this artifact in trash: {result[0]}"
-                    "Either permanently delete it with `artifact.delete(permanent=True)` or restore it with `artifact.restore()`"
-                )
-            logger.important(
-                f"returning existing artifact with same hash: {result[0]}; if you intended to query to track this artifact as an input, use: ln.Artifact.get()"
-            )
-            return result[0]
+        message = "found artifact with same hash"
+        if result[0]._branch_code == -1:
+            result[0].restore()
+            message = "restored artifact with same hash from trash"
+        logger.important(
+            f"{message}: {result[0]}; to track this artifact as an input, use: ln.Artifact.get()"
+        )
+        return result[0]
     else:
         return size, hash, hash_type, n_files, previous_artifact_version

@@ -441,7 +436,7 @@ def log_storage_hint(
         root_path = Path(storage.root)  # type: ignore
         if check_path_is_child_of_root(root_path, Path.cwd()):
             # only display the relative path, not the fully resolved path
-            display_root = root_path.relative_to(Path.cwd())
+            display_root = root_path.relative_to(Path.cwd())  # type: ignore
             hint += f"path in storage '{display_root}'"  # type: ignore
         else:
             hint += "path content will be copied to default storage upon `save()`"
@@ -480,7 +475,7 @@ def data_is_mudata(data: MuData | UPathStr) -> bool:
     if isinstance(data, MuData):
         return True
     if isinstance(data, (str, Path)):
-        return UPath(data).suffix
+        return UPath(data).suffix == ".h5mu"
     return False


@@ -506,8 +501,8 @@ def _check_otype_artifact(data: Any, otype: str | None = None):


 def __init__(artifact: Artifact, *args, **kwargs):
-    artifact.features = FeatureManager(artifact)
-    artifact.params = ParamManager(artifact)
+    artifact.features = FeatureManager(artifact)  # type: ignore
+    artifact.params = ParamManager(artifact)  # type: ignore
     # Below checks for the Django-internal call in from_db()
     # it'd be better if we could avoid this, but not being able to create a Artifact
     # from data with the default constructor renders the central class of the API
@@ -559,9 +554,9 @@ def __init__(artifact: Artifact, *args, **kwargs):
         logger.warning("`type` will be removed soon, please use `kind`")
         kind = kwargs.pop("type")
     if not len(kwargs) == 0:
-
-
-            f" can be passed, you passed: {kwargs}"
+        valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Artifact)])
+        raise FieldValidationError(
+            f"Only {valid_keywords} can be passed, you passed: {kwargs}"
         )
     if revises is not None and key is not None and revises.key != key:
         note = message_update_key_in_version_family(
@@ -676,6 +671,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
 def from_df(
     cls,
     df: pd.DataFrame,
+    *,
     key: str | None = None,
     description: str | None = None,
     run: Run | None = None,
@@ -683,7 +679,7 @@ def from_df(
     **kwargs,
 ) -> Artifact:
     """{}"""  # noqa: D415
-    artifact = Artifact(
+    artifact = Artifact(  # type: ignore
         data=df,
         key=key,
         run=run,
@@ -701,6 +697,7 @@ def from_df(
 def from_anndata(
     cls,
     adata: AnnData | UPathStr,
+    *,
     key: str | None = None,
     description: str | None = None,
     run: Run | None = None,
@@ -710,7 +707,8 @@ def from_anndata(
     """{}"""  # noqa: D415
     if not data_is_anndata(adata):
         raise ValueError("data has to be an AnnData object or a path to AnnData-like")
-    artifact = Artifact(
+    _anndata_n_observations(adata)
+    artifact = Artifact(  # type: ignore
         data=adata,
         key=key,
         run=run,
@@ -720,6 +718,17 @@ def from_anndata(
         kind="dataset",
         **kwargs,
     )
+    # this is done instead of _anndata_n_observations(adata)
+    # because we need a proper path through create_path for cloud paths
+    # for additional upath options etc that create_path adds
+    obj_for_obs: AnnData | UPath
+    if hasattr(artifact, "_memory_rep") and artifact._memory_rep is not None:
+        obj_for_obs = artifact._memory_rep
+    else:
+        # returns ._local_filepath for local files
+        # and the proper path through create_path for cloud paths
+        obj_for_obs = artifact.path
+    artifact.n_observations = _anndata_n_observations(obj_for_obs)
     return artifact


@@ -728,6 +737,7 @@ def from_anndata(
 def from_mudata(
     cls,
     mdata: MuData,
+    *,
     key: str | None = None,
     description: str | None = None,
     run: Run | None = None,
@@ -735,7 +745,7 @@ def from_mudata(
     **kwargs,
 ) -> Artifact:
     """{}"""  # noqa: D415
-    artifact = Artifact(
+    artifact = Artifact(  # type: ignore
         data=mdata,
         key=key,
         run=run,
@@ -745,6 +755,38 @@ def from_mudata(
         kind="dataset",
         **kwargs,
     )
+    artifact.n_observations = mdata.n_obs
+    return artifact
+
+
+@classmethod  # type: ignore
+@doc_args(Artifact.from_tiledbsoma.__doc__)
+def from_tiledbsoma(
+    cls,
+    path: UPathStr,
+    *,
+    key: str | None = None,
+    description: str | None = None,
+    run: Run | None = None,
+    revises: Artifact | None = None,
+    **kwargs,
+) -> Artifact:
+    """{}"""  # noqa: D415
+    if UPath(path).suffix != ".tiledbsoma":
+        raise ValueError(
+            "A tiledbsoma store should have .tiledbsoma suffix to be registered."
+        )
+    artifact = Artifact(  # type: ignore
+        data=path,
+        key=key,
+        run=run,
+        description=description,
+        revises=revises,
+        otype="tiledbsoma",
+        kind="dataset",
+        **kwargs,
+    )
+    artifact.n_observations = _soma_n_observations(artifact.path)
     return artifact


@@ -753,8 +795,8 @@ def from_mudata(
 def from_dir(
     cls,
     path: UPathStr,
-    key: str | None = None,
     *,
+    key: str | None = None,
     run: Run | None = None,
 ) -> list[Artifact]:
     """{}"""  # noqa: D415
@@ -845,7 +887,7 @@ def from_dir(
 # docstring handled through attach_func_to_class_method
 def replace(
     self,
-    data: UPathStr,
+    data: UPathStr | pd.DataFrame | AnnData | MuData,
     run: Run | None = None,
     format: str | None = None,
 ) -> None:
@@ -867,7 +909,18 @@ def replace(

     check_path_in_storage = privates["check_path_in_storage"]
     if check_path_in_storage:
-
+        err_msg = (
+            "Can only replace with a local path not in any Storage. "
+            f"This data is in {Storage.objects.get(id=kwargs['storage_id'])}."
+        )
+        raise ValueError(err_msg)
+
+    _overwrite_versions = kwargs["_overwrite_versions"]
+    if self._overwrite_versions != _overwrite_versions:
+        err_msg = "It is not allowed to replace "
+        err_msg += "a folder" if self._overwrite_versions else "a file"
+        err_msg += " with " + ("a folder." if _overwrite_versions else "a file.")
+        raise ValueError(err_msg)

     if self.key is not None and not self._key_is_virtual:
         key_path = PurePosixPath(self.key)
@@ -902,6 +955,7 @@ def replace(
     self._hash_type = kwargs["_hash_type"]
     self.run_id = kwargs["run_id"]
     self.run = kwargs["run"]
+    self.n_files = kwargs["n_files"]

     self._local_filepath = privates["local_filepath"]
     self._cloud_filepath = privates["cloud_filepath"]
@@ -921,12 +975,25 @@ inconsistent_state_msg = (
 def open(
     self, mode: str = "r", is_run_input: bool | None = None
 ) -> (
-    AnnDataAccessor
+    AnnDataAccessor
+    | BackedAccessor
+    | SOMACollection
+    | SOMAExperiment
+    | SOMAMeasurement
+    | PyArrowDataset
 ):
     if self._overwrite_versions and not self.is_latest:
         raise ValueError(inconsistent_state_msg)
     # ignore empty suffix for now
-    suffixes = (
+    suffixes = (
+        "",
+        ".h5",
+        ".hdf5",
+        ".h5ad",
+        ".zarr",
+        ".anndata.zarr",
+        ".tiledbsoma",
+    ) + PYARROW_SUFFIXES
     if self.suffix not in suffixes:
         raise ValueError(
             "Artifact should have a zarr, h5, tiledbsoma object"
@@ -950,8 +1017,28 @@ def open(
     localpath = setup_settings.paths.cloud_to_local_no_update(
         filepath, cache_key=cache_key
     )
-    if
-
+    if is_tiledbsoma_w:
+        open_cache = False
+    else:
+        open_cache = not isinstance(
+            filepath, LocalPathClasses
+        ) and not filepath.synchronize(localpath, just_check=True)
+    if open_cache:
+        try:
+            access = backed_access(localpath, mode, using_key)
+        except Exception as e:
+            if isinstance(filepath, LocalPathClasses):
+                raise e
+            logger.warning(
+                f"The cache might be corrupted: {e}. Trying to open directly."
+            )
+            access = backed_access(filepath, mode, using_key)
+            # happens only if backed_access has been successful
+            # delete the corrupted cache
+            if localpath.is_dir():
+                shutil.rmtree(localpath)
+            else:
+                localpath.unlink(missing_ok=True)
     else:
         access = backed_access(filepath, mode, using_key)
     if is_tiledbsoma_w:
@@ -993,10 +1080,10 @@ def _synchronize_cleanup_on_error(
         cache_path = setup_settings.paths.cloud_to_local_no_update(
             filepath, cache_key=cache_key
         )
-        if cache_path.
-            cache_path.unlink(missing_ok=True)
-        elif cache_path.is_dir():
+        if cache_path.is_dir():
             shutil.rmtree(cache_path)
+        else:
+            cache_path.unlink(missing_ok=True)
         raise e
     return cache_path

@@ -1013,8 +1100,24 @@ def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
         self, using_key=settings._using_key
     )
     cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
-
-
+    try:
+        # cache_path is local so doesn't trigger any sync in load_to_memory
+        access_memory = load_to_memory(cache_path, **kwargs)
+    except Exception as e:
+        # just raise the exception if the original path is local
+        if isinstance(filepath, LocalPathClasses):
+            raise e
+        logger.warning(
+            f"The cache might be corrupted: {e}. Retrying to synchronize."
+        )
+        # delete the existing cache
+        if cache_path.is_dir():
+            shutil.rmtree(cache_path)
+        else:
+            cache_path.unlink(missing_ok=True)
+        # download again and try to load into memory
+        cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
+        access_memory = load_to_memory(cache_path, **kwargs)
     # only call if load is successfull
     _track_run_input(self, is_run_input)
     return access_memory
@@ -1154,15 +1257,27 @@ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
     using_key = None
     if "using" in kwargs:
         using_key = kwargs["using"]
-
+    exception_upload = check_and_attempt_upload(
         self, using_key, access_token=access_token, print_progress=print_progress
     )
-    if
+    if exception_upload is not None:
+        # we do not want to raise file not found on cleanup if upload of a file failed
+        # often it is ACID in the filesystem itself
+        # for example, s3 won't have the failed file, so just skip the delete in this case
+        raise_file_not_found_error = False
         self._delete_skip_storage()
-
-
-
-
+    else:
+        # this is the case when it is cleaned on .replace
+        raise_file_not_found_error = True
+    # this is triggered by an exception in check_and_attempt_upload or by replace.
+    exception_clear = check_and_attempt_clearing(
+        self, raise_file_not_found_error=raise_file_not_found_error, using_key=using_key
+    )
+    if exception_upload is not None:
+        raise RuntimeError(exception_upload)
+    if exception_clear is not None:
+        raise RuntimeError(exception_clear)
+    # this is only for keep_artifacts_local
     if local_path is not None and not state_was_adding:
         # only move the local artifact to cache if it was not newly created
         local_path_cache = ln_setup.settings.cache_dir / local_path.name
@@ -1177,7 +1292,7 @@ def save(self, upload: bool | None = None, **kwargs) -> Artifact:


 def _save_skip_storage(file, **kwargs) -> None:
-    save_feature_sets(file)
+    save_staged_feature_sets(file)
     super(Artifact, file).save(**kwargs)
     save_schema_links(file)

@@ -1213,6 +1328,7 @@ METHOD_NAMES = [
     "from_anndata",
     "from_df",
     "from_mudata",
+    "from_tiledbsoma",
     "open",
     "cache",
     "load",
@@ -1236,6 +1352,7 @@ for name in METHOD_NAMES:
     attach_func_to_class_method(name, Artifact, globals())

 # privates currently dealt with separately
+# mypy: ignore-errors
 Artifact._delete_skip_storage = _delete_skip_storage
 Artifact._save_skip_storage = _save_skip_storage
 Artifact._cache_path = _cache_path
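Taken together, the hunks above change how artifacts are constructed: `from_df`/`from_anndata`/`from_mudata` now take keyword-only arguments after the data argument, a new `from_tiledbsoma` constructor registers existing SOMA stores, and creating an artifact whose hash already exists returns (or restores from trash) the existing record instead of warning and duplicating. A hedged usage sketch, assuming a connected instance with write access; keys and paths below are hypothetical:

```python
import pandas as pd
import lamindb as ln

df = pd.DataFrame({"feat1": [1, 2], "feat2": [3, 4]})

# keyword-only arguments after the data argument
artifact = ln.Artifact.from_df(df, key="examples/table.parquet").save()

# re-creating content with the same hash now returns the existing artifact
# (assumption: the content hash matches the artifact saved above)
same = ln.Artifact.from_df(df, key="examples/table.parquet")
assert same.id == artifact.id

# new in 1.1.0: register an existing tiledbsoma store; the suffix must be ".tiledbsoma"
soma = ln.Artifact.from_tiledbsoma(
    "s3://my-bucket/scrna.tiledbsoma",  # hypothetical store location
    key="scrna/scrna.tiledbsoma",
).save()
print(soma.n_observations)  # populated via _soma_n_observations at registration
```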
lamindb/_can_curate.py
CHANGED
@@ -14,7 +14,7 @@ from lamindb.models import CanCurate, Record
 from ._from_values import _format_values, _has_organism_field, get_or_create_records
 from ._record import _queryset, get_name_field
 from ._utils import attach_func_to_class_method
-from .
+from .errors import ValidationError

 if TYPE_CHECKING:
     from django.db.models import QuerySet
@@ -61,6 +61,7 @@ def inspect(
     mute: bool = False,
     organism: str | Record | None = None,
     source: Record | None = None,
+    strict_source: bool = False,
 ) -> InspectResult:
     """{}"""  # noqa: D415
     return _inspect(
@@ -68,6 +69,7 @@ def inspect(
         values=values,
         field=field,
         mute=mute,
+        strict_source=strict_source,
         organism=organism,
         source=source,
     )
@@ -83,10 +85,17 @@ def validate(
     mute: bool = False,
     organism: str | Record | None = None,
     source: Record | None = None,
+    strict_source: bool = False,
 ) -> np.ndarray:
     """{}"""  # noqa: D415
     return _validate(
-        cls=cls,
+        cls=cls,
+        values=values,
+        field=field,
+        mute=mute,
+        strict_source=strict_source,
+        organism=organism,
+        source=source,
     )

@@ -99,7 +108,7 @@ def _check_source_db(source: Record, using_key: str | None):
     )


-def _check_organism_db(organism: Record, using_key: str | None):
+def _check_organism_db(organism: str | Record | None, using_key: str | None):
     """Check if the organism is from the DB."""
     if isinstance(organism, Record):
         if using_key is not None and using_key != "default":
@@ -131,6 +140,7 @@ def _inspect(
     using_key: str | None = None,
     organism: str | Record | None = None,
     source: Record | None = None,
+    strict_source: bool = False,
 ) -> pd.DataFrame | dict[str, list[str]]:
     """{}"""  # noqa: D415
     from lamin_utils._inspect import inspect
@@ -144,7 +154,10 @@ def _inspect(
         using_key = queryset.db
     if isinstance(source, Record):
         _check_source_db(source, using_key)
-        queryset = queryset.filter(source=source)
+        # if strict_source mode, restrict the query to the passed ontology source
+        # otherwise, inspect across records present in the DB from all ontology sources and no-source
+        if strict_source:
+            queryset = queryset.filter(source=source)
     _check_organism_db(organism, using_key)
     registry = queryset.model
     model_name = registry._meta.model.__name__
@@ -200,7 +213,7 @@ def _inspect(
                 f" {colors.italic('.from_values()')}"
             )

-            nonval = [i for i in bionty_result.non_validated if i not in bionty_mapper]
+            nonval = [i for i in bionty_result.non_validated if i not in bionty_mapper]  # type: ignore
         # no bionty source is found
         except ValueError:
             logger.warning("no Bionty source found, skipping Bionty validation")
@@ -227,6 +240,7 @@ def _validate(
     using_key: str | None = None,
     organism: str | Record | None = None,
     source: Record | None = None,
+    strict_source: bool = False,
 ) -> np.ndarray:
     """{}"""  # noqa: D415
     from lamin_utils._inspect import validate
@@ -242,7 +256,8 @@ def _validate(
         using_key = queryset.db
     if isinstance(source, Record):
         _check_source_db(source, using_key)
-        queryset = queryset.filter(source=source)
+        if strict_source:
+            queryset = queryset.filter(source=source)
     _check_organism_db(organism, using_key)
     field_values = pd.Series(
         _filter_query_based_on_organism(
@@ -292,6 +307,7 @@ def standardize(
     synonyms_field: str = "synonyms",
     organism: str | Record | None = None,
     source: Record | None = None,
+    strict_source: bool = False,
 ) -> list[str] | dict[str, str]:
     """{}"""  # noqa: D415
     return _standardize(
@@ -302,6 +318,7 @@ def standardize(
         return_mapper=return_mapper,
         case_sensitive=case_sensitive,
         mute=mute,
+        strict_source=strict_source,
         public_aware=public_aware,
         keep=keep,
         synonyms_field=synonyms_field,
@@ -359,6 +376,7 @@ def _standardize(
     using_key: str | None = None,
     organism: str | Record | None = None,
     source: Record | None = None,
+    strict_source: bool = False,
 ) -> list[str] | dict[str, str]:
     """{}"""  # noqa: D415
     from lamin_utils._standardize import standardize as map_synonyms
@@ -376,7 +394,8 @@ def _standardize(
         using_key = queryset.db
     if isinstance(source, Record):
         _check_source_db(source, using_key)
-        queryset = queryset.filter(source=source)
+        if strict_source:
+            queryset = queryset.filter(source=source)
     _check_organism_db(organism, using_key)
     registry = queryset.model

@@ -476,7 +495,7 @@ def _standardize(
     logger.warning(warn_msg)

     mapper.update(std_names_bt_mapper)
-    if pd.
+    if isinstance(std_names_db, pd.CategoricalDtype):
         result = std_names_db.cat.rename_categories(std_names_bt_mapper).tolist()
     else:
         result = pd.Series(std_names_db).replace(std_names_bt_mapper).tolist()