lamindb 1.0.5__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +14 -5
- lamindb/_artifact.py +150 -53
- lamindb/_can_curate.py +27 -8
- lamindb/_collection.py +85 -51
- lamindb/_feature.py +177 -41
- lamindb/_finish.py +12 -6
- lamindb/_from_values.py +83 -98
- lamindb/_parents.py +4 -4
- lamindb/_query_set.py +59 -17
- lamindb/_record.py +171 -53
- lamindb/_run.py +4 -4
- lamindb/_save.py +33 -10
- lamindb/_schema.py +135 -38
- lamindb/_storage.py +1 -1
- lamindb/_tracked.py +106 -0
- lamindb/_transform.py +21 -8
- lamindb/_ulabel.py +5 -14
- lamindb/base/validation.py +2 -6
- lamindb/core/__init__.py +13 -14
- lamindb/core/_context.py +7 -7
- lamindb/core/_data.py +29 -25
- lamindb/core/_describe.py +1 -1
- lamindb/core/_django.py +1 -1
- lamindb/core/_feature_manager.py +53 -43
- lamindb/core/_label_manager.py +4 -4
- lamindb/core/_mapped_collection.py +20 -7
- lamindb/core/datasets/__init__.py +6 -1
- lamindb/core/datasets/_core.py +12 -11
- lamindb/core/datasets/_small.py +66 -20
- lamindb/core/exceptions.py +1 -90
- lamindb/core/loaders.py +6 -12
- lamindb/core/relations.py +6 -4
- lamindb/core/storage/_anndata_accessor.py +41 -0
- lamindb/core/storage/_backed_access.py +2 -2
- lamindb/core/storage/_pyarrow_dataset.py +25 -15
- lamindb/core/storage/_tiledbsoma.py +56 -12
- lamindb/core/storage/paths.py +27 -21
- lamindb/core/subsettings/_creation_settings.py +4 -16
- lamindb/curators/__init__.py +2168 -833
- lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
- lamindb/errors.py +96 -0
- lamindb/integrations/_vitessce.py +3 -3
- lamindb/migrations/0069_squashed.py +76 -75
- lamindb/migrations/0075_lamindbv1_part5.py +4 -5
- lamindb/migrations/0082_alter_feature_dtype.py +21 -0
- lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
- lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
- lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
- lamindb/migrations/0086_various.py +95 -0
- lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
- lamindb/migrations/0088_schema_components.py +273 -0
- lamindb/migrations/0088_squashed.py +4372 -0
- lamindb/models.py +420 -153
- {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/METADATA +9 -7
- lamindb-1.1.0.dist-info/RECORD +95 -0
- lamindb/curators/_spatial.py +0 -528
- lamindb/migrations/0052_squashed.py +0 -1261
- lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
- lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
- lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
- lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
- lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
- lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
- lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
- lamindb/migrations/0060_alter_artifact__actions.py +0 -22
- lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
- lamindb/migrations/0062_add_is_latest_field.py +0 -32
- lamindb/migrations/0063_populate_latest_field.py +0 -45
- lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
- lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
- lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
- lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
- lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
- lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
- lamindb-1.0.5.dist-info/RECORD +0 -102
- {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
- {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
lamindb/__init__.py
CHANGED

@@ -20,7 +20,7 @@ Registries.
    User
    Storage
    Feature
-   FeatureSet
+   Schema
    Param
    Collection
    Project
@@ -33,7 +33,6 @@ Key functionality.
    :toctree: .

    connect
-   Curator
    view
    save

@@ -44,23 +43,32 @@ Modules and settings.

    integrations
    context
+   curators
    settings
+   errors
    setup
    UPath
    base
    core

+Backward compatibility.
+
+.. autosummary::
+   :toctree: .
+
+   FeatureSet
+
 """

 # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
-__version__ = "1.0.5"
+__version__ = "1.1.0"

 from lamindb_setup._check_setup import InstanceNotSetupError as _InstanceNotSetupError
 from lamindb_setup._check_setup import _check_instance_setup
 from lamindb_setup._connect_instance import connect
 from lamindb_setup.core.upath import UPath

-from . import base, setup
+from . import base, errors, setup


 def __getattr__(name):
@@ -86,10 +94,11 @@ if _check_instance_setup(from_module="lamindb"):
         integrations,
     )
     from ._save import save
+    from ._tracked import tracked
     from ._view import view
     from .core._context import context
     from .core._settings import settings
-    from .curators import Curator
+    from .curators import CatManager as Curator
     from .models import (
         Artifact,
         Collection,
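Taken together, the `__init__.py` changes adjust the public API surface: `errors` becomes a top-level module, a `tracked` decorator is exported, `curators` is documented, and `Curator` survives only as an alias for `CatManager`. A hedged sketch of what this looks like from user code (the decorator's exact signature isn't shown in this diff, so calling it with parentheses is an assumption):

```python
import lamindb as ln
from lamindb.errors import ValidationError  # moved here from lamindb.core.exceptions


@ln.tracked()  # new in 1.1.0; assumed to be used with parentheses
def subset(input_path: str) -> None:
    ...  # function runs tracked via the new lamindb._tracked module


curator_cls = ln.Curator  # still importable, now an alias for lamindb.curators.CatManager
```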
lamindb/_artifact.py
CHANGED

@@ -23,6 +23,8 @@ from lamindb_setup.core.upath import (
     get_stat_file_cloud,
 )

+from lamindb._record import _get_record_kwargs
+from lamindb.errors import FieldValidationError
 from lamindb.models import Artifact, FeatureManager, ParamManager, Run, Storage

 from ._parents import view_lineage
@@ -32,10 +34,9 @@ from .core._data import (
     describe,
     get_run,
     save_schema_links,
-    save_feature_sets,
+    save_staged_feature_sets,
 )
 from .core._settings import settings
-from .core.exceptions import IntegrityError, InvalidArgument
 from .core.loaders import load_to_memory
 from .core.storage import (
     LocalPathClasses,
@@ -44,7 +45,9 @@ from .core.storage import (
     infer_suffix,
     write_to_disk,
 )
+from .core.storage._anndata_accessor import _anndata_n_observations
 from .core.storage._pyarrow_dataset import PYARROW_SUFFIXES
+from .core.storage._tiledbsoma import _soma_n_observations
 from .core.storage.objects import _mudata_is_installed
 from .core.storage.paths import (
     AUTO_KEY_PREFIX,
@@ -58,6 +61,7 @@ from .core.versioning import (
     create_uid,
     message_update_key_in_version_family,
 )
+from .errors import IntegrityError, InvalidArgument

 try:
     from .core.storage._zarr import zarr_is_adata
@@ -73,6 +77,7 @@ if TYPE_CHECKING:
     from pyarrow.dataset import Dataset as PyArrowDataset
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
+    from tiledbsoma import Measurement as SOMAMeasurement

     from lamindb.core.storage._backed_access import AnnDataAccessor, BackedAccessor

@@ -83,6 +88,7 @@ def process_pathlike(
     using_key: str | None,
     skip_existence_check: bool = False,
 ) -> tuple[Storage, bool]:
+    """Determines the appropriate storage for a given path and whether to use an existing storage key."""
     if not skip_existence_check:
         try:  # check if file exists
             if not filepath.exists():
@@ -112,6 +118,10 @@ def process_pathlike(
             hf_path.path_in_repo = ""
             new_root = "hf://" + hf_path.unresolve()
         else:
+            if filepath.protocol == "s3":
+                # check that endpoint_url didn't propagate here
+                # as a part of the path string
+                assert "?" not in filepath.path  # noqa: S101
             new_root = list(filepath.parents)[-1]
             # do not register remote storage locations on hub if the current instance
             # is not managed on the hub
@@ -192,8 +202,7 @@ def process_data(
         use_existing_storage_key = False
     else:
         raise NotImplementedError(
-            f"Do not know how to create a artifact object from {data}, pass a path"
-            " instead!"
+            f"Do not know how to create a artifact object from {data}, pass a path instead!"
         )
     return memory_rep, path, suffix, storage, use_existing_storage_key

@@ -205,6 +214,7 @@ def get_stat_or_artifact(
     is_replace: bool = False,
     instance: str | None = None,
 ) -> tuple[int, str | None, str | None, int | None, Artifact | None] | Artifact:
+    """Retrieves file statistics or an existing artifact based on the path, hash, and key."""
     n_files = None
     if settings.creation.artifact_skip_size_hash:
         return None, None, None, n_files, None
@@ -248,29 +258,14 @@ def get_stat_or_artifact(
         )
         previous_artifact_version = result[0]
     if artifact_with_same_hash_exists:
-        …
-        …
-        …
-        …
-        …
-        )
-        …
-        …
-        logger.warning(
-            "creating new Artifact object despite existing artifact with same hash:"
-            f" {result[0]}"
-        )
-        return size, hash, hash_type, n_files, None
-    else:
-        if result[0]._branch_code == -1:
-            raise FileExistsError(
-                f"You're trying to re-create this artifact in trash: {result[0]}"
-                "Either permanently delete it with `artifact.delete(permanent=True)` or restore it with `artifact.restore()`"
-            )
-        logger.important(
-            f"returning existing artifact with same hash: {result[0]}; if you intended to query to track this artifact as an input, use: ln.Artifact.get()"
-        )
-        return result[0]
+        message = "found artifact with same hash"
+        if result[0]._branch_code == -1:
+            result[0].restore()
+            message = "restored artifact with same hash from trash"
+        logger.important(
+            f"{message}: {result[0]}; to track this artifact as an input, use: ln.Artifact.get()"
+        )
+        return result[0]
     else:
         return size, hash, hash_type, n_files, previous_artifact_version
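The `get_stat_or_artifact` hunk changes user-visible behavior: creating an artifact whose hash matches an existing record no longer warns-and-duplicates or raises `FileExistsError` for trashed records; the existing record is returned, and a trashed one is restored first. A hedged sketch of the new behavior (file name and key are illustrative):

```python
import lamindb as ln

artifact = ln.Artifact("./measurements.parquet", key="measurements.parquet").save()
artifact.delete(permanent=False)  # move to trash; _branch_code becomes -1

# 1.0.5 raised FileExistsError here, asking for delete(permanent=True) or restore();
# 1.1.0 restores the trashed record, logs
# "restored artifact with same hash from trash", and returns it
again = ln.Artifact("./measurements.parquet", key="measurements.parquet")
assert again.uid == artifact.uid  # assumed: the restored record is the same row
```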
@@ -441,7 +436,7 @@ def log_storage_hint(
     root_path = Path(storage.root)  # type: ignore
     if check_path_is_child_of_root(root_path, Path.cwd()):
         # only display the relative path, not the fully resolved path
-        display_root = root_path.relative_to(Path.cwd())
+        display_root = root_path.relative_to(Path.cwd())  # type: ignore
         hint += f"path in storage '{display_root}'"  # type: ignore
     else:
         hint += "path content will be copied to default storage upon `save()`"
@@ -480,7 +475,7 @@ def data_is_mudata(data: MuData | UPathStr) -> bool:
     if isinstance(data, MuData):
         return True
     if isinstance(data, (str, Path)):
-        return UPath(data).suffix
+        return UPath(data).suffix == ".h5mu"
     return False


@@ -506,8 +501,8 @@ def _check_otype_artifact(data: Any, otype: str | None = None):


 def __init__(artifact: Artifact, *args, **kwargs):
-    artifact.features = FeatureManager(artifact)
-    artifact.params = ParamManager(artifact)
+    artifact.features = FeatureManager(artifact)  # type: ignore
+    artifact.params = ParamManager(artifact)  # type: ignore
     # Below checks for the Django-internal call in from_db()
     # it'd be better if we could avoid this, but not being able to create a Artifact
     # from data with the default constructor renders the central class of the API
@@ -559,9 +554,9 @@ def __init__(artifact: Artifact, *args, **kwargs):
         logger.warning("`type` will be removed soon, please use `kind`")
         kind = kwargs.pop("type")
     if not len(kwargs) == 0:
-        …
-        …
-            f" can be passed, you passed: {kwargs}"
+        valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Artifact)])
+        raise FieldValidationError(
+            f"Only {valid_keywords} can be passed, you passed: {kwargs}"
         )
     if revises is not None and key is not None and revises.key != key:
         note = message_update_key_in_version_family(
@@ -676,6 +671,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
 def from_df(
     cls,
     df: pd.DataFrame,
+    *,
     key: str | None = None,
     description: str | None = None,
     run: Run | None = None,
@@ -683,7 +679,7 @@ def from_df(
     **kwargs,
 ) -> Artifact:
     """{}"""  # noqa: D415
-    artifact = Artifact(
+    artifact = Artifact(  # type: ignore
         data=df,
         key=key,
         run=run,
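The lone `*` added to `from_df` — and, in the hunks below, to `from_anndata` and `from_mudata` — makes `key`, `description`, `run`, and `revises` keyword-only. Call sites that passed these positionally now break; a minimal illustration:

```python
import lamindb as ln
import pandas as pd

df = pd.DataFrame({"feature": [1, 2, 3]})

# fine in both 1.0.5 and 1.1.0
artifact = ln.Artifact.from_df(df, key="examples/demo.parquet")

# worked in 1.0.5, raises TypeError in 1.1.0: key is now keyword-only
# artifact = ln.Artifact.from_df(df, "examples/demo.parquet")
```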
@@ -701,6 +697,7 @@ def from_df(
 def from_anndata(
     cls,
     adata: AnnData | UPathStr,
+    *,
     key: str | None = None,
     description: str | None = None,
     run: Run | None = None,
@@ -710,7 +707,8 @@ def from_anndata(
     """{}"""  # noqa: D415
     if not data_is_anndata(adata):
         raise ValueError("data has to be an AnnData object or a path to AnnData-like")
-    artifact = Artifact(
+    _anndata_n_observations(adata)
+    artifact = Artifact(  # type: ignore
         data=adata,
         key=key,
         run=run,
@@ -720,6 +718,17 @@ def from_anndata(
         kind="dataset",
         **kwargs,
     )
+    # this is done instead of _anndata_n_observations(adata)
+    # because we need a proper path through create_path for cloud paths
+    # for additional upath options etc that create_path adds
+    obj_for_obs: AnnData | UPath
+    if hasattr(artifact, "_memory_rep") and artifact._memory_rep is not None:
+        obj_for_obs = artifact._memory_rep
+    else:
+        # returns ._local_filepath for local files
+        # and the proper path through create_path for cloud paths
+        obj_for_obs = artifact.path
+    artifact.n_observations = _anndata_n_observations(obj_for_obs)
     return artifact


@@ -728,6 +737,7 @@ def from_anndata(
 def from_mudata(
     cls,
     mdata: MuData,
+    *,
     key: str | None = None,
     description: str | None = None,
     run: Run | None = None,
@@ -735,7 +745,7 @@ def from_mudata(
     **kwargs,
 ) -> Artifact:
     """{}"""  # noqa: D415
-    artifact = Artifact(
+    artifact = Artifact(  # type: ignore
         data=mdata,
         key=key,
         run=run,
@@ -745,6 +755,38 @@ def from_mudata(
         kind="dataset",
         **kwargs,
     )
+    artifact.n_observations = mdata.n_obs
+    return artifact
+
+
+@classmethod  # type: ignore
+@doc_args(Artifact.from_tiledbsoma.__doc__)
+def from_tiledbsoma(
+    cls,
+    path: UPathStr,
+    *,
+    key: str | None = None,
+    description: str | None = None,
+    run: Run | None = None,
+    revises: Artifact | None = None,
+    **kwargs,
+) -> Artifact:
+    """{}"""  # noqa: D415
+    if UPath(path).suffix != ".tiledbsoma":
+        raise ValueError(
+            "A tiledbsoma store should have .tiledbsoma suffix to be registered."
+        )
+    artifact = Artifact(  # type: ignore
+        data=path,
+        key=key,
+        run=run,
+        description=description,
+        revises=revises,
+        otype="tiledbsoma",
+        kind="dataset",
+        **kwargs,
+    )
+    artifact.n_observations = _soma_n_observations(artifact.path)
     return artifact

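The new `Artifact.from_tiledbsoma` classmethod registers an existing tiledbsoma store as a dataset artifact: it enforces the `.tiledbsoma` suffix, sets `otype="tiledbsoma"`, and counts observations via `_soma_n_observations`. A hedged usage sketch (the bucket path is illustrative):

```python
import lamindb as ln

# the path must end in .tiledbsoma, otherwise a ValueError is raised
artifact = ln.Artifact.from_tiledbsoma(
    "s3://example-bucket/scrna.tiledbsoma",  # illustrative location
    key="scrna.tiledbsoma",
    description="append-only SOMA store",
).save()
print(artifact.n_observations)  # populated from the store's obs count
```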
@@ -753,8 +795,8 @@ def from_mudata(
 def from_dir(
     cls,
     path: UPathStr,
-    key: str | None = None,
     *,
+    key: str | None = None,
     run: Run | None = None,
 ) -> list[Artifact]:
     """{}"""  # noqa: D415
@@ -933,7 +975,12 @@ inconsistent_state_msg = (
 def open(
     self, mode: str = "r", is_run_input: bool | None = None
 ) -> (
-    AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment | PyArrowDataset
+    AnnDataAccessor
+    | BackedAccessor
+    | SOMACollection
+    | SOMAExperiment
+    | SOMAMeasurement
+    | PyArrowDataset
 ):
     if self._overwrite_versions and not self.is_latest:
         raise ValueError(inconsistent_state_msg)
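With `SOMAMeasurement` added to the return union, `Artifact.open()` can now resolve to any of six backed accessor types depending on the artifact's format. A sketch of the dispatch from the caller's side (behavior inferred from the union type and the suffix handling elsewhere in this diff):

```python
import lamindb as ln

artifact = ln.Artifact.get(key="scrna.tiledbsoma")  # illustrative key

# .tiledbsoma stores yield SOMA objects (Collection/Experiment/Measurement),
# AnnData-like .h5ad/.zarr yields an AnnDataAccessor, generic HDF5/zarr a
# BackedAccessor, and pyarrow-compatible suffixes a pyarrow.dataset.Dataset
backed = artifact.open()  # mode="r" by default
print(type(backed).__name__)
```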
@@ -970,8 +1017,28 @@ def open(
         localpath = setup_settings.paths.cloud_to_local_no_update(
             filepath, cache_key=cache_key
         )
-        if …
-            …
+        if is_tiledbsoma_w:
+            open_cache = False
+        else:
+            open_cache = not isinstance(
+                filepath, LocalPathClasses
+            ) and not filepath.synchronize(localpath, just_check=True)
+        if open_cache:
+            try:
+                access = backed_access(localpath, mode, using_key)
+            except Exception as e:
+                if isinstance(filepath, LocalPathClasses):
+                    raise e
+                logger.warning(
+                    f"The cache might be corrupted: {e}. Trying to open directly."
+                )
+                access = backed_access(filepath, mode, using_key)
+                # happens only if backed_access has been successful
+                # delete the corrupted cache
+                if localpath.is_dir():
+                    shutil.rmtree(localpath)
+                else:
+                    localpath.unlink(missing_ok=True)
         else:
             access = backed_access(filepath, mode, using_key)
         if is_tiledbsoma_w:
@@ -1013,10 +1080,10 @@ def _synchronize_cleanup_on_error(
             cache_path = setup_settings.paths.cloud_to_local_no_update(
                 filepath, cache_key=cache_key
             )
-            if cache_path.is_file():
-                cache_path.unlink(missing_ok=True)
-            elif cache_path.is_dir():
+            if cache_path.is_dir():
                 shutil.rmtree(cache_path)
+            else:
+                cache_path.unlink(missing_ok=True)
         raise e
     return cache_path

@@ -1033,8 +1100,24 @@ def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
         self, using_key=settings._using_key
     )
     cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
-    …
-    …
+    try:
+        # cache_path is local so doesn't trigger any sync in load_to_memory
+        access_memory = load_to_memory(cache_path, **kwargs)
+    except Exception as e:
+        # just raise the exception if the original path is local
+        if isinstance(filepath, LocalPathClasses):
+            raise e
+        logger.warning(
+            f"The cache might be corrupted: {e}. Retrying to synchronize."
+        )
+        # delete the existing cache
+        if cache_path.is_dir():
+            shutil.rmtree(cache_path)
+        else:
+            cache_path.unlink(missing_ok=True)
+        # download again and try to load into memory
+        cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
+        access_memory = load_to_memory(cache_path, **kwargs)
     # only call if load is successfull
     _track_run_input(self, is_run_input)
     return access_memory
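`open()` and `load()` now share a recovery idiom: attempt the cached copy first and, if reading it fails for a remote artifact, warn that the cache might be corrupted, delete it, and fall back to the remote path (for `open`) or re-download once and retry (for `load`). Isolated as a hedged, generic sketch — the helper callables are hypothetical, not lamindb API, and the local-path short-circuit is omitted:

```python
import shutil
from pathlib import Path
from typing import Any, Callable


def load_with_cache_recovery(
    cache_path: Path,
    resync: Callable[[], Path],  # hypothetical: re-downloads, returns a fresh cache path
    load: Callable[[Path], Any],  # hypothetical: loads a local path into memory
) -> Any:
    """Retry-once pattern mirroring the diff."""
    try:
        return load(cache_path)
    except Exception:
        # the cache might be corrupted: drop it and re-download a single time
        if cache_path.is_dir():
            shutil.rmtree(cache_path)
        else:
            cache_path.unlink(missing_ok=True)
        return load(resync())
```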
@@ -1174,15 +1257,27 @@ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
     using_key = None
     if "using" in kwargs:
         using_key = kwargs["using"]
-    …
+    exception_upload = check_and_attempt_upload(
         self, using_key, access_token=access_token, print_progress=print_progress
     )
-    if …
+    if exception_upload is not None:
+        # we do not want to raise file not found on cleanup if upload of a file failed
+        # often it is ACID in the filesystem itself
+        # for example, s3 won't have the failed file, so just skip the delete in this case
+        raise_file_not_found_error = False
         self._delete_skip_storage()
-    …
-    …
-    …
-    …
+    else:
+        # this is the case when it is cleaned on .replace
+        raise_file_not_found_error = True
+    # this is triggered by an exception in check_and_attempt_upload or by replace.
+    exception_clear = check_and_attempt_clearing(
+        self, raise_file_not_found_error=raise_file_not_found_error, using_key=using_key
+    )
+    if exception_upload is not None:
+        raise RuntimeError(exception_upload)
+    if exception_clear is not None:
+        raise RuntimeError(exception_clear)
+    # this is only for keep_artifacts_local
     if local_path is not None and not state_was_adding:
         # only move the local artifact to cache if it was not newly created
         local_path_cache = ln_setup.settings.cache_dir / local_path.name
@@ -1197,7 +1292,7 @@ def save(self, upload: bool | None = None, **kwargs) -> Artifact:


 def _save_skip_storage(file, **kwargs) -> None:
-    save_feature_sets(file)
+    save_staged_feature_sets(file)
     super(Artifact, file).save(**kwargs)
     save_schema_links(file)

@@ -1233,6 +1328,7 @@ METHOD_NAMES = [
     "from_anndata",
     "from_df",
     "from_mudata",
+    "from_tiledbsoma",
     "open",
     "cache",
     "load",
@@ -1256,6 +1352,7 @@ for name in METHOD_NAMES:
     attach_func_to_class_method(name, Artifact, globals())

 # privates currently dealt with separately
+# mypy: ignore-errors
 Artifact._delete_skip_storage = _delete_skip_storage
 Artifact._save_skip_storage = _save_skip_storage
 Artifact._cache_path = _cache_path
lamindb/_can_curate.py
CHANGED

@@ -14,7 +14,7 @@ from lamindb.models import CanCurate, Record
 from ._from_values import _format_values, _has_organism_field, get_or_create_records
 from ._record import _queryset, get_name_field
 from ._utils import attach_func_to_class_method
-from .core.exceptions import ValidationError
+from .errors import ValidationError

 if TYPE_CHECKING:
     from django.db.models import QuerySet
@@ -61,6 +61,7 @@ def inspect(
     mute: bool = False,
     organism: str | Record | None = None,
     source: Record | None = None,
+    strict_source: bool = False,
 ) -> InspectResult:
     """{}"""  # noqa: D415
     return _inspect(
@@ -68,6 +69,7 @@ def inspect(
         values=values,
         field=field,
         mute=mute,
+        strict_source=strict_source,
         organism=organism,
         source=source,
     )
@@ -83,10 +85,17 @@ def validate(
     mute: bool = False,
     organism: str | Record | None = None,
     source: Record | None = None,
+    strict_source: bool = False,
 ) -> np.ndarray:
     """{}"""  # noqa: D415
     return _validate(
-        cls=cls, …
+        cls=cls,
+        values=values,
+        field=field,
+        mute=mute,
+        strict_source=strict_source,
+        organism=organism,
+        source=source,
     )


@@ -99,7 +108,7 @@ def _check_source_db(source: Record, using_key: str | None):
     )


-def _check_organism_db(organism: Record, using_key: str | None):
+def _check_organism_db(organism: str | Record | None, using_key: str | None):
     """Check if the organism is from the DB."""
     if isinstance(organism, Record):
         if using_key is not None and using_key != "default":
@@ -131,6 +140,7 @@ def _inspect(
     using_key: str | None = None,
     organism: str | Record | None = None,
     source: Record | None = None,
+    strict_source: bool = False,
 ) -> pd.DataFrame | dict[str, list[str]]:
     """{}"""  # noqa: D415
     from lamin_utils._inspect import inspect
@@ -144,7 +154,10 @@ def _inspect(
         using_key = queryset.db
     if isinstance(source, Record):
         _check_source_db(source, using_key)
-        queryset = queryset.filter(source=source)
+        # if strict_source mode, restrict the query to the passed ontology source
+        # otherwise, inspect across records present in the DB from all ontology sources and no-source
+        if strict_source:
+            queryset = queryset.filter(source=source)
     _check_organism_db(organism, using_key)
     registry = queryset.model
     model_name = registry._meta.model.__name__
@@ -200,7 +213,7 @@ def _inspect(
             f" {colors.italic('.from_values()')}"
         )

-        nonval = [i for i in bionty_result.non_validated if i not in bionty_mapper]
+        nonval = [i for i in bionty_result.non_validated if i not in bionty_mapper]  # type: ignore
     # no bionty source is found
     except ValueError:
         logger.warning("no Bionty source found, skipping Bionty validation")
@@ -227,6 +240,7 @@ def _validate(
     using_key: str | None = None,
     organism: str | Record | None = None,
     source: Record | None = None,
+    strict_source: bool = False,
 ) -> np.ndarray:
     """{}"""  # noqa: D415
     from lamin_utils._inspect import validate
@@ -242,7 +256,8 @@ def _validate(
     using_key = queryset.db
     if isinstance(source, Record):
         _check_source_db(source, using_key)
-        queryset = queryset.filter(source=source)
+        if strict_source:
+            queryset = queryset.filter(source=source)
     _check_organism_db(organism, using_key)
     field_values = pd.Series(
         _filter_query_based_on_organism(
@@ -292,6 +307,7 @@ def standardize(
     synonyms_field: str = "synonyms",
     organism: str | Record | None = None,
     source: Record | None = None,
+    strict_source: bool = False,
 ) -> list[str] | dict[str, str]:
     """{}"""  # noqa: D415
     return _standardize(
@@ -302,6 +318,7 @@ def standardize(
         return_mapper=return_mapper,
         case_sensitive=case_sensitive,
         mute=mute,
+        strict_source=strict_source,
         public_aware=public_aware,
         keep=keep,
         synonyms_field=synonyms_field,
@@ -359,6 +376,7 @@ def _standardize(
     using_key: str | None = None,
     organism: str | Record | None = None,
     source: Record | None = None,
+    strict_source: bool = False,
 ) -> list[str] | dict[str, str]:
     """{}"""  # noqa: D415
     from lamin_utils._standardize import standardize as map_synonyms
@@ -376,7 +394,8 @@ def _standardize(
     using_key = queryset.db
     if isinstance(source, Record):
         _check_source_db(source, using_key)
-        queryset = queryset.filter(source=source)
+        if strict_source:
+            queryset = queryset.filter(source=source)
     _check_organism_db(organism, using_key)
     registry = queryset.model

@@ -476,7 +495,7 @@ def _standardize(
     logger.warning(warn_msg)

     mapper.update(std_names_bt_mapper)
-    if pd.…
+    if isinstance(std_names_db, pd.CategoricalDtype):
         result = std_names_db.cat.rename_categories(std_names_bt_mapper).tolist()
     else:
         result = pd.Series(std_names_db).replace(std_names_bt_mapper).tolist()
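Every public curation entry point in `_can_curate.py` (`inspect`, `validate`, `standardize`) gains a `strict_source` flag, threaded through to the private implementations. The diff suggests a passed `source` previously always restricted the queryset; now it does so only when `strict_source=True`, otherwise records from all ontology sources (and records without a source) count. A hedged example against a bionty registry (registry and source names are illustrative):

```python
import bionty as bt

cl_source = bt.Source.filter(entity="bionty.CellType", name="cl").first()

# default (strict_source=False): values validate against all CellType records
# in the instance, whatever ontology source they came from
bt.CellType.validate(["T cell", "B cell"], source=cl_source)

# strict: only records created from the given ontology source count as validated
bt.CellType.validate(["T cell", "B cell"], source=cl_source, strict_source=True)
```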