lamindb 0.74.3__py3-none-any.whl → 0.75.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- lamindb/__init__.py +1 -1
- lamindb/_artifact.py +85 -43
- lamindb/_can_validate.py +55 -20
- lamindb/_collection.py +36 -28
- lamindb/_curate.py +55 -44
- lamindb/_feature_set.py +5 -5
- lamindb/_filter.py +3 -3
- lamindb/_finish.py +29 -23
- lamindb/_from_values.py +41 -60
- lamindb/_is_versioned.py +1 -1
- lamindb/_parents.py +38 -13
- lamindb/_record.py +19 -20
- lamindb/_save.py +2 -2
- lamindb/_transform.py +27 -16
- lamindb/core/_data.py +14 -16
- lamindb/core/_feature_manager.py +34 -44
- lamindb/core/_label_manager.py +17 -19
- lamindb/core/_mapped_collection.py +1 -1
- lamindb/core/_run_context.py +6 -8
- lamindb/core/datasets/_core.py +7 -7
- lamindb/core/exceptions.py +11 -0
- lamindb/core/storage/__init__.py +1 -0
- lamindb/core/storage/_anndata_accessor.py +735 -0
- lamindb/core/storage/_backed_access.py +77 -747
- lamindb/core/storage/paths.py +9 -14
- lamindb/core/types.py +3 -0
- lamindb/core/versioning.py +1 -1
- lamindb/integrations/__init__.py +1 -0
- {lamindb-0.74.3.dist-info → lamindb-0.75.0.dist-info}/METADATA +5 -5
- lamindb-0.75.0.dist-info/RECORD +58 -0
- lamindb-0.74.3.dist-info/RECORD +0 -57
- {lamindb-0.74.3.dist-info → lamindb-0.75.0.dist-info}/LICENSE +0 -0
- {lamindb-0.74.3.dist-info → lamindb-0.75.0.dist-info}/WHEEL +0 -0
lamindb/__init__.py
CHANGED
lamindb/_artifact.py
CHANGED
```diff
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 import shutil
 from pathlib import Path, PurePath, PurePosixPath
 from typing import TYPE_CHECKING, Any, Mapping
@@ -28,6 +29,7 @@ from lnschema_core.types import (
 from lamindb._utils import attach_func_to_class_method
 from lamindb.core._data import HasFeatures, _track_run_input
 from lamindb.core._settings import settings
+from lamindb.core.exceptions import IntegrityError
 from lamindb.core.storage import (
     LocalPathClasses,
     UPath,
@@ -39,6 +41,7 @@ from lamindb.core.storage import (
 from lamindb.core.storage.paths import (
     auto_storage_key_from_artifact,
     auto_storage_key_from_artifact_uid,
+    check_path_is_child_of_root,
     filepath_from_artifact,
 )
 from lamindb.core.versioning import get_uid_from_old_version, init_uid
@@ -102,7 +105,11 @@ def process_pathlike(
     if not isinstance(filepath, LocalPathClasses):
         # for a cloud path, new_root is always the bucket name
         new_root = list(filepath.parents)[-1]
-        storage_settings = init_storage(new_root)
+        # do not register remote storage locations on hub if the current instance
+        # is not managed on the hub
+        storage_settings = init_storage(
+            new_root, prevent_register_hub=not setup_settings.instance.is_on_hub
+        )
         storage_record = register_storage_in_instance(storage_settings)
         use_existing_storage_key = True
         return storage_record, use_existing_storage_key
@@ -257,14 +264,6 @@ def check_path_in_existing_storage(
     return False
 
 
-def check_path_is_child_of_root(path: Path | UPath, root: Path | UPath | None) -> bool:
-    # str is needed to eliminate UPath storage_options
-    # from the equality checks below
-    path = UPath(str(path))
-    root = UPath(str(root))
-    return root.resolve() in path.resolve().parents
-
-
 def get_relative_path_to_directory(
     path: PurePath | Path | UPath, directory: PurePath | Path | UPath
 ) -> PurePath | Path:
@@ -343,8 +342,10 @@ def get_artifact_kwargs_from_data(
     else:
         storage = default_storage
 
-    if key is not None and key.startswith(AUTO_KEY_PREFIX):
-        raise ValueError(f"Key cannot start with {AUTO_KEY_PREFIX}")
+    # for now comment out this error to allow creating new versions of stores
+    # in the default folder (.lamindb)
+    # if key is not None and key.startswith(AUTO_KEY_PREFIX):
+    #     raise ValueError(f"Key cannot start with {AUTO_KEY_PREFIX}")
 
     log_storage_hint(
         check_path_in_storage=check_path_in_storage,
@@ -366,7 +367,7 @@ def get_artifact_kwargs_from_data(
     kwargs = {
         "suffix": suffix,
         "hash": hash,
-        "hash_type": hash_type,
+        "_hash_type": hash_type,
         "key": key,
         "size": size,
         "storage_id": storage.id,
@@ -377,7 +378,7 @@ def get_artifact_kwargs_from_data(
         "n_observations": None,  # to implement
         "run_id": run.id if run is not None else None,
         "run": run,
-        "key_is_virtual": key_is_virtual,
+        "_key_is_virtual": key_is_virtual,
     }
     if not isinstance(path, LocalPathClasses):
         local_filepath = None
@@ -502,7 +503,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
         raise ValueError("Only one non-keyword arg allowed: data")
 
     data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
-    type: str = kwargs.pop("type") if "type" in kwargs else
+    type: str = kwargs.pop("type") if "type" in kwargs else None
     key: str | None = kwargs.pop("key") if "key" in kwargs else None
     run: Run | None = kwargs.pop("run") if "run" in kwargs else None
     description: str | None = (
@@ -531,7 +532,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
     using_key = (
         kwargs.pop("using_key") if "using_key" in kwargs else settings._using_key
     )
-    accessor = kwargs.pop("accessor") if "accessor" in kwargs else None
+    accessor = kwargs.pop("_accessor") if "_accessor" in kwargs else None
    accessor = _check_accessor_artifact(data=data, accessor=accessor)
     if not len(kwargs) == 0:
         raise ValueError(
@@ -592,7 +593,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
     kwargs["version"] = version
     kwargs["description"] = description
     kwargs["visibility"] = visibility
-    kwargs["accessor"] = accessor
+    kwargs["_accessor"] = accessor
     # this check needs to come down here because key might be populated from an
     # existing file path during get_artifact_kwargs_from_data()
     if (
@@ -633,7 +634,7 @@ def from_df(
         description=description,
         version=version,
         is_new_version_of=is_new_version_of,
-        accessor="DataFrame",
+        _accessor="DataFrame",
         type="dataset",
         **kwargs,
     )
@@ -662,7 +663,7 @@ def from_anndata(
         description=description,
         version=version,
         is_new_version_of=is_new_version_of,
-        accessor="AnnData",
+        _accessor="AnnData",
         type="dataset",
         **kwargs,
     )
@@ -689,7 +690,7 @@ def from_mudata(
         description=description,
         version=version,
         is_new_version_of=is_new_version_of,
-        accessor="MuData",
+        _accessor="MuData",
         type="dataset",
         **kwargs,
     )
@@ -707,8 +708,8 @@ def from_dir(
 ) -> list[Artifact]:
     """{}"""  # noqa: D415
     logger.warning(
-        "this creates one artifact per file in the directory -
-        " ln.Artifact(
+        "this creates one artifact per file in the directory - consider"
+        " ln.Artifact(dir_path) to get one artifact for the entire directory"
     )
     folderpath: UPath = create_path(path)  # returns Path for local
     default_storage = settings._storage_settings.record
```
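For context, the reworded warning contrasts two ways of ingesting a directory. A minimal sketch of both call styles (the directory path and description are made up):

```python
import lamindb as ln

# one artifact per file in the directory (what from_dir does)
artifacts = ln.Artifact.from_dir("./instrument_data/")
ln.save(artifacts)

# what the warning suggests instead: a single artifact
# spanning the entire directory
artifact = ln.Artifact("./instrument_data/", description="one run of instrument data")
artifact.save()
```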
```diff
@@ -823,7 +824,7 @@ def replace(
     if check_path_in_storage:
         raise ValueError("Can only replace with a local file not in any Storage.")
 
-    if self.key is not None and not self.key_is_virtual:
+    if self.key is not None and not self._key_is_virtual:
         key_path = PurePosixPath(self.key)
         new_filename = f"{key_path.stem}{kwargs['suffix']}"
         # the following will only be true if the suffix changes!
@@ -849,7 +850,7 @@ def replace(
     self.suffix = kwargs["suffix"]
     self.size = kwargs["size"]
     self.hash = kwargs["hash"]
-    self.hash_type = kwargs["hash_type"]
+    self._hash_type = kwargs["_hash_type"]
     self.run_id = kwargs["run_id"]
     self.run = kwargs["run"]
 
@@ -862,15 +863,15 @@ def replace(
 
 # deprecated
 def backed(
-    self, is_run_input: bool | None = None
+    self, mode: str = "r", is_run_input: bool | None = None
 ) -> AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment:
     logger.warning("`.backed()` is deprecated, use `.open()`!'")
-    return self.open(is_run_input)
+    return self.open(mode, is_run_input)
 
 
 # docstring handled through attach_func_to_class_method
 def open(
-    self, is_run_input: bool | None = None
+    self, mode: str = "r", is_run_input: bool | None = None
 ) -> AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment:
     # ignore empty suffix for now
     suffixes = (".h5", ".hdf5", ".h5ad", ".zarr", ".tiledbsoma", "")
@@ -880,29 +881,61 @@ def open(
             " use one of the following suffixes for the object name:"
             f" {', '.join(suffixes[:-1])}."
         )
+    if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
+        raise ValueError("Only a tiledbsoma store can be openened with `mode!='r'`.")
 
-    from lamindb.core.storage._backed_access import backed_access
+    from lamindb.core.storage._backed_access import _track_writes_factory, backed_access
 
-    _track_run_input(self, is_run_input)
     using_key = settings._using_key
     filepath = filepath_from_artifact(self, using_key=using_key)
+    is_tiledbsoma_w = (
+        filepath.name == "soma" or filepath.suffix == ".tiledbsoma"
+    ) and mode == "w"
     # consider the case where an object is already locally cached
     localpath = setup_settings.instance.storage.cloud_to_local_no_update(filepath)
-    if localpath.exists():
-        return backed_access(localpath, using_key)
+    if not is_tiledbsoma_w and localpath.exists():
+        access = backed_access(localpath, mode, using_key)
     else:
-        return backed_access(filepath, using_key)
+        access = backed_access(filepath, mode, using_key)
+    if is_tiledbsoma_w:
+
+        def finalize():
+            nonlocal self, filepath, localpath
+            if not isinstance(filepath, LocalPathClasses):
+                _, hash, _, _ = get_stat_dir_cloud(filepath)
+            else:
+                # this can be very slow
+                _, hash, _, _ = hash_dir(filepath)
+            if self.hash != hash:
+                from ._record import init_self_from_db
+
+                logger.warning(
+                    "The hash of the tiledbsoma store has changed, creating a new version of the artifact."
+                )
+                new_version = Artifact(filepath, is_new_version_of=self).save()
+                init_self_from_db(self, new_version)
+
+                if localpath != filepath and localpath.exists():
+                    shutil.rmtree(localpath)
+
+        access = _track_writes_factory(access, finalize)
+    # only call if open is successfull
+    _track_run_input(self, is_run_input)
+    return access
 
 
 # docstring handled through attach_func_to_class_method
 def load(self, is_run_input: bool | None = None, stream: bool = False, **kwargs) -> Any:
-    _track_run_input(self, is_run_input)
     if hasattr(self, "_memory_rep") and self._memory_rep is not None:
-        return self._memory_rep
-    using_key = settings._using_key
-    return load_to_memory(
-        filepath_from_artifact(self, using_key=using_key), stream=stream, **kwargs
-    )
+        access_memory = self._memory_rep
+    else:
+        using_key = settings._using_key
+        access_memory = load_to_memory(
+            filepath_from_artifact(self, using_key=using_key), stream=stream, **kwargs
+        )
+    # only call if load is successfull
+    _track_run_input(self, is_run_input)
+    return access_memory
 
 
 # docstring handled through attach_func_to_class_method
```
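Taken together, `open()` now accepts a `mode` argument: `mode="r"` keeps the previous backed-read behavior for `.h5`, `.h5ad`, `.zarr`, and similar suffixes, while `mode="w"` is permitted only for tiledbsoma stores; on close, the write-tracking wrapper re-hashes the store and creates a new artifact version if the content changed. A rough usage sketch (the registry lookup and the write are made up, and it assumes the returned SOMA object is usable as a context manager):

```python
import lamindb as ln

artifact = ln.Artifact.filter(suffix=".tiledbsoma").first()  # hypothetical lookup

# read-only access (the default), as before
store = artifact.open()  # mode="r"

# write access is tiledbsoma-only; other suffixes raise ValueError
with artifact.open(mode="w") as store:
    ...  # append new measurements to the SOMA store
# on close, the store is re-hashed; if the hash changed, a new
# artifact version is created and `artifact` is updated in place
```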
```diff
@@ -935,6 +968,17 @@ def delete(
     storage: bool | None = None,
     using_key: str | None = None,
 ) -> None:
+    # this first check means an invalid delete fails fast rather than cascading through
+    # database and storage permission errors
+    if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
+        isettings = setup_settings.instance
+        if self.storage.instance_uid != isettings.uid and (storage or storage is None):
+            raise IntegrityError(
+                "Cannot simply delete artifacts outside of this instance's managed storage locations."
+                "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
+                f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
+                f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
+            )
     # by default, we only move artifacts into the trash (visibility = -1)
     trash_visibility = VisibilityChoice.trash.value
     if self.visibility > trash_visibility and not permanent:
@@ -943,7 +987,7 @@ def delete(
         # move to trash
         self.visibility = trash_visibility
         self.save()
-        logger.
+        logger.important(f"moved artifact to trash (visibility = {trash_visibility})")
         return
 
     # if the artifact is already in the trash
@@ -970,7 +1014,7 @@ def delete(
     # only delete in storage if DB delete is successful
     # DB delete might error because of a foreign key constraint violated etc.
     self._delete_skip_storage()
-    if self.key is None or self.key_is_virtual:
+    if self.key is None or self._key_is_virtual:
         # do not ask for confirmation also if storage is None
         delete_in_storage = storage is None or storage
     else:
@@ -985,9 +1029,7 @@ def delete(
     else:
         delete_in_storage = storage
     if not delete_in_storage:
-        logger.
-            f"you will retain a dangling store here: {path}, not referenced via an artifact"
-        )
+        logger.important(f"a file/folder remains here: {path}")
     # we don't yet have logic to bring back the deleted metadata record
     # in case storage deletion fails - this is important for ACID down the road
     if delete_in_storage:
```
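The new fail-fast check changes what a bare `delete()` does when an artifact's storage location is managed by another instance (the check is skipped when the `LAMINDB_MULTI_INSTANCE` environment variable is set). A sketch of the two paths (the query is made up):

```python
import lamindb as ln
from lamindb.core.exceptions import IntegrityError

artifact = ln.Artifact.filter(description="shared dataset").one()  # hypothetical

try:
    artifact.delete(permanent=True)  # raises if storage belongs to another instance
except IntegrityError:
    # delete only the metadata record in this instance, leaving
    # the file in the foreign storage location untouched
    artifact.delete(permanent=True, storage=False)
```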
```diff
@@ -1013,7 +1055,7 @@ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
     self._local_filepath = local_path
     # switch to virtual storage key upon upload
     # the local filepath is already cached at that point
-    self.key_is_virtual = True
+    self._key_is_virtual = True
     # ensure that the artifact is uploaded
     self._to_store = True
```
lamindb/_can_validate.py
CHANGED
```diff
@@ -13,7 +13,7 @@ from lnschema_core import CanValidate, Record
 from lamindb._utils import attach_func_to_class_method
 
 from ._from_values import _has_organism_field, _print_values
-from ._record import _queryset, get_default_str_field
+from ._record import _queryset, get_name_field
 
 if TYPE_CHECKING:
     from django.db.models import QuerySet
@@ -30,7 +30,7 @@ def inspect(
     *,
     mute: bool = False,
     organism: str | Record | None = None,
-    public_source: Record | None = None,
+    source: Record | None = None,
 ) -> InspectResult:
     """{}"""  # noqa: D415
     return _inspect(
@@ -39,7 +39,7 @@ def inspect(
         field=field,
         mute=mute,
         organism=organism,
-        public_source=public_source,
+        source=source,
     )
 
 
@@ -52,9 +52,12 @@ def validate(
     *,
     mute: bool = False,
     organism: str | Record | None = None,
+    source: Record | None = None,
 ) -> np.ndarray:
     """{}"""  # noqa: D415
-    return _validate(cls=cls, values=values, field=field, mute=mute, organism=organism)
+    return _validate(
+        cls=cls, values=values, field=field, mute=mute, organism=organism, source=source
+    )
 
 
 def _inspect(
@@ -65,7 +68,7 @@ def _inspect(
     mute: bool = False,
     using_key: str | None = None,
     organism: str | Record | None = None,
-    public_source: Record | None = None,
+    source: Record | None = None,
 ) -> pd.DataFrame | dict[str, list[str]]:
     """{}"""  # noqa: D415
     from lamin_utils._inspect import inspect
@@ -73,8 +76,10 @@ def _inspect(
     if isinstance(values, str):
         values = [values]
 
-    field = get_default_str_field(cls, field=field)
+    field = get_name_field(cls, field=field)
     queryset = _queryset(cls, using_key)
+    if isinstance(source, Record) and hasattr(cls, "source_id"):
+        queryset = queryset.filter(source=source).all()
     orm = queryset.model
     model_name = orm._meta.model.__name__
 
@@ -91,9 +96,9 @@ def _inspect(
 
     if len(nonval) > 0 and orm.__get_schema_name__() == "bionty":
         try:
-            bionty_result = orm.public(
-                organism=organism, public_source=public_source
-            ).inspect(values=nonval, field=field, mute=True)
+            bionty_result = orm.public(organism=organism, source=source).inspect(
+                values=nonval, field=field, mute=True
+            )
             bionty_validated = bionty_result.validated
             bionty_mapper = bionty_result.synonyms_mapper
             hint = False
@@ -135,7 +140,7 @@ def _inspect(
         logger.print(f"   couldn't validate {labels}: {colors.red(print_values)}")
         logger.print(
             f"→ if you are sure, create new record{s} via"
-            f" {colors.italic(f'
+            f" {colors.italic(f'{orm.__name__}()')} and save to your registry"
         )
 
     return result_db
@@ -149,6 +154,7 @@ def _validate(
     mute: bool = False,
     using_key: str | None = None,
     organism: str | Record | None = None,
+    source: Record | None = None,
 ) -> np.ndarray:
     """{}"""  # noqa: D415
     from lamin_utils._inspect import validate
@@ -157,9 +163,11 @@ def _validate(
     if isinstance(values, str):
         values = [values]
 
-    field = get_default_str_field(cls, field=field)
+    field = get_name_field(cls, field=field)
 
     queryset = _queryset(cls, using_key)
+    if isinstance(source, Record) and hasattr(cls, "source_id"):
+        queryset = queryset.filter(source=source).all()
     field_values = pd.Series(
         _filter_query_based_on_organism(
             queryset=queryset,
@@ -169,6 +177,15 @@ def _validate(
         ),
         dtype="object",
     )
+    if field_values.empty:
+        if not mute:
+            msg = (
+                f"Your {cls.__name__} registry is empty, consider populating it first!"
+            )
+            if hasattr(cls, "source_id"):
+                msg += "\n   → use `.import_from_source()` to import records from a source, e.g. a public ontology"
+            logger.warning(msg)
+        return np.array([False] * len(values))
 
     result = validate(
         identifiers=values,
```
|
|
198
215
|
keep: Literal["first", "last", False] = "first",
|
199
216
|
synonyms_field: str = "synonyms",
|
200
217
|
organism: str | Record | None = None,
|
218
|
+
source: Record | None = None,
|
201
219
|
) -> list[str] | dict[str, str]:
|
202
220
|
"""{}""" # noqa: D415
|
203
221
|
return _standardize(
|
@@ -212,6 +230,7 @@ def standardize(
|
|
212
230
|
keep=keep,
|
213
231
|
synonyms_field=synonyms_field,
|
214
232
|
organism=organism,
|
233
|
+
source=source,
|
215
234
|
)
|
216
235
|
|
217
236
|
|
@@ -263,6 +282,7 @@ def _standardize(
|
|
263
282
|
synonyms_field: str = "synonyms",
|
264
283
|
using_key: str | None = None,
|
265
284
|
organism: str | Record | None = None,
|
285
|
+
source: Record | None = None,
|
266
286
|
) -> list[str] | dict[str, str]:
|
267
287
|
"""{}""" # noqa: D415
|
268
288
|
from lamin_utils._standardize import standardize as map_synonyms
|
@@ -271,16 +291,18 @@ def _standardize(
|
|
271
291
|
if isinstance(values, str):
|
272
292
|
values = [values]
|
273
293
|
|
274
|
-
field =
|
275
|
-
return_field =
|
294
|
+
field = get_name_field(cls, field=field)
|
295
|
+
return_field = get_name_field(
|
276
296
|
cls, field=field if return_field is None else return_field
|
277
297
|
)
|
278
298
|
queryset = _queryset(cls, using_key)
|
299
|
+
if isinstance(source, Record) and hasattr(cls, "source_id"):
|
300
|
+
queryset = queryset.filter(source=source).all()
|
279
301
|
orm = queryset.model
|
280
302
|
|
281
303
|
if _has_organism_field(orm):
|
282
|
-
# here, we can safely import
|
283
|
-
from
|
304
|
+
# here, we can safely import bionty
|
305
|
+
from bionty._bionty import create_or_get_organism_record
|
284
306
|
|
285
307
|
organism_record = create_or_get_organism_record(organism=organism, orm=orm)
|
286
308
|
organism = (
|
@@ -388,7 +410,10 @@ def _add_or_remove_synonyms(
|
|
388
410
|
" with the following records:\n"
|
389
411
|
)
|
390
412
|
display(records_df)
|
391
|
-
raise
|
413
|
+
raise ValueError(
|
414
|
+
"cannot assigned a synonym that is already associated with a record to a different record.\n"
|
415
|
+
"Consider removing the synonym from existing records or using a different synonym."
|
416
|
+
)
|
392
417
|
|
393
418
|
# passed synonyms
|
394
419
|
# nothing happens when passing an empty string or list
|
@@ -405,7 +430,7 @@ def _add_or_remove_synonyms(
|
|
405
430
|
return
|
406
431
|
# because we use | as the separator
|
407
432
|
if any("|" in i for i in syn_new_set):
|
408
|
-
raise
|
433
|
+
raise ValueError("a synonym can't contain '|'!")
|
409
434
|
|
410
435
|
# existing synonyms
|
411
436
|
syns_exist = record.synonyms
|
@@ -455,9 +480,9 @@ def _filter_query_based_on_organism(
|
|
455
480
|
|
456
481
|
orm = queryset.model
|
457
482
|
|
458
|
-
if _has_organism_field(orm) and not field
|
459
|
-
# here, we can safely import
|
460
|
-
from
|
483
|
+
if _has_organism_field(orm) and not _field_is_id(field, orm):
|
484
|
+
# here, we can safely import bionty
|
485
|
+
from bionty._bionty import create_or_get_organism_record
|
461
486
|
|
462
487
|
organism_record = create_or_get_organism_record(organism=organism, orm=orm)
|
463
488
|
if organism_record is not None:
|
@@ -469,6 +494,16 @@ def _filter_query_based_on_organism(
|
|
469
494
|
return queryset.values_list(values_list_field, flat=True)
|
470
495
|
|
471
496
|
|
497
|
+
def _field_is_id(field: str, orm: type[Record]) -> bool:
|
498
|
+
"""Check if the field is an ontology ID."""
|
499
|
+
if hasattr(orm, "_ontology_id_field"):
|
500
|
+
if field == orm._ontology_id_field:
|
501
|
+
return True
|
502
|
+
if field.endswith("id"):
|
503
|
+
return True
|
504
|
+
return False
|
505
|
+
|
506
|
+
|
472
507
|
METHOD_NAMES = [
|
473
508
|
"validate",
|
474
509
|
"inspect",
|
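The same `source` restriction applies to `standardize`, and the new `_field_is_id` helper means the organism filter is skipped when the queried field is an ontology ID. A sketch, reusing `source` from the previous snippet (the gene symbols are examples, and the registries are assumed to come from bionty):

```python
import bionty as bt

# map synonyms against records from one source only
mapped = bt.Gene.standardize(["BRCA1", "FANCS"], organism="human", source=source)

# ontology-ID fields skip the organism filter via _field_is_id(),
# so IDs can be validated without passing organism
bt.CellType.validate(["CL:0000128"], field=bt.CellType.ontology_id)
```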
lamindb/_collection.py
CHANGED
```diff
@@ -60,7 +60,9 @@ def __init__(
     artifacts: Artifact | Iterable[Artifact] = (
         kwargs.pop("artifacts") if len(args) == 0 else args[0]
     )
-    artifact: Artifact | None = kwargs.pop("artifact") if "artifact" in kwargs else None
+    meta_artifact: Artifact | None = (
+        kwargs.pop("meta_artifact") if "meta_artifact" in kwargs else None
+    )
     name: str | None = kwargs.pop("name") if "name" in kwargs else None
     description: str | None = (
         kwargs.pop("description") if "description" in kwargs else None
@@ -102,16 +104,18 @@ def __init__(
         raise ValueError("Artifact or List[Artifact] is allowed.")
     assert isinstance(artifacts[0], Artifact)  # type: ignore  # noqa: S101
     hash, feature_sets = from_artifacts(artifacts)  # type: ignore
-    if artifact is not None:
-        if not isinstance(artifact, Artifact):
-            raise ValueError("artifact has to be an Artifact")
-        if isinstance(artifact, Artifact):
-            if artifact._state.adding:
-                raise ValueError("Save artifact before creating collection!")
+    if meta_artifact is not None:
+        if not isinstance(meta_artifact, Artifact):
+            raise ValueError("meta_artifact has to be an Artifact")
+        if isinstance(meta_artifact, Artifact):
+            if meta_artifact._state.adding:
+                raise ValueError(
+                    "Save meta_artifact artifact before creating collection!"
+                )
     if not feature_sets:
-        feature_sets = artifact.features._feature_set_by_slot
+        feature_sets = meta_artifact.features._feature_set_by_slot
     else:
-        if len(artifact.features._feature_set_by_slot) > 0:
+        if len(meta_artifact.features._feature_set_by_slot) > 0:
             logger.info("overwriting feature sets linked to artifact")
     # we ignore collections in trash containing the same hash
     if hash is not None:
@@ -149,7 +153,7 @@ def __init__(
         description=description,
         reference=reference,
         reference_type=reference_type,
-        artifact=artifact,
+        meta_artifact=meta_artifact,
         hash=hash,
         run=run,
         version=version,
@@ -176,13 +180,13 @@ def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]:
     artifact_ids = [artifact.id for artifact in artifacts]
     # query all feature sets at the same time rather
     # than making a single query per artifact
-    logger.debug("feature_set_artifact_links")
-    feature_set_artifact_links = Artifact.feature_sets.through.objects.filter(
+    logger.debug("links_feature_set_artifact")
+    links_feature_set_artifact = Artifact.feature_sets.through.objects.filter(
         artifact_id__in=artifact_ids
     )
     feature_sets_by_slots = defaultdict(list)
     logger.debug("slots")
-    for link in feature_set_artifact_links:
+    for link in links_feature_set_artifact:
         feature_sets_by_slots[link.slot].append(link.featureset_id)
     feature_sets_union = {}
     logger.debug("union")
```
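The renamed `meta_artifact` argument holds a previously saved artifact carrying collection-level metadata. A minimal sketch (the dataframe and member artifacts are made up):

```python
import lamindb as ln

# meta_artifact must be saved before the collection is created,
# otherwise __init__ raises ValueError
meta = ln.Artifact.from_df(meta_df, description="collection-level metadata").save()

collection = ln.Collection(
    [artifact1, artifact2],  # previously saved artifacts
    name="my-collection",
    meta_artifact=meta,
)
collection.save()
```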
```diff
@@ -240,7 +244,7 @@ def mapped(
     is_run_input: bool | None = None,
 ) -> MappedCollection:
     path_list = []
-    for artifact in self.
+    for artifact in self.ordered_artifacts.all():
         if artifact.suffix not in {".h5ad", ".zarr"}:
             logger.warning(f"Ignoring artifact with suffix {artifact.suffix}")
             continue
@@ -267,10 +271,10 @@ def mapped(
 
 # docstring handled through attach_func_to_class_method
 def cache(self, is_run_input: bool | None = None) -> list[UPath]:
-    _track_run_input(self, is_run_input)
     path_list = []
-    for artifact in self.
+    for artifact in self.ordered_artifacts.all():
         path_list.append(artifact.cache())
+    _track_run_input(self, is_run_input)
     return path_list
 
 
@@ -282,7 +286,7 @@ def load(
     **kwargs,
 ) -> Any:
     # cannot call _track_run_input here, see comment further down
-    all_artifacts = self.
+    all_artifacts = self.ordered_artifacts.all()
     suffixes = [artifact.suffix for artifact in all_artifacts]
     if len(set(suffixes)) != 1:
         raise RuntimeError(
```
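`mapped()`, `cache()`, and `load()` now iterate `ordered_artifacts`, and `cache()` registers the run input only after every member artifact has been cached. A sketch (assumes `collection` exists and that the mapped dataset supports a `with` block):

```python
# download all member artifacts; the run input is only
# registered once caching succeeded for every artifact
paths = collection.cache()

# virtually concatenate the .h5ad/.zarr members in link order
with collection.mapped(is_run_input=True) as dataset:
    print(len(dataset))
```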
```diff
@@ -329,8 +333,8 @@ def delete(self, permanent: bool | None = None) -> None:
 
 # docstring handled through attach_func_to_class_method
 def save(self, using: str | None = None) -> Collection:
-    if self.artifact is not None:
-        self.artifact.save()
+    if self.meta_artifact is not None:
+        self.meta_artifact.save()
     # we don't need to save feature sets again
     save_feature_sets(self)
     super(Collection, self).save()
@@ -344,7 +348,7 @@ def save(self, using: str | None = None) -> Collection:
     ]
     # the below seems to preserve the order of the list in the
     # auto-incrementing integer primary
-    # merely using .
+    # merely using .artifacts.set(*...) doesn't achieve this
     # we need ignore_conflicts=True so that this won't error if links already exist
     CollectionArtifact.objects.bulk_create(links, ignore_conflicts=True)
     save_feature_set_links(self)
@@ -357,16 +361,20 @@ def save(self, using: str | None = None) -> Collection:
 def restore(self) -> None:
     self.visibility = VisibilityChoice.default.value
     self.save()
-    if self.artifact is not None:
-        self.artifact.visibility = VisibilityChoice.default.value
-        self.artifact.save()
 
 
 @property  # type: ignore
-@doc_args(Collection.
-def
+@doc_args(Collection.ordered_artifacts.__doc__)
+def ordered_artifacts(self) -> QuerySet:
+    """{}"""  # noqa: D415
+    return self.artifacts.order_by("links_collection__id")
+
+
+@property  # type: ignore
+@doc_args(Collection.data_artifact.__doc__)
+def data_artifact(self) -> Artifact | None:
     """{}"""  # noqa: D415
-    return self.
+    return self.artifacts.first()
 
 
 METHOD_NAMES = [
@@ -391,5 +399,5 @@ if ln_setup._TESTING:
 for name in METHOD_NAMES:
     attach_func_to_class_method(name, Collection, globals())
 
-Collection.
-Collection.
+Collection.ordered_artifacts = ordered_artifacts
+Collection.data_artifact = data_artifact
```
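The two new properties replace the old accessors: `ordered_artifacts` preserves the order in which artifacts were linked (via the auto-incrementing link-table id), and `data_artifact` returns the first linked artifact. A sketch (assumes `collection` exists):

```python
# artifacts in the order they were added to the collection
for artifact in collection.ordered_artifacts.all():
    print(artifact.uid, artifact.key)

# convenience accessor, per the property definition above
artifact = collection.data_artifact  # == collection.artifacts.first()
```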