lamindb 0.74.3__py3-none-any.whl → 0.75.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/_artifact.py +85 -43
- lamindb/_can_validate.py +100 -35
- lamindb/_collection.py +36 -28
- lamindb/_curate.py +432 -181
- lamindb/_feature_set.py +5 -5
- lamindb/_filter.py +3 -3
- lamindb/_finish.py +29 -23
- lamindb/_from_values.py +47 -66
- lamindb/_is_versioned.py +1 -1
- lamindb/_parents.py +38 -13
- lamindb/_record.py +41 -42
- lamindb/_save.py +7 -7
- lamindb/_transform.py +27 -16
- lamindb/_view.py +13 -11
- lamindb/core/__init__.py +2 -0
- lamindb/core/_data.py +18 -20
- lamindb/core/_feature_manager.py +50 -50
- lamindb/core/_label_manager.py +17 -19
- lamindb/core/_mapped_collection.py +1 -1
- lamindb/core/_run_context.py +6 -8
- lamindb/core/datasets/_core.py +7 -7
- lamindb/core/exceptions.py +11 -0
- lamindb/core/schema.py +5 -5
- lamindb/core/storage/__init__.py +12 -2
- lamindb/core/storage/_anndata_accessor.py +735 -0
- lamindb/core/storage/_backed_access.py +77 -747
- lamindb/core/storage/_valid_suffixes.py +16 -2
- lamindb/core/storage/paths.py +9 -14
- lamindb/core/types.py +3 -0
- lamindb/core/versioning.py +1 -1
- lamindb/integrations/__init__.py +1 -0
- lamindb/integrations/_vitessce.py +68 -31
- {lamindb-0.74.3.dist-info → lamindb-0.75.1.dist-info}/METADATA +5 -5
- lamindb-0.75.1.dist-info/RECORD +58 -0
- lamindb-0.74.3.dist-info/RECORD +0 -57
- {lamindb-0.74.3.dist-info → lamindb-0.75.1.dist-info}/LICENSE +0 -0
- {lamindb-0.74.3.dist-info → lamindb-0.75.1.dist-info}/WHEEL +0 -0
lamindb/__init__.py
CHANGED
lamindb/_artifact.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 import shutil
 from pathlib import Path, PurePath, PurePosixPath
 from typing import TYPE_CHECKING, Any, Mapping
@@ -28,6 +29,7 @@ from lnschema_core.types import (
 from lamindb._utils import attach_func_to_class_method
 from lamindb.core._data import HasFeatures, _track_run_input
 from lamindb.core._settings import settings
+from lamindb.core.exceptions import IntegrityError
 from lamindb.core.storage import (
     LocalPathClasses,
     UPath,
@@ -39,6 +41,7 @@ from lamindb.core.storage import (
 from lamindb.core.storage.paths import (
     auto_storage_key_from_artifact,
     auto_storage_key_from_artifact_uid,
+    check_path_is_child_of_root,
     filepath_from_artifact,
 )
 from lamindb.core.versioning import get_uid_from_old_version, init_uid
@@ -102,7 +105,11 @@ def process_pathlike(
     if not isinstance(filepath, LocalPathClasses):
         # for a cloud path, new_root is always the bucket name
         new_root = list(filepath.parents)[-1]
-        storage_settings = init_storage(new_root)
+        # do not register remote storage locations on hub if the current instance
+        # is not managed on the hub
+        storage_settings = init_storage(
+            new_root, prevent_register_hub=not setup_settings.instance.is_on_hub
+        )
         storage_record = register_storage_in_instance(storage_settings)
         use_existing_storage_key = True
         return storage_record, use_existing_storage_key
@@ -257,14 +264,6 @@ def check_path_in_existing_storage(
     return False
 
 
-def check_path_is_child_of_root(path: Path | UPath, root: Path | UPath | None) -> bool:
-    # str is needed to eliminate UPath storage_options
-    # from the equality checks below
-    path = UPath(str(path))
-    root = UPath(str(root))
-    return root.resolve() in path.resolve().parents
-
-
 def get_relative_path_to_directory(
     path: PurePath | Path | UPath, directory: PurePath | Path | UPath
 ) -> PurePath | Path:
@@ -343,8 +342,10 @@ def get_artifact_kwargs_from_data(
     else:
         storage = default_storage
 
-    if key is not None and key.startswith(AUTO_KEY_PREFIX):
-        raise ValueError(f"Key cannot start with {AUTO_KEY_PREFIX}")
+    # for now comment out this error to allow creating new versions of stores
+    # in the default folder (.lamindb)
+    # if key is not None and key.startswith(AUTO_KEY_PREFIX):
+    #     raise ValueError(f"Key cannot start with {AUTO_KEY_PREFIX}")
 
     log_storage_hint(
         check_path_in_storage=check_path_in_storage,
@@ -366,7 +367,7 @@ def get_artifact_kwargs_from_data(
     kwargs = {
         "suffix": suffix,
         "hash": hash,
-        "hash_type": hash_type,
+        "_hash_type": hash_type,
         "key": key,
         "size": size,
         "storage_id": storage.id,
@@ -377,7 +378,7 @@ def get_artifact_kwargs_from_data(
         "n_observations": None,  # to implement
         "run_id": run.id if run is not None else None,
         "run": run,
-        "key_is_virtual": key_is_virtual,
+        "_key_is_virtual": key_is_virtual,
     }
     if not isinstance(path, LocalPathClasses):
         local_filepath = None
@@ -502,7 +503,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
         raise ValueError("Only one non-keyword arg allowed: data")
 
     data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
-    type: str = kwargs.pop("type") if "type" in kwargs else …
+    type: str = kwargs.pop("type") if "type" in kwargs else None
     key: str | None = kwargs.pop("key") if "key" in kwargs else None
     run: Run | None = kwargs.pop("run") if "run" in kwargs else None
     description: str | None = (
@@ -531,7 +532,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
     using_key = (
         kwargs.pop("using_key") if "using_key" in kwargs else settings._using_key
     )
-    accessor = kwargs.pop("accessor") if "accessor" in kwargs else None
+    accessor = kwargs.pop("_accessor") if "_accessor" in kwargs else None
     accessor = _check_accessor_artifact(data=data, accessor=accessor)
     if not len(kwargs) == 0:
         raise ValueError(
@@ -592,7 +593,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
     kwargs["version"] = version
     kwargs["description"] = description
     kwargs["visibility"] = visibility
-    kwargs["accessor"] = accessor
+    kwargs["_accessor"] = accessor
     # this check needs to come down here because key might be populated from an
     # existing file path during get_artifact_kwargs_from_data()
     if (
@@ -633,7 +634,7 @@ def from_df(
         description=description,
         version=version,
         is_new_version_of=is_new_version_of,
-        accessor="DataFrame",
+        _accessor="DataFrame",
         type="dataset",
         **kwargs,
     )
@@ -662,7 +663,7 @@ def from_anndata(
         description=description,
         version=version,
         is_new_version_of=is_new_version_of,
-        accessor="AnnData",
+        _accessor="AnnData",
         type="dataset",
         **kwargs,
     )
@@ -689,7 +690,7 @@ def from_mudata(
         description=description,
         version=version,
         is_new_version_of=is_new_version_of,
-        accessor="MuData",
+        _accessor="MuData",
         type="dataset",
         **kwargs,
     )
@@ -707,8 +708,8 @@ def from_dir(
 ) -> list[Artifact]:
     """{}"""  # noqa: D415
     logger.warning(
-        "this creates one artifact per file in the directory -…
-        " ln.Artifact(…
+        "this creates one artifact per file in the directory - consider"
+        " ln.Artifact(dir_path) to get one artifact for the entire directory"
     )
     folderpath: UPath = create_path(path)  # returns Path for local
     default_storage = settings._storage_settings.record
@@ -823,7 +824,7 @@ def replace(
     if check_path_in_storage:
         raise ValueError("Can only replace with a local file not in any Storage.")
 
-    if self.key is not None and not self.key_is_virtual:
+    if self.key is not None and not self._key_is_virtual:
         key_path = PurePosixPath(self.key)
         new_filename = f"{key_path.stem}{kwargs['suffix']}"
         # the following will only be true if the suffix changes!
@@ -849,7 +850,7 @@ def replace(
     self.suffix = kwargs["suffix"]
     self.size = kwargs["size"]
     self.hash = kwargs["hash"]
-    self.hash_type = kwargs["hash_type"]
+    self._hash_type = kwargs["_hash_type"]
     self.run_id = kwargs["run_id"]
     self.run = kwargs["run"]
 
@@ -862,15 +863,15 @@ def replace(
 
 # deprecated
 def backed(
-    self, is_run_input: bool | None = None
+    self, mode: str = "r", is_run_input: bool | None = None
 ) -> AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment:
     logger.warning("`.backed()` is deprecated, use `.open()`!'")
-    return self.open(is_run_input)
+    return self.open(mode, is_run_input)
 
 
 # docstring handled through attach_func_to_class_method
 def open(
-    self, is_run_input: bool | None = None
+    self, mode: str = "r", is_run_input: bool | None = None
 ) -> AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment:
     # ignore empty suffix for now
     suffixes = (".h5", ".hdf5", ".h5ad", ".zarr", ".tiledbsoma", "")
@@ -880,29 +881,61 @@ def open(
         " use one of the following suffixes for the object name:"
         f" {', '.join(suffixes[:-1])}."
     )
+    if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
+        raise ValueError("Only a tiledbsoma store can be openened with `mode!='r'`.")
 
-    from lamindb.core.storage._backed_access import backed_access
+    from lamindb.core.storage._backed_access import _track_writes_factory, backed_access
 
-    _track_run_input(self, is_run_input)
     using_key = settings._using_key
     filepath = filepath_from_artifact(self, using_key=using_key)
+    is_tiledbsoma_w = (
+        filepath.name == "soma" or filepath.suffix == ".tiledbsoma"
+    ) and mode == "w"
     # consider the case where an object is already locally cached
     localpath = setup_settings.instance.storage.cloud_to_local_no_update(filepath)
-    if localpath.exists():
-        return backed_access(localpath, using_key)
+    if not is_tiledbsoma_w and localpath.exists():
+        access = backed_access(localpath, mode, using_key)
     else:
-        return backed_access(filepath, using_key)
+        access = backed_access(filepath, mode, using_key)
+        if is_tiledbsoma_w:
+
+            def finalize():
+                nonlocal self, filepath, localpath
+                if not isinstance(filepath, LocalPathClasses):
+                    _, hash, _, _ = get_stat_dir_cloud(filepath)
+                else:
+                    # this can be very slow
+                    _, hash, _, _ = hash_dir(filepath)
+                if self.hash != hash:
+                    from ._record import init_self_from_db
+
+                    logger.warning(
+                        "The hash of the tiledbsoma store has changed, creating a new version of the artifact."
+                    )
+                    new_version = Artifact(filepath, is_new_version_of=self).save()
+                    init_self_from_db(self, new_version)
+
+                if localpath != filepath and localpath.exists():
+                    shutil.rmtree(localpath)
+
+            access = _track_writes_factory(access, finalize)
+    # only call if open is successfull
+    _track_run_input(self, is_run_input)
+    return access
 
 
 # docstring handled through attach_func_to_class_method
 def load(self, is_run_input: bool | None = None, stream: bool = False, **kwargs) -> Any:
-    _track_run_input(self, is_run_input)
     if hasattr(self, "_memory_rep") and self._memory_rep is not None:
-        return self._memory_rep
-    using_key = settings._using_key
-    return load_to_memory(
-        filepath_from_artifact(self, using_key=using_key), stream=stream, **kwargs
-    )
+        access_memory = self._memory_rep
+    else:
+        using_key = settings._using_key
+        access_memory = load_to_memory(
+            filepath_from_artifact(self, using_key=using_key), stream=stream, **kwargs
+        )
+    # only call if load is successfull
+    _track_run_input(self, is_run_input)
+    return access_memory
 
 
 # docstring handled through attach_func_to_class_method
@@ -935,6 +968,17 @@ def delete(
     storage: bool | None = None,
     using_key: str | None = None,
 ) -> None:
+    # this first check means an invalid delete fails fast rather than cascading through
+    # database and storage permission errors
+    if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
+        isettings = setup_settings.instance
+        if self.storage.instance_uid != isettings.uid and (storage or storage is None):
+            raise IntegrityError(
+                "Cannot simply delete artifacts outside of this instance's managed storage locations."
+                "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
+                f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
+                f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
+            )
     # by default, we only move artifacts into the trash (visibility = -1)
     trash_visibility = VisibilityChoice.trash.value
     if self.visibility > trash_visibility and not permanent:
@@ -943,7 +987,7 @@ def delete(
         # move to trash
         self.visibility = trash_visibility
         self.save()
-        logger.…
+        logger.important(f"moved artifact to trash (visibility = {trash_visibility})")
         return
 
     # if the artifact is already in the trash
@@ -970,7 +1014,7 @@ def delete(
     # only delete in storage if DB delete is successful
     # DB delete might error because of a foreign key constraint violated etc.
     self._delete_skip_storage()
-    if self.key is None or self.key_is_virtual:
+    if self.key is None or self._key_is_virtual:
         # do not ask for confirmation also if storage is None
        delete_in_storage = storage is None or storage
    else:
@@ -985,9 +1029,7 @@ def delete(
        else:
            delete_in_storage = storage
    if not delete_in_storage:
-        logger.warning(
-            f"you will retain a dangling store here: {path}, not referenced via an artifact"
-        )
+        logger.important(f"a file/folder remains here: {path}")
    # we don't yet have logic to bring back the deleted metadata record
    # in case storage deletion fails - this is important for ACID down the road
    if delete_in_storage:
@@ -1013,7 +1055,7 @@ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
    self._local_filepath = local_path
    # switch to virtual storage key upon upload
    # the local filepath is already cached at that point
-    self.key_is_virtual = True
+    self._key_is_virtual = True
    # ensure that the artifact is uploaded
    self._to_store = True
 
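The most consequential change above is that `Artifact.open()` (and the deprecated `.backed()`) now accepts a `mode` argument. `mode="w"` is permitted only for tiledbsoma stores; on close, a finalize hook re-hashes the store and creates a new artifact version if the hash changed. Run-input tracking (`_track_run_input`) now also fires only after a successful `open()` or `load()`. A minimal sketch of the new write path, assuming an existing tiledbsoma artifact and that the returned SOMA handle is used as a context manager (the artifact key and the mutation step are hypothetical):

```python
import lamindb as ln

# hypothetical lookup of an existing tiledbsoma store
artifact = ln.Artifact.filter(key="scrna.tiledbsoma").one()

# mode defaults to "r"; any non-tiledbsoma artifact rejects mode != "r"
with artifact.open(mode="w") as store:
    ...  # mutate the store, e.g. append new measurements

# on close, the store is re-hashed; if the hash changed, a warning is
# logged and `artifact` now points at a freshly created new version
print(artifact.version)
```

Relatedly, `delete()` now fails fast with the new `IntegrityError` when an artifact lives in a storage location managed by another instance, unless `storage=False` is passed or the `LAMINDB_MULTI_INSTANCE` environment variable is set.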
lamindb/_can_validate.py
CHANGED
@@ -13,7 +13,7 @@ from lnschema_core import CanValidate, Record
 from lamindb._utils import attach_func_to_class_method
 
 from ._from_values import _has_organism_field, _print_values
-from ._record import _queryset, get_default_str_field
+from ._record import _queryset, get_name_field
 
 if TYPE_CHECKING:
     from django.db.models import QuerySet
@@ -30,7 +30,7 @@ def inspect(
     *,
     mute: bool = False,
     organism: str | Record | None = None,
-    public_source: Record | None = None,
+    source: Record | None = None,
 ) -> InspectResult:
     """{}"""  # noqa: D415
     return _inspect(
@@ -39,7 +39,7 @@ def inspect(
         field=field,
         mute=mute,
         organism=organism,
-        public_source=public_source,
+        source=source,
     )
 
 
@@ -52,9 +52,31 @@ def validate(
     *,
     mute: bool = False,
     organism: str | Record | None = None,
+    source: Record | None = None,
 ) -> np.ndarray:
     """{}"""  # noqa: D415
-    return _validate(cls=cls, values=values, field=field, mute=mute, organism=organism)
+    return _validate(
+        cls=cls, values=values, field=field, mute=mute, organism=organism, source=source
+    )
+
+
+def _check_source_db(source: Record, using_key: str | None):
+    """Check if the source is from the DB."""
+    if using_key is not None and using_key != "default":
+        if source._state.db != using_key:
+            raise ValueError(
+                f"source must be a bionty.Source record from instance '{using_key}'!"
+            )
+
+
+def _check_organism_db(organism: Record, using_key: str | None):
+    """Check if the organism is from the DB."""
+    if isinstance(organism, Record):
+        if using_key is not None and using_key != "default":
+            if organism._state.db != using_key:
+                raise ValueError(
+                    f"organism must be a bionty.Organism record from instance '{using_key}'!"
+                )
 
 
 def _inspect(
@@ -65,7 +87,7 @@ def _inspect(
     mute: bool = False,
     using_key: str | None = None,
     organism: str | Record | None = None,
-    public_source: Record | None = None,
+    source: Record | None = None,
 ) -> pd.DataFrame | dict[str, list[str]]:
     """{}"""  # noqa: D415
     from lamin_utils._inspect import inspect
@@ -73,10 +95,15 @@ def _inspect(
     if isinstance(values, str):
         values = [values]
 
-    field = get_default_str_field(cls, field=field)
+    field = get_name_field(cls, field=field)
     queryset = _queryset(cls, using_key)
-    orm = queryset.model
-    model_name = orm._meta.model.__name__
+    using_key = queryset.db
+    if isinstance(source, Record):
+        _check_source_db(source, using_key)
+        queryset = queryset.filter(source=source).all()
+    _check_organism_db(organism, using_key)
+    registry = queryset.model
+    model_name = registry._meta.model.__name__
 
     # inspect in the DB
     result_db = inspect(
@@ -89,11 +116,11 @@ def _inspect(
     )
     nonval = set(result_db.non_validated).difference(result_db.synonyms_mapper.keys())
 
-    if len(nonval) > 0 and orm.__get_schema_name__() == "bionty":
+    if len(nonval) > 0 and registry.__get_schema_name__() == "bionty":
         try:
-            bionty_result = orm.public(organism=organism, public_source=public_source).inspect(
-                values=nonval, field=field, mute=True
-            )
+            bionty_result = registry.public(organism=organism, source=source).inspect(
+                values=nonval, field=field, mute=True
+            )
             bionty_validated = bionty_result.validated
             bionty_mapper = bionty_result.synonyms_mapper
             hint = False
@@ -135,7 +162,7 @@ def _inspect(
         logger.print(f"   couldn't validate {labels}: {colors.red(print_values)}")
         logger.print(
             f"→  if you are sure, create new record{s} via"
-            f" {colors.italic(f'…
+            f" {colors.italic(f'{registry.__name__}()')} and save to your registry"
         )
 
     return result_db
@@ -149,6 +176,7 @@ def _validate(
     mute: bool = False,
     using_key: str | None = None,
     organism: str | Record | None = None,
+    source: Record | None = None,
 ) -> np.ndarray:
     """{}"""  # noqa: D415
     from lamin_utils._inspect import validate
@@ -157,9 +185,14 @@ def _validate(
     if isinstance(values, str):
         values = [values]
 
-    field = get_default_str_field(cls, field=field)
+    field = get_name_field(cls, field=field)
 
     queryset = _queryset(cls, using_key)
+    using_key = queryset.db
+    if isinstance(source, Record):
+        _check_source_db(source, using_key)
+        queryset = queryset.filter(source=source).all()
+    _check_organism_db(organism, using_key)
     field_values = pd.Series(
         _filter_query_based_on_organism(
             queryset=queryset,
@@ -169,6 +202,15 @@ def _validate(
         ),
         dtype="object",
     )
+    if field_values.empty:
+        if not mute:
+            msg = (
+                f"Your {cls.__name__} registry is empty, consider populating it first!"
+            )
+            if hasattr(cls, "source_id"):
+                msg += "\n   → use `.import_from_source()` to import records from a source, e.g. a public ontology"
+            logger.warning(msg)
+        return np.array([False] * len(values))
 
     result = validate(
         identifiers=values,
@@ -198,6 +240,7 @@ def standardize(
     keep: Literal["first", "last", False] = "first",
     synonyms_field: str = "synonyms",
     organism: str | Record | None = None,
+    source: Record | None = None,
 ) -> list[str] | dict[str, str]:
     """{}"""  # noqa: D415
     return _standardize(
@@ -212,6 +255,7 @@ def standardize(
         keep=keep,
         synonyms_field=synonyms_field,
         organism=organism,
+        source=source,
     )
 
 
@@ -263,6 +307,7 @@ def _standardize(
     synonyms_field: str = "synonyms",
     using_key: str | None = None,
     organism: str | Record | None = None,
+    source: Record | None = None,
 ) -> list[str] | dict[str, str]:
     """{}"""  # noqa: D415
     from lamin_utils._standardize import standardize as map_synonyms
@@ -271,24 +316,29 @@ def _standardize(
     if isinstance(values, str):
         values = [values]
 
-    field = get_default_str_field(cls, field=field)
-    return_field = get_default_str_field(
+    field = get_name_field(cls, field=field)
+    return_field = get_name_field(
         cls, field=field if return_field is None else return_field
     )
     queryset = _queryset(cls, using_key)
-    orm = queryset.model
-
-    if _has_organism_field(orm):
-        # here, we can safely import bionty
-        from bionty._bionty import create_or_get_organism_record
-
-        organism_record = create_or_get_organism_record(organism=organism, orm=orm)
+    using_key = queryset.db
+    if isinstance(source, Record):
+        _check_source_db(source, using_key)
+        queryset = queryset.filter(source=source).all()
+    _check_organism_db(organism, using_key)
+    registry = queryset.model
+
+    if _has_organism_field(registry):
+        # here, we can safely import bionty
+        from bionty._bionty import create_or_get_organism_record
+
+        organism_record = create_or_get_organism_record(organism=organism, orm=registry)
     organism = (
         organism_record.name if organism_record is not None else organism_record
     )
 
     try:
-        orm._meta.get_field(synonyms_field)
+        registry._meta.get_field(synonyms_field)
         df = _filter_query_based_on_organism(
             queryset=queryset, field=field, organism=organism
         )
@@ -320,7 +370,7 @@ def _standardize(
         return result
 
     # map synonyms in Bionty
-    if orm.__get_schema_name__() == "bionty" and public_aware:
+    if registry.__get_schema_name__() == "bionty" and public_aware:
         mapper = {}
         if return_mapper:
             mapper = std_names_db
@@ -328,12 +378,14 @@ def _standardize(
         df=df, identifiers=values, return_mapper=False, mute=True, **_kwargs
     )
 
-    val_res = orm.validate(std_names_db, field=field, mute=True, organism=organism)
+    val_res = registry.validate(
+        std_names_db, field=field, mute=True, organism=organism
+    )
     if all(val_res):
         return _return(result=std_names_db, mapper=mapper)
 
     nonval = np.array(std_names_db)[~val_res]
-    std_names_bt_mapper = orm.public(organism=organism).standardize(
+    std_names_bt_mapper = registry.public(organism=organism).standardize(
         nonval, return_mapper=True, mute=True, **_kwargs
     )
 
@@ -345,7 +397,7 @@ def _standardize(
         f" {list(std_names_bt_mapper.keys())}"
     )
     warn_msg += (
-        f"\n   please add corresponding {orm._meta.model.__name__} records via"
+        f"\n   please add corresponding {registry._meta.model.__name__} records via"
        f" `.from_values({list(set(std_names_bt_mapper.values()))})`"
    )
    logger.warning(warn_msg)
@@ -388,7 +440,10 @@ def _add_or_remove_synonyms(
            " with the following records:\n"
        )
        display(records_df)
-        raise …
+        raise ValueError(
+            "cannot assigned a synonym that is already associated with a record to a different record.\n"
+            "Consider removing the synonym from existing records or using a different synonym."
+        )
 
     # passed synonyms
     # nothing happens when passing an empty string or list
@@ -405,7 +460,7 @@ def _add_or_remove_synonyms(
         return
     # because we use | as the separator
     if any("|" in i for i in syn_new_set):
-        raise …
+        raise ValueError("a synonym can't contain '|'!")
 
     # existing synonyms
     syns_exist = record.synonyms
@@ -453,13 +508,13 @@ def _filter_query_based_on_organism(
     """Filter a queryset based on organism."""
     import pandas as pd
 
-    orm = queryset.model
+    registry = queryset.model
 
-    if _has_organism_field(orm) and not field.endswith("id"):
-        # here, we can safely import bionty
-        from bionty._bionty import create_or_get_organism_record
+    if _has_organism_field(registry) and not _field_is_id(field, registry):
+        # here, we can safely import bionty
+        from bionty._bionty import create_or_get_organism_record
 
-        organism_record = create_or_get_organism_record(organism=organism, orm=orm)
+        organism_record = create_or_get_organism_record(organism=organism, orm=registry)
         if organism_record is not None:
             queryset = queryset.filter(organism__name=organism_record.name)
 
@@ -469,6 +524,16 @@ def _filter_query_based_on_organism(
     return queryset.values_list(values_list_field, flat=True)
 
 
+def _field_is_id(field: str, registry: type[Record]) -> bool:
+    """Check if the field is an ontology ID."""
+    if hasattr(registry, "_ontology_id_field"):
+        if field == registry._ontology_id_field:
+            return True
+    if field.endswith("id"):
+        return True
+    return False
+
+
 METHOD_NAMES = [
     "validate",
     "inspect",