lamindb 0.49.3__py3-none-any.whl → 0.50.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +55 -15
- lamindb/_context.py +25 -25
- lamindb/_delete.py +8 -8
- lamindb/_feature.py +15 -11
- lamindb/_feature_set.py +70 -39
- lamindb/_file.py +80 -56
- lamindb/_filter.py +5 -5
- lamindb/_from_values.py +55 -92
- lamindb/{_manager.py → _query_manager.py} +8 -5
- lamindb/{_queryset.py → _query_set.py} +31 -28
- lamindb/{_orm.py → _registry.py} +53 -294
- lamindb/_save.py +14 -13
- lamindb/_synonym.py +203 -0
- lamindb/_validate.py +134 -0
- lamindb/_view.py +15 -9
- lamindb/dev/__init__.py +13 -6
- lamindb/dev/_data.py +195 -0
- lamindb/dev/_feature_manager.py +102 -0
- lamindb/dev/_settings.py +10 -9
- lamindb/dev/_view_parents.py +36 -17
- lamindb/dev/datasets/__init__.py +5 -3
- lamindb/dev/datasets/_core.py +35 -17
- lamindb/dev/exc.py +4 -0
- lamindb/dev/storage/_backed_access.py +53 -17
- lamindb/dev/storage/file.py +44 -15
- {lamindb-0.49.3.dist-info → lamindb-0.50.1.dist-info}/METADATA +34 -36
- lamindb-0.50.1.dist-info/RECORD +47 -0
- lamindb/_feature_manager.py +0 -237
- lamindb-0.49.3.dist-info/RECORD +0 -43
- {lamindb-0.49.3.dist-info → lamindb-0.50.1.dist-info}/LICENSE +0 -0
- {lamindb-0.49.3.dist-info → lamindb-0.50.1.dist-info}/WHEEL +0 -0
- {lamindb-0.49.3.dist-info → lamindb-0.50.1.dist-info}/entry_points.txt +0 -0
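The headline changes in 0.50: the ORM base class becomes `Registry` (`_orm.py` → `_registry.py`), the query modules are renamed to match (`_query_manager.py`, `_query_set.py`), feature management moves into `lamindb/dev/`, and synonym and validation logic are split into the new `_synonym.py` and `_validate.py`. As a rough sketch of what the new validation module enables at the user level — assuming `validate()` behaves as in later lamindb releases; the call below is illustrative, not taken from this diff:

    import lamindb as ln

    # assumed API: validate() checks values against a registry field and
    # returns a boolean mask indicating which values already exist
    values = ["T cell", "B cell", "not-a-label"]
    mask = ln.Label.validate(values, field=ln.Label.name)  # e.g. [True, True, False]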
lamindb/_file.py
CHANGED
@@ -1,6 +1,6 @@
 from itertools import islice
 from pathlib import Path, PurePath, PurePosixPath
-from typing import Any, List, Optional, Tuple, Union
+from typing import Any, List, Optional, Set, Tuple, Union

 import anndata as ad
 import lamindb_setup
@@ -17,7 +17,6 @@ from lnschema_core import Feature, FeatureSet, File, Run, Storage, ids
 from lnschema_core.types import AnnDataLike, DataLike, PathLike

 from lamindb._context import context
-from lamindb.dev import FeatureManager
 from lamindb.dev._settings import settings
 from lamindb.dev.hashing import b16_to_b64, hash_file
 from lamindb.dev.storage import (
@@ -33,6 +32,7 @@ from lamindb.dev.storage.file import (
     ProgressCallback,
     _str_to_path,
     auto_storage_key_from_file,
+    extract_suffix_from_path,
     filepath_from_file,
 )
 from lamindb.dev.utils import attach_func_to_class_method
@@ -76,7 +76,7 @@ def process_pathlike(
         new_root = list(filepath.parents)[-1]
         new_root_str = new_root.as_posix()
         logger.warning(
-            f"
+            f"creating new storage location for root: {new_root_str}"
         )
         storage_settings = StorageSettings(new_root_str)
         register_storage(storage_settings)
@@ -110,7 +110,7 @@ def process_data(
         storage, use_existing_storage_key = process_pathlike(
             filepath, skip_existence_check=skip_existence_check
         )
-        suffix =
+        suffix = extract_suffix_from_path(filepath)
         memory_rep = None
     elif isinstance(data, (pd.DataFrame, AnnData)):  # DataLike, spelled out
         storage = lamindb_setup.settings.storage.record
@@ -162,7 +162,7 @@ def get_hash(
             hash = f"{b16_to_b64(stripped_etag)}-{suffix}"
             hash_type = "md5-n"  # this is the S3 chunk-hashing strategy
         else:
-            logger.warning(f"
+            logger.warning(f"did not add hash for {filepath}")
             return None, None
     else:
         hash, hash_type = hash_file(filepath)
@@ -171,20 +171,20 @@ def get_hash(
     result = File.filter(hash=hash).list()
     if len(result) > 0:
         if settings.upon_file_create_if_hash_exists == "error":
-            msg = f"
+            msg = f"file with same hash exists: {result[0]}"
             hint = (
-                "💡
+                "💡 you can make this error a warning:\n"
                 "    ln.settings.upon_file_create_if_hash_exists"
             )
             raise RuntimeError(f"{msg}\n{hint}")
         elif settings.upon_file_create_if_hash_exists == "warn_create_new":
             logger.warning(
-                "
+                "creating new File object despite existing file with same hash:"
                 f" {result[0]}"
             )
             return hash, hash_type
         else:
-            logger.warning(f"
+            logger.warning(f"returning existing file with same hash: {result[0]}")
             return result[0]
     else:
         return hash, hash_type
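The reworded messages in this hunk all hang off the `upon_file_create_if_hash_exists` setting, which controls deduplication on file creation. A sketch of the three behaviors visible above (the name of the default value is assumed, not shown in this diff):

    import lamindb as ln

    # "error" raises the RuntimeError above; "warn_create_new" warns and
    # creates a duplicate record; otherwise the existing File is returned
    ln.settings.upon_file_create_if_hash_exists = "warn_create_new"
    file_a = ln.File("data.csv")
    file_b = ln.File("data.csv")  # warns, but yields a second File object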
@@ -295,7 +295,7 @@ def get_relative_path_to_directory(
     elif isinstance(directory, PurePath):
         relpath = path.relative_to(directory)
     else:
-        raise TypeError("
+        raise TypeError("Directory not of type Path or UPath")
     return relpath


@@ -377,13 +377,13 @@ def log_storage_hint(
 ) -> None:
     hint = ""
     if check_path_in_storage:
-        hint += f"file in storage {storage.root}"  # type: ignore
+        hint += f"file in storage '{storage.root}'"  # type: ignore
     else:
         hint += "file will be copied to default storage upon `save()`"
     if key is None:
-        hint += f" with key
+        hint += f" with key '{id}{suffix}'"
     else:
-        hint += f" with key
+        hint += f" with key '{key}'"
     logger.hint(hint)
@@ -441,7 +441,7 @@ def __init__(file: File, *args, **kwargs):
     if name is not None and description is not None:
         raise ValueError("Only pass description, do not pass a name")
     if name is not None:
-        logger.warning("
+        logger.warning("argument `name` is deprecated, please use `description`")
         description = name

     provisional_id = ids.base62_20()
@@ -473,15 +473,15 @@ def __init__(file: File, *args, **kwargs):
     if isinstance(data, pd.DataFrame):
         if log_hint:
             logger.hint(
-                "
-                " names as features
+                "file is a dataframe, consider using File.from_df() to link column"
+                " names as features"
             )
         kwargs["accessor"] = "DataFrame"
     elif data_is_anndata(data):
         if log_hint:
             logger.hint(
-                "
-                " var_names and obs.columns as features
+                "file is AnnDataLike, consider using File.from_anndata() to link"
+                " var_names and obs.columns as features"
             )
         kwargs["accessor"] = "AnnData"
     elif data_is_mudata(data):
@@ -524,7 +524,10 @@ def from_df(
     """{}"""
     file = File(data=df, key=key, run=run, description=description, log_hint=False)
     feature_set = FeatureSet.from_df(df)
-
+    if feature_set is not None:
+        file._feature_sets = {"columns": feature_set}
+    else:
+        file._feature_sets = {}
     return file

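`from_df()` now guards against `FeatureSet.from_df()` returning `None` and always initializes `_feature_sets`. Calling pattern, sketched (instance setup assumed):

    import pandas as pd
    import lamindb as ln

    df = pd.DataFrame({"cell_type": ["T cell", "B cell"], "count": [1, 2]})
    file = ln.File.from_df(df, description="toy counts")
    file.save()  # saves the "columns" feature set (if any) before the file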
@@ -554,20 +557,25 @@ def from_anndata(
     else:
         type = convert_numpy_dtype_to_lamin_feature_type(adata.X.dtype)
     feature_sets = {}
-    logger.info("
+    logger.info("parsing feature names of X stored in slot 'var'")
     logger.indent = " "
-
+    feature_set_var = FeatureSet.from_values(
         data_parse.var.index,
         var_ref,
         type=type,
     )
-
+
+    if feature_set_var is not None:
+        feature_sets["var"] = feature_set_var
+        logger.save(f"linked: {feature_set_var}")
     logger.indent = ""
     if len(data_parse.obs.columns) > 0:
-        logger.info("
+        logger.info("parsing feature names of slot 'obs'")
         logger.indent = " "
         feature_set_obs = FeatureSet.from_df(data_parse.obs)
-
+        if feature_set_obs is not None:
+            feature_sets["obs"] = feature_set_obs
+            logger.save(f"linked: {feature_set_obs}")
         logger.indent = ""
     file._feature_sets = feature_sets
     return file
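`from_anndata()` likewise only links the `var` and `obs` feature sets when parsing actually produced one, logging each link. A sketch, assuming `lnschema_bionty` supplies the reference field for `var_names`:

    import anndata as ad
    import numpy as np
    import pandas as pd
    import lamindb as ln
    import lnschema_bionty as lb

    adata = ad.AnnData(
        X=np.ones((2, 2), dtype=np.float32),
        obs=pd.DataFrame({"cell_type": ["T", "B"]}, index=["c1", "c2"]),
        var=pd.DataFrame(index=["CD4", "CD8A"]),
    )
    # var_names fill the "var" slot, obs columns the "obs" slot
    file = ln.File.from_anndata(adata, var_ref=lb.Gene.symbol, description="toy")
    file.save()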
@@ -598,7 +606,7 @@ def from_dir(
     if key is None:
         if not use_existing_storage:
             logger.warning(
-                "
+                "folder is outside existing storage location, will copy files from"
                 f" {path} to {storage}/{folderpath.name}"
             )
         folder_key_path = Path(folderpath.name)
@@ -612,7 +620,6 @@ def from_dir(

     # always sanitize by stripping a trailing slash
     folder_key = folder_key_path.as_posix().rstrip("/")
-    logger.hint(f"using storage {storage.root} and key prefix = {folder_key}/")

     # TODO: UPath doesn't list the first level files and dirs with "*"
     pattern = "" if isinstance(folderpath, UPath) else "*"
@@ -629,7 +636,10 @@ def from_dir(
         file = File(filepath, run=run, key=file_key, skip_check_exists=True)
         files.append(file)
     settings.verbosity = verbosity
-    logger.
+    logger.success(
+        f"created {len(files)} files from directory using storage"
+        f" {storage.root} and key = {folder_key}/"
+    )
     return files

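The per-call storage hint is dropped; `from_dir()` now reports the storage root and key prefix once, in the final success message. Usage, sketched:

    import lamindb as ln

    files = ln.File.from_dir("./images")  # one File per file in the folder
    ln.save(files)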
@@ -654,7 +664,7 @@ def replace(
         self._clear_storagekey = self.key
         self.key = str(key_path.with_name(new_filename))
         logger.warning(
-            f"
+            f"replacing the file will replace key '{key_path}' with '{self.key}'"
             f" and delete '{key_path}' upon `save()`"
         )
     else:
@@ -716,18 +726,18 @@ def _track_run_input(file: File, is_run_input: Optional[bool] = None):
                 f", adding parent transform {file.transform.id}"
             )
             logger.info(
-                f"
+                f"adding file {file.id} as input for run"
                 f" {context.run.id}{transform_note}"
             )
             track_run_input = True
         else:
             logger.hint(
-                "
+                "track this file as a run input by passing `is_run_input=True`"
             )
     else:
         if settings.track_run_inputs:
             logger.hint(
-                "
+                "you can auto-track this file as a run input by calling"
                 " `ln.track()`"
             )
         else:
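The reworded hints describe the two ways a file becomes a run input: globally via `ln.track()`, or per call. Sketched:

    import lamindb as ln

    ln.track()  # sets a run context; subsequent loads are tracked as inputs
    file = ln.File.filter(description="toy counts").first()
    df = file.load()

    # without a run context, opt in explicitly:
    df = file.load(is_run_input=True)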
@@ -771,9 +781,9 @@ def delete(self, storage: Optional[bool] = None) -> None:
         delete_in_storage = storage

     if delete_in_storage:
-        filepath = self.path
+        filepath = self.path
         delete_storage(filepath)
-        logger.success(f"
+        logger.success(f"deleted stored object {colors.yellow(f'{filepath}')}")
     self._delete_skip_storage()


@@ -802,6 +812,11 @@ def _save_skip_storage(file, *args, **kwargs) -> None:
     if hasattr(file, "_feature_sets"):
         for feature_set in file._feature_sets.values():
             feature_set.save()
+        s = "s" if len(file._feature_sets) > 1 else ""
+        logger.save(
+            f"saved {len(file._feature_sets)} feature set{s} for slot{s}:"
+            f" {list(file._feature_sets.keys())}"
+        )
     super(File, file).save(*args, **kwargs)
     if hasattr(file, "_feature_sets"):
         links = []
@@ -817,11 +832,14 @@ def _save_skip_storage(file, *args, **kwargs) -> None:
     bulk_create(links)


+@property  # type: ignore
+@doc_args(File.path.__doc__)
 def path(self) -> Union[Path, UPath]:
+    """{}"""
     return filepath_from_file(self)


-# adapted from: https://stackoverflow.com/questions/9727673
+# adapted from: https://stackoverflow.com/questions/9727673
 @classmethod  # type: ignore
 @doc_args(File.tree.__doc__)
 def tree(
@@ -831,7 +849,7 @@ def tree(
     level: int = -1,
     limit_to_directories: bool = False,
     length_limit: int = 1000,
-):
+) -> None:
     """{}"""
     space = " "
     branch = "│ "
@@ -842,11 +860,21 @@ def tree(
         dir_path = settings.storage
     else:
         dir_path = path if isinstance(path, (Path, UPath)) else _str_to_path(path)
-
-
+    n_files = 0
+    n_directories = 0
+
+    # by default only including registered files
+    # need a flag and a proper implementation
+    registered_paths: Set[Any] = set()
+    registered_dirs: Set[Any] = set()
+    if path is None:
+        registered_paths = {
+            file.path for file in cls.filter(storage_id=setup_settings.storage.id).all()
+        }
+        registered_dirs = {d for p in registered_paths for d in p.parents}

     def inner(dir_path: Union[Path, UPath], prefix: str = "", level=-1):
-        nonlocal
+        nonlocal n_files, n_directories
         if not level:
             return  # 0, stop iterating
         stripped_dir_path = dir_path.as_posix().rstrip("/")
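The refactor threads `n_files` and `n_directories` through the recursive generator via `nonlocal`; the same pattern in a stripped-down, lamindb-free sketch:

    from pathlib import Path
    from typing import Iterator

    def walk(root: Path) -> None:
        n_files = 0

        def inner(dir_path: Path) -> Iterator[str]:
            nonlocal n_files  # mutate the counter in the enclosing scope
            for p in sorted(dir_path.iterdir()):
                if p.is_dir():
                    yield from inner(p)
                else:
                    n_files += 1
                    yield p.name

        for name in inner(root):
            print(name)
        print(f"{n_files} files")  # final once the generator is exhausted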
@@ -864,22 +892,29 @@ def tree(
         pointers = [tee] * (len(contents) - 1) + [last]
         for pointer, path in zip(pointers, contents):
             if path.is_dir():
+                if registered_dirs and path not in registered_dirs:
+                    continue
                 yield prefix + pointer + path.name
-
+                n_directories += 1
                 extension = branch if pointer == tee else space
                 yield from inner(path, prefix=prefix + extension, level=level - 1)
             elif not limit_to_directories:
+                if registered_paths and path not in registered_paths:
+                    continue
                 yield prefix + pointer + path.name
-
+                n_files += 1

-    folder_tree =
+    folder_tree = ""
     iterator = inner(dir_path, level=level)
     for line in islice(iterator, length_limit):
         folder_tree += f"\n{line}"
     if next(iterator, None):
         folder_tree += f"... length_limit, {length_limit}, reached, counted:"
-
-    print(
+    directory_info = "directory" if n_directories == 1 else "directories"
+    print(
+        f"{dir_path.name} ({n_directories} sub-{directory_info} & {n_files} files):"
+        f" {folder_tree}"
+    )
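Usage of the updated `tree()`: with no `path`, output is restricted to files registered in the current storage location via the `registered_paths` sets above.

    import lamindb as ln

    ln.File.tree()                     # registered files in current storage
    ln.File.tree("./images", level=2)  # any path, limited to two levels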
@@ -922,22 +957,13 @@ def inherit_relations(self, file: File, fields: Optional[List[str]] = None):
     ]

     s = "s" if len(inherit_names) > 1 else ""
-    logger.info(f"
+    logger.info(f"inheriting {len(inherit_names)} field{s}: {inherit_names}")
     for related_name in inherit_names:
         self.__getattribute__(related_name).set(
             file.__getattribute__(related_name).all()
         )


-@property  # type: ignore
-@doc_args(File.features.__doc__)
-def features(self) -> "FeatureManager":
-    """{}"""
-    from lamindb._feature_manager import FeatureManager
-
-    return FeatureManager(self)
-
-
 METHOD_NAMES = [
     "__init__",
     "from_anndata",
@@ -948,7 +974,6 @@ METHOD_NAMES = [
     "delete",
     "save",
     "replace",
-    "path",
     "from_dir",
     "tree",
 ]
@@ -971,5 +996,4 @@ File._save_skip_storage = _save_skip_storage
 # TODO: move these to METHOD_NAMES
 setattr(File, "view_lineage", view_lineage)
 setattr(File, "inherit_relations", inherit_relations)
-
-setattr(File, "features", features)
+setattr(File, "path", path)
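`features` moves off this module (its `FeatureManager` now lives in `lamindb/dev/_feature_manager.py`), while `path` joins the `setattr`-based attachment at the bottom. The attachment pattern, reduced to a self-contained sketch:

    # lamindb overrides stub methods on the lnschema_core classes by
    # re-binding module-level implementations; simplified illustration:
    class Stub:
        def greet(self) -> str:
            raise NotImplementedError

    def greet(self) -> str:
        return "hello"

    setattr(Stub, "greet", greet)
    assert Stub().greet() == "hello"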
lamindb/_filter.py
CHANGED
@@ -1,13 +1,13 @@
 from typing import Type

-from lnschema_core import
+from lnschema_core import Registry

-from lamindb.
+from lamindb._query_set import QuerySet


-def filter(
-    """See :meth:`~lamindb.dev.
-    qs = QuerySet(model=
+def filter(Registry: Type[Registry], **expressions) -> QuerySet:
+    """See :meth:`~lamindb.dev.Registry.filter`."""
+    qs = QuerySet(model=Registry)
     if len(expressions) > 0:
         return qs.filter(**expressions)
     else:
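The wrapper now targets the renamed `Registry` base class and `_query_set` module; behavior is unchanged. From user code, filter expressions are Django field lookups:

    import lamindb as ln

    csvs = ln.File.filter(suffix=".csv", description__contains="toy").all()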
lamindb/_from_values.py
CHANGED
@@ -2,10 +2,9 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 import pandas as pd
 from django.core.exceptions import FieldDoesNotExist
-from django.db.models import Case, When
 from django.db.models.query_utils import DeferredAttribute as Field
 from lamin_utils import colors, logger
-from lnschema_core.models import
+from lnschema_core.models import Feature, Label, Registry
 from lnschema_core.types import ListLike

 from .dev._settings import settings
@@ -18,7 +17,7 @@ def get_or_create_records(
     *,
     from_bionty: bool = False,
     **kwargs,
-) -> List[
+) -> List[Registry]:
     """Get or create records from iterables."""
     upon_create_search_names = settings.upon_create_search_names
     settings.upon_create_search_names = False
@@ -31,10 +30,10 @@ def get_or_create_records(
     types = kwargs.pop("types")
     try:
         field_name = field.field.name
-
+        Registry = field.field.model
         iterable_idx = index_iterable(iterable)

-        if isinstance(
+        if isinstance(Registry, Feature):
             if types is None:
                 raise ValueError("Please pass types as {} or use FeatureSet.from_df()")

@@ -49,6 +48,8 @@ def get_or_create_records(
             records_bionty, unmapped_values = create_records_from_bionty(
                 iterable_idx=nonexist_values, field=field, **kwargs
             )
+            for record in records_bionty:
+                record._from_bionty = True
             records += records_bionty
         else:
             unmapped_values = nonexist_values
@@ -58,19 +59,19 @@ def get_or_create_records(
             params = {field_name: value}
             if types is not None:
                 params["type"] = str(types[value])
-            records.append(
+            records.append(Registry(**params, **kwargs))
         s = "" if len(unmapped_values) == 1 else "s"
-        print_unmapped_values = ", ".join(unmapped_values[:
-        if len(unmapped_values) >
+        print_unmapped_values = ", ".join(unmapped_values[:20])
+        if len(unmapped_values) > 20:
             print_unmapped_values += ", ..."
         additional_info = " "
         if feature is not None:
             additional_info = f" Feature {feature.name} and "
         logger.warning(
-            f"
+            f"did not validate {colors.yellow(f'{len(unmapped_values)} {Registry.__name__} record{s}')} for{additional_info}"  # noqa
             f"{colors.yellow(f'{field_name}{s}')}: {print_unmapped_values}"  # noqa
         )
-        if
+        if Registry.__module__.startswith("lnschema_bionty.") or Registry == Label:
             if isinstance(iterable, pd.Series):
                 feature = iterable.name
             feature_name = None
@@ -82,13 +83,17 @@ def get_or_create_records(
         if feature_name is not None:
             for record in records:
                 record._feature = feature_name
-            logger.
+            logger.debug(f"added default feature '{feature_name}'")
         return records
     finally:
         settings.upon_create_search_names = upon_create_search_names


-def get_existing_records(
+def get_existing_records(
+    iterable_idx: pd.Index,
+    field: Field,
+    kwargs: Dict = {},
+):
     field_name = field.field.name
     model = field.field.model
     condition: Dict = {}
@@ -103,25 +108,6 @@ def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}
         kwargs.update({"species": species_record})
         condition.update({"species__name": species_record.name})

-    # map synonyms based on the DB reference
-    syn_mapper = model.map_synonyms(
-        iterable_idx, species=kwargs.get("species"), return_mapper=True
-    )
-
-    syn_msg = ""
-    if len(syn_mapper) > 0:
-        s = "" if len(syn_mapper) == 1 else "s"
-        names = list(syn_mapper.keys())
-        print_values = ", ".join(names[:5])
-        if len(names) > 5:
-            print_values += ", ..."
-        syn_msg = (
-            "Loaded"
-            f" {colors.green(f'{len(syn_mapper)} {model.__name__} record{s}')} that"  # noqa
-            f" matched {colors.green('synonyms')}: {print_values}"
-        )
-    iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
-
     # get all existing records in the db
     # if necessary, create records for the values in kwargs
     # k:v -> k:v_record
@@ -129,32 +115,31 @@ def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}
     condition.update({f"{field_name}__in": iterable_idx.values})

     query_set = model.filter(**condition)
-
-
-
-
-
-
-
-
-
-
-
+    records = query_set.list()
+
+    # now we have to sort the list of queried records
+    # preserved = Case(
+    #     *[
+    #         When(**{field_name: value}, then=pos)
+    #         for pos, value in enumerate(iterable_idx)
+    #     ]
+    # )
+    # order by causes a factor 10 in runtime
+    # records = query_set.order_by(preserved).list()
+
+    n_name = len(records)
     names = [getattr(record, field_name) for record in records]
-    names = [name for name in names
+    names = [name for name in names]
     if n_name > 0:
         s = "" if n_name == 1 else "s"
-        print_values = ", ".join(names[:
-        if len(names) >
+        print_values = ", ".join(names[:20])
+        if len(names) > 20:
             print_values += ", ..."
-        logger.
-        "
-        f" {colors.green(f'{n_name} {model.__name__} record{s}')}
-        f"
+        logger.success(
+            "validated"
+            f" {colors.green(f'{n_name} {model.__name__} record{s}')}"
+            f" on {colors.green(f'{field_name}')}: {print_values}"
         )
-    # make sure that synonyms logging appears after the field logging
-    if len(syn_mapper) > 0:
-        logger.info(syn_msg)

     existing_values = iterable_idx.intersection(
         query_set.values_list(field_name, flat=True)
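The commented-out block preserves the intent: Django's `Case`/`When` can force query results into the order of the input values, but it was measured at roughly 10x the runtime here, so a plain `.list()` is used and input order is given up. The pattern, spelled out against a placeholder model:

    from django.db.models import Case, When

    symbols = ["TP53", "BRCA1", "EGFR"]
    preserved = Case(
        *[When(symbol=v, then=pos) for pos, v in enumerate(symbols)]
    )
    # Gene stands in for any Django model with a `symbol` field
    records = Gene.objects.filter(symbol__in=symbols).order_by(preserved)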
@@ -183,30 +168,10 @@ def create_records_from_bionty(
     # filter the columns in bionty df based on fields
     bionty_df = _filter_bionty_df_columns(model=model, bionty_object=bionty_object)

-    # map synonyms in the bionty reference
-    try:
-        syn_mapper = bionty_object.map_synonyms(iterable_idx, return_mapper=True)
-    except KeyError:
-        # no synonyms column
-        syn_mapper = {}
-    msg_syn: str = ""
-    if len(syn_mapper) > 0:
-        s = "" if len(syn_mapper) == 1 else "s"
-        names = list(syn_mapper.keys())
-        print_values = ", ".join(names[:5])
-        if len(names) > 5:
-            print_values += ", ..."
-        msg_syn = (
-            "Loaded"
-            f" {colors.purple(f'{len(syn_mapper)} {model.__name__} record{s} from Bionty')} that"  # noqa
-            f" matched {colors.purple('synonyms')}: {print_values}"
-        )
-
-    iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
-
     # create records for values that are found in the bionty reference
     mapped_values = iterable_idx.intersection(bionty_df[field_name])

+    multi_msg = ""
     if len(mapped_values) > 0:
         bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
             keys=mapped_values, column_name=field_name, df=bionty_df
@@ -215,26 +180,24 @@ def create_records_from_bionty(
             records.append(model(**bk, **kwargs))

     # number of records that matches field (not synonyms)
-    n_name = len(records)
+    n_name = len(records)
     names = [getattr(record, field_name) for record in records]
-    names = [name for name in names
+    names = [name for name in names]
     if n_name > 0:
         s = "" if n_name == 1 else "s"
-        print_values = ", ".join(names[:
-        if len(names) >
+        print_values = ", ".join(names[:20])
+        if len(names) > 20:
             print_values += ", ..."
         msg = (
-            "
-            f" {colors.purple(f'{n_name} {model.__name__} record{s} from Bionty')}
-            f"
+            "validated"
+            f" {colors.purple(f'{n_name} {model.__name__} record{s} from Bionty')}"  # noqa
+            f" on {colors.purple(f'{field_name}')}: {print_values}"
         )
-        logger.
-
-
-
-
-        if len(multi_msg) > 0:
-            logger.warning(multi_msg)
+        logger.success(msg)
+
+    # warning about multi matches
+    if len(multi_msg) > 0:
+        logger.warning(multi_msg)

     # return the values that are not found in the bionty reference
     unmapped_values = iterable_idx.difference(mapped_values)
@@ -248,7 +211,7 @@ def index_iterable(iterable: Iterable) -> pd.Index:
     return idx[(idx != "") & (~idx.isnull())]


-def _filter_bionty_df_columns(model:
+def _filter_bionty_df_columns(model: Registry, bionty_object: Any) -> pd.DataFrame:
     bionty_df = pd.DataFrame()
     if bionty_object is not None:
         model_field_names = {i.name for i in model._meta.fields}
@@ -297,18 +260,18 @@ def _bulk_create_dicts_from_df(
     dup = df.index[df.index.duplicated()].unique().tolist()
     if len(dup) > 0:
         s = "" if len(dup) == 1 else "s"
-        print_values = ", ".join(dup[:
-        if len(dup) >
+        print_values = ", ".join(dup[:20])
+        if len(dup) > 20:
             print_values += ", ..."
         multi_msg = (
-            f"
+            f"ambiguous validation in Bionty for {len(dup)} record{s}:"
             f" {print_values}"
        )

     return df.reset_index().to_dict(orient="records"), multi_msg


-def _has_species_field(orm:
+def _has_species_field(orm: Registry) -> bool:
     try:
         orm._meta.get_field("species")
         return True
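These helpers back the public `from_values()` entry points; note that all synonym mapping has moved out of this module into the new `_synonym.py`. End to end, sketched with `lnschema_bionty`:

    import lamindb as ln
    import lnschema_bionty as lb

    # existing records are fetched from the DB, the rest created from the
    # Bionty reference; unmatched values are logged as "did not validate"
    genes = lb.Gene.from_values(["TP53", "BRCA1", "not-a-gene"], lb.Gene.symbol)
    ln.save(genes)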