lamindb 0.49.3__py3-none-any.whl → 0.50.1__py3-none-any.whl

This diff shows the content of publicly available package versions that were released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
lamindb/_file.py CHANGED
@@ -1,6 +1,6 @@
  from itertools import islice
  from pathlib import Path, PurePath, PurePosixPath
- from typing import Any, List, Optional, Tuple, Union
+ from typing import Any, List, Optional, Set, Tuple, Union

  import anndata as ad
  import lamindb_setup
@@ -17,7 +17,6 @@ from lnschema_core import Feature, FeatureSet, File, Run, Storage, ids
  from lnschema_core.types import AnnDataLike, DataLike, PathLike

  from lamindb._context import context
- from lamindb.dev import FeatureManager
  from lamindb.dev._settings import settings
  from lamindb.dev.hashing import b16_to_b64, hash_file
  from lamindb.dev.storage import (
@@ -33,6 +32,7 @@ from lamindb.dev.storage.file import (
  ProgressCallback,
  _str_to_path,
  auto_storage_key_from_file,
+ extract_suffix_from_path,
  filepath_from_file,
  )
  from lamindb.dev.utils import attach_func_to_class_method
@@ -76,7 +76,7 @@ def process_pathlike(
  new_root = list(filepath.parents)[-1]
  new_root_str = new_root.as_posix()
  logger.warning(
- f"Creating new storage location for root: {new_root_str}"
+ f"creating new storage location for root: {new_root_str}"
  )
  storage_settings = StorageSettings(new_root_str)
  register_storage(storage_settings)
@@ -110,7 +110,7 @@ def process_data(
  storage, use_existing_storage_key = process_pathlike(
  filepath, skip_existence_check=skip_existence_check
  )
- suffix = suffix = "".join(filepath.suffixes)
+ suffix = extract_suffix_from_path(filepath)
  memory_rep = None
  elif isinstance(data, (pd.DataFrame, AnnData)): # DataLike, spelled out
  storage = lamindb_setup.settings.storage.record
@@ -162,7 +162,7 @@ def get_hash(
  hash = f"{b16_to_b64(stripped_etag)}-{suffix}"
  hash_type = "md5-n" # this is the S3 chunk-hashing strategy
  else:
- logger.warning(f"Did not add hash for {filepath}")
+ logger.warning(f"did not add hash for {filepath}")
  return None, None
  else:
  hash, hash_type = hash_file(filepath)
@@ -171,20 +171,20 @@ def get_hash(
  result = File.filter(hash=hash).list()
  if len(result) > 0:
  if settings.upon_file_create_if_hash_exists == "error":
- msg = f"A file with same hash exists: {result[0]}"
+ msg = f"file with same hash exists: {result[0]}"
  hint = (
- "💡 You can make this error a warning:\n"
+ "💡 you can make this error a warning:\n"
  " ln.settings.upon_file_create_if_hash_exists"
  )
  raise RuntimeError(f"{msg}\n{hint}")
  elif settings.upon_file_create_if_hash_exists == "warn_create_new":
  logger.warning(
- "Creating new File object despite existing file with same hash:"
+ "creating new File object despite existing file with same hash:"
  f" {result[0]}"
  )
  return hash, hash_type
  else:
- logger.warning(f"Returning existing file with same hash: {result[0]}")
+ logger.warning(f"returning existing file with same hash: {result[0]}")
  return result[0]
  else:
  return hash, hash_type
@@ -295,7 +295,7 @@ def get_relative_path_to_directory(
  elif isinstance(directory, PurePath):
  relpath = path.relative_to(directory)
  else:
- raise TypeError("directory not of type Path or UPath")
+ raise TypeError("Directory not of type Path or UPath")
  return relpath


@@ -377,13 +377,13 @@ def log_storage_hint(
  ) -> None:
  hint = ""
  if check_path_in_storage:
- hint += f"file in storage {storage.root}" # type: ignore
+ hint += f"file in storage '{storage.root}'" # type: ignore
  else:
  hint += "file will be copied to default storage upon `save()`"
  if key is None:
- hint += f" with key = {id}{suffix}"
+ hint += f" with key '{id}{suffix}'"
  else:
- hint += f" with key = {key}"
+ hint += f" with key '{key}'"
  logger.hint(hint)


@@ -441,7 +441,7 @@ def __init__(file: File, *args, **kwargs):
  if name is not None and description is not None:
  raise ValueError("Only pass description, do not pass a name")
  if name is not None:
- logger.warning("Argument `name` is deprecated, please use `description`")
+ logger.warning("argument `name` is deprecated, please use `description`")
  description = name

  provisional_id = ids.base62_20()
@@ -473,15 +473,15 @@ def __init__(file: File, *args, **kwargs):
  if isinstance(data, pd.DataFrame):
  if log_hint:
  logger.hint(
- "This is a dataframe, consider using File.from_df() to link column"
- " names as features!"
+ "file is a dataframe, consider using File.from_df() to link column"
+ " names as features"
  )
  kwargs["accessor"] = "DataFrame"
  elif data_is_anndata(data):
  if log_hint:
  logger.hint(
- "This is AnnDataLike, consider using File.from_anndata() to link"
- " var_names and obs.columns as features!"
+ "file is AnnDataLike, consider using File.from_anndata() to link"
+ " var_names and obs.columns as features"
  )
  kwargs["accessor"] = "AnnData"
  elif data_is_mudata(data):
@@ -524,7 +524,10 @@ def from_df(
  """{}"""
  file = File(data=df, key=key, run=run, description=description, log_hint=False)
  feature_set = FeatureSet.from_df(df)
- file._feature_sets = {"columns": feature_set}
+ if feature_set is not None:
+ file._feature_sets = {"columns": feature_set}
+ else:
+ file._feature_sets = {}
  return file


@@ -554,20 +557,25 @@ def from_anndata(
  else:
  type = convert_numpy_dtype_to_lamin_feature_type(adata.X.dtype)
  feature_sets = {}
- logger.info("Parsing feature names of X, stored in slot .var")
+ logger.info("parsing feature names of X stored in slot 'var'")
  logger.indent = " "
- feature_set_x = FeatureSet.from_values(
+ feature_set_var = FeatureSet.from_values(
  data_parse.var.index,
  var_ref,
  type=type,
  )
- feature_sets["var"] = feature_set_x
+
+ if feature_set_var is not None:
+ feature_sets["var"] = feature_set_var
+ logger.save(f"linked: {feature_set_var}")
  logger.indent = ""
  if len(data_parse.obs.columns) > 0:
- logger.info("Parsing feature names of slot .obs")
+ logger.info("parsing feature names of slot 'obs'")
  logger.indent = " "
  feature_set_obs = FeatureSet.from_df(data_parse.obs)
- feature_sets["obs"] = feature_set_obs
+ if feature_set_obs is not None:
+ feature_sets["obs"] = feature_set_obs
+ logger.save(f"linked: {feature_set_obs}")
  logger.indent = ""
  file._feature_sets = feature_sets
  return file
@@ -598,7 +606,7 @@ def from_dir(
  if key is None:
  if not use_existing_storage:
  logger.warning(
- "Folder is outside existing storage location, will copy files from"
+ "folder is outside existing storage location, will copy files from"
  f" {path} to {storage}/{folderpath.name}"
  )
  folder_key_path = Path(folderpath.name)
@@ -612,7 +620,6 @@

  # always sanitize by stripping a trailing slash
  folder_key = folder_key_path.as_posix().rstrip("/")
- logger.hint(f"using storage {storage.root} and key prefix = {folder_key}/")

  # TODO: UPath doesn't list the first level files and dirs with "*"
  pattern = "" if isinstance(folderpath, UPath) else "*"
@@ -629,7 +636,10 @@
  file = File(filepath, run=run, key=file_key, skip_check_exists=True)
  files.append(file)
  settings.verbosity = verbosity
- logger.info(f"→ {len(files)} files")
+ logger.success(
+ f"created {len(files)} files from directory using storage"
+ f" {storage.root} and key = {folder_key}/"
+ )
  return files


@@ -654,7 +664,7 @@ def replace(
  self._clear_storagekey = self.key
  self.key = str(key_path.with_name(new_filename))
  logger.warning(
- f"Replacing the file will replace key '{key_path}' with '{self.key}'"
+ f"replacing the file will replace key '{key_path}' with '{self.key}'"
  f" and delete '{key_path}' upon `save()`"
  )
  else:
@@ -716,18 +726,18 @@ def _track_run_input(file: File, is_run_input: Optional[bool] = None):
  f", adding parent transform {file.transform.id}"
  )
  logger.info(
- f"Adding file {file.id} as input for run"
+ f"adding file {file.id} as input for run"
  f" {context.run.id}{transform_note}"
  )
  track_run_input = True
  else:
  logger.hint(
- "Track this file as a run input by passing `is_run_input=True`"
+ "track this file as a run input by passing `is_run_input=True`"
  )
  else:
  if settings.track_run_inputs:
  logger.hint(
- "You can auto-track this file as a run input by calling"
+ "you can auto-track this file as a run input by calling"
  " `ln.track()`"
  )
  else:
@@ -771,9 +781,9 @@ def delete(self, storage: Optional[bool] = None) -> None:
  delete_in_storage = storage

  if delete_in_storage:
- filepath = self.path()
+ filepath = self.path
  delete_storage(filepath)
- logger.success(f"Deleted stored object {colors.yellow(f'{filepath}')}")
+ logger.success(f"deleted stored object {colors.yellow(f'{filepath}')}")
  self._delete_skip_storage()


@@ -802,6 +812,11 @@ def _save_skip_storage(file, *args, **kwargs) -> None:
  if hasattr(file, "_feature_sets"):
  for feature_set in file._feature_sets.values():
  feature_set.save()
+ s = "s" if len(file._feature_sets) > 1 else ""
+ logger.save(
+ f"saved {len(file._feature_sets)} feature set{s} for slot{s}:"
+ f" {list(file._feature_sets.keys())}"
+ )
  super(File, file).save(*args, **kwargs)
  if hasattr(file, "_feature_sets"):
  links = []
@@ -817,11 +832,14 @@ def _save_skip_storage(file, *args, **kwargs) -> None:
  bulk_create(links)


+ @property # type: ignore
+ @doc_args(File.path.__doc__)
  def path(self) -> Union[Path, UPath]:
+ """{}"""
  return filepath_from_file(self)


- # adapted from: https://stackoverflow.com/questions/9727673/list-directory-tree-structure-in-python # noqa
+ # adapted from: https://stackoverflow.com/questions/9727673
  @classmethod # type: ignore
  @doc_args(File.tree.__doc__)
  def tree(
@@ -831,7 +849,7 @@ def tree(
  level: int = -1,
  limit_to_directories: bool = False,
  length_limit: int = 1000,
- ):
+ ) -> None:
  """{}"""
  space = " "
  branch = "│ "
@@ -842,11 +860,21 @@ def tree(
  dir_path = settings.storage
  else:
  dir_path = path if isinstance(path, (Path, UPath)) else _str_to_path(path)
- files = 0
- directories = 0
+ n_files = 0
+ n_directories = 0
+
+ # by default only including registered files
+ # need a flag and a proper implementation
+ registered_paths: Set[Any] = set()
+ registered_dirs: Set[Any] = set()
+ if path is None:
+ registered_paths = {
+ file.path for file in cls.filter(storage_id=setup_settings.storage.id).all()
+ }
+ registered_dirs = {d for p in registered_paths for d in p.parents}

  def inner(dir_path: Union[Path, UPath], prefix: str = "", level=-1):
- nonlocal files, directories
+ nonlocal n_files, n_directories
  if not level:
  return # 0, stop iterating
  stripped_dir_path = dir_path.as_posix().rstrip("/")
@@ -864,22 +892,29 @@ def tree(
  pointers = [tee] * (len(contents) - 1) + [last]
  for pointer, path in zip(pointers, contents):
  if path.is_dir():
+ if registered_dirs and path not in registered_dirs:
+ continue
  yield prefix + pointer + path.name
- directories += 1
+ n_directories += 1
  extension = branch if pointer == tee else space
  yield from inner(path, prefix=prefix + extension, level=level - 1)
  elif not limit_to_directories:
+ if registered_paths and path not in registered_paths:
+ continue
  yield prefix + pointer + path.name
- files += 1
+ n_files += 1

- folder_tree = f"{dir_path.name}"
+ folder_tree = ""
  iterator = inner(dir_path, level=level)
  for line in islice(iterator, length_limit):
  folder_tree += f"\n{line}"
  if next(iterator, None):
  folder_tree += f"... length_limit, {length_limit}, reached, counted:"
- print(folder_tree)
- print(f"\n{directories} directories" + (f", {files} files" if files else ""))
+ directory_info = "directory" if n_directories == 1 else "directories"
+ print(
+ f"{dir_path.name} ({n_directories} sub-{directory_info} & {n_files} files):"
+ f" {folder_tree}"
+ )


  def inherit_relations(self, file: File, fields: Optional[List[str]] = None):
@@ -922,22 +957,13 @@ def inherit_relations(self, file: File, fields: Optional[List[str]] = None):
  ]

  s = "s" if len(inherit_names) > 1 else ""
- logger.info(f"Inheriting {len(inherit_names)} field{s}: {inherit_names}")
+ logger.info(f"inheriting {len(inherit_names)} field{s}: {inherit_names}")
  for related_name in inherit_names:
  self.__getattribute__(related_name).set(
  file.__getattribute__(related_name).all()
  )


- @property # type: ignore
- @doc_args(File.features.__doc__)
- def features(self) -> "FeatureManager":
- """{}"""
- from lamindb._feature_manager import FeatureManager
-
- return FeatureManager(self)
-
-
  METHOD_NAMES = [
  "__init__",
  "from_anndata",
@@ -948,7 +974,6 @@ METHOD_NAMES = [
  "delete",
  "save",
  "replace",
- "path",
  "from_dir",
  "tree",
  ]
@@ -971,5 +996,4 @@ File._save_skip_storage = _save_skip_storage
  # TODO: move these to METHOD_NAMES
  setattr(File, "view_lineage", view_lineage)
  setattr(File, "inherit_relations", inherit_relations)
- # property signature is not tested:
- setattr(File, "features", features)
+ setattr(File, "path", path)
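
User-facing, this file turns `File.path` into a property, makes `File.from_df()` and `File.from_anndata()` tolerate an empty feature set, and restricts `File.tree()` to registered files by default. A minimal sketch of the 0.50-style calls, assuming a configured lamindb instance; the toy dataframe and description are invented for illustration:

    import lamindb as ln
    import pandas as pd

    # hypothetical toy data; from_df() links column names as a "columns" feature set
    df = pd.DataFrame({"CD8A": [1, 2], "CD4": [3, 4]})
    file = ln.File.from_df(df, description="my dataset")
    file.save()

    # 0.50 exposes path as a property; 0.49 required calling file.path()
    print(file.path)
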
lamindb/_filter.py CHANGED
@@ -1,13 +1,13 @@
  from typing import Type

- from lnschema_core import ORM
+ from lnschema_core import Registry

- from lamindb._queryset import QuerySet
+ from lamindb._query_set import QuerySet


- def filter(ORM: Type[ORM], **expressions) -> QuerySet:
- """See :meth:`~lamindb.dev.ORM.filter`."""
- qs = QuerySet(model=ORM)
+ def filter(Registry: Type[Registry], **expressions) -> QuerySet:
+ """See :meth:`~lamindb.dev.Registry.filter`."""
+ qs = QuerySet(model=Registry)
  if len(expressions) > 0:
  return qs.filter(**expressions)
  else:
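
The rename of the base class from `ORM` to `Registry` (and of the module `lamindb._queryset` to `lamindb._query_set`) leaves the calling convention untouched: `filter()` still returns a `QuerySet`. A minimal sketch, assuming a configured instance; the description value is invented:

    import lamindb as ln

    # filter() is attached to Registry subclasses such as File
    qs = ln.File.filter(description="my dataset")
    print(qs.list())  # materialize the QuerySet as a list of records
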
lamindb/_from_values.py CHANGED
@@ -2,10 +2,9 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

  import pandas as pd
  from django.core.exceptions import FieldDoesNotExist
- from django.db.models import Case, When
  from django.db.models.query_utils import DeferredAttribute as Field
  from lamin_utils import colors, logger
- from lnschema_core.models import ORM, Feature, Label
+ from lnschema_core.models import Feature, Label, Registry
  from lnschema_core.types import ListLike

  from .dev._settings import settings
@@ -18,7 +17,7 @@ def get_or_create_records(
  *,
  from_bionty: bool = False,
  **kwargs,
- ) -> List[ORM]:
+ ) -> List[Registry]:
  """Get or create records from iterables."""
  upon_create_search_names = settings.upon_create_search_names
  settings.upon_create_search_names = False
@@ -31,10 +30,10 @@
  types = kwargs.pop("types")
  try:
  field_name = field.field.name
- ORM = field.field.model
+ Registry = field.field.model
  iterable_idx = index_iterable(iterable)

- if isinstance(ORM, Feature):
+ if isinstance(Registry, Feature):
  if types is None:
  raise ValueError("Please pass types as {} or use FeatureSet.from_df()")

@@ -49,6 +48,8 @@
  records_bionty, unmapped_values = create_records_from_bionty(
  iterable_idx=nonexist_values, field=field, **kwargs
  )
+ for record in records_bionty:
+ record._from_bionty = True
  records += records_bionty
  else:
  unmapped_values = nonexist_values
@@ -58,19 +59,19 @@
  params = {field_name: value}
  if types is not None:
  params["type"] = str(types[value])
- records.append(ORM(**params, **kwargs))
+ records.append(Registry(**params, **kwargs))
  s = "" if len(unmapped_values) == 1 else "s"
- print_unmapped_values = ", ".join(unmapped_values[:5])
- if len(unmapped_values) > 10:
+ print_unmapped_values = ", ".join(unmapped_values[:20])
+ if len(unmapped_values) > 20:
  print_unmapped_values += ", ..."
  additional_info = " "
  if feature is not None:
  additional_info = f" Feature {feature.name} and "
  logger.warning(
- f"Created {colors.yellow(f'{len(unmapped_values)} {ORM.__name__} record{s}')} for{additional_info}" # noqa
+ f"did not validate {colors.yellow(f'{len(unmapped_values)} {Registry.__name__} record{s}')} for{additional_info}" # noqa
  f"{colors.yellow(f'{field_name}{s}')}: {print_unmapped_values}" # noqa
  )
- if ORM.__module__.startswith("lnschema_bionty.") or ORM == Label:
+ if Registry.__module__.startswith("lnschema_bionty.") or Registry == Label:
  if isinstance(iterable, pd.Series):
  feature = iterable.name
  feature_name = None
@@ -82,13 +83,17 @@
  if feature_name is not None:
  for record in records:
  record._feature = feature_name
- logger.hint(f"Added default feature '{feature_name}'")
+ logger.debug(f"added default feature '{feature_name}'")
  return records
  finally:
  settings.upon_create_search_names = upon_create_search_names


- def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}):
+ def get_existing_records(
+ iterable_idx: pd.Index,
+ field: Field,
+ kwargs: Dict = {},
+ ):
  field_name = field.field.name
  model = field.field.model
  condition: Dict = {}
@@ -103,25 +108,6 @@ def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}
  kwargs.update({"species": species_record})
  condition.update({"species__name": species_record.name})

- # map synonyms based on the DB reference
- syn_mapper = model.map_synonyms(
- iterable_idx, species=kwargs.get("species"), return_mapper=True
- )
-
- syn_msg = ""
- if len(syn_mapper) > 0:
- s = "" if len(syn_mapper) == 1 else "s"
- names = list(syn_mapper.keys())
- print_values = ", ".join(names[:5])
- if len(names) > 5:
- print_values += ", ..."
- syn_msg = (
- "Loaded"
- f" {colors.green(f'{len(syn_mapper)} {model.__name__} record{s}')} that" # noqa
- f" matched {colors.green('synonyms')}: {print_values}"
- )
- iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
-
  # get all existing records in the db
  # if necessary, create records for the values in kwargs
  # k:v -> k:v_record
@@ -129,32 +115,31 @@ def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}
  condition.update({f"{field_name}__in": iterable_idx.values})

  query_set = model.filter(**condition)
-
- # new we have to sort the list of queried records
- preserved = Case(
- *[
- When(**{field_name: value}, then=pos)
- for pos, value in enumerate(iterable_idx)
- ]
- )
- records = query_set.order_by(preserved).list()
-
- n_name = len(records) - len(syn_mapper)
+ records = query_set.list()
+
+ # now we have to sort the list of queried records
+ # preserved = Case(
+ # *[
+ # When(**{field_name: value}, then=pos)
+ # for pos, value in enumerate(iterable_idx)
+ # ]
+ # )
+ # order by causes a factor 10 in runtime
+ # records = query_set.order_by(preserved).list()
+
+ n_name = len(records)
  names = [getattr(record, field_name) for record in records]
- names = [name for name in names if name not in syn_mapper.values()]
+ names = [name for name in names]
  if n_name > 0:
  s = "" if n_name == 1 else "s"
- print_values = ", ".join(names[:5])
- if len(names) > 5:
+ print_values = ", ".join(names[:20])
+ if len(names) > 20:
  print_values += ", ..."
- logger.info(
- "Loaded"
- f" {colors.green(f'{n_name} {model.__name__} record{s}')} that"
- f" matched {colors.green(f'{field_name}')}: {print_values}"
+ logger.success(
+ "validated"
+ f" {colors.green(f'{n_name} {model.__name__} record{s}')}"
+ f" on {colors.green(f'{field_name}')}: {print_values}"
  )
- # make sure that synonyms logging appears after the field logging
- if len(syn_msg) > 0:
- logger.info(syn_msg)

  existing_values = iterable_idx.intersection(
  query_set.values_list(field_name, flat=True)
@@ -183,30 +168,10 @@ def create_records_from_bionty(
  # filter the columns in bionty df based on fields
  bionty_df = _filter_bionty_df_columns(model=model, bionty_object=bionty_object)

- # map synonyms in the bionty reference
- try:
- syn_mapper = bionty_object.map_synonyms(iterable_idx, return_mapper=True)
- except KeyError:
- # no synonyms column
- syn_mapper = {}
- msg_syn: str = ""
- if len(syn_mapper) > 0:
- s = "" if len(syn_mapper) == 1 else "s"
- names = list(syn_mapper.keys())
- print_values = ", ".join(names[:5])
- if len(names) > 5:
- print_values += ", ..."
- msg_syn = (
- "Loaded"
- f" {colors.purple(f'{len(syn_mapper)} {model.__name__} record{s} from Bionty')} that" # noqa
- f" matched {colors.purple('synonyms')}: {print_values}"
- )
-
- iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
-
  # create records for values that are found in the bionty reference
  mapped_values = iterable_idx.intersection(bionty_df[field_name])

+ multi_msg = ""
  if len(mapped_values) > 0:
  bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
  keys=mapped_values, column_name=field_name, df=bionty_df
@@ -215,26 +180,24 @@
  records.append(model(**bk, **kwargs))

  # number of records that matches field (not synonyms)
- n_name = len(records) - len(syn_mapper)
+ n_name = len(records)
  names = [getattr(record, field_name) for record in records]
- names = [name for name in names if name not in syn_mapper.values()]
+ names = [name for name in names]
  if n_name > 0:
  s = "" if n_name == 1 else "s"
- print_values = ", ".join(names[:5])
- if len(names) > 5:
+ print_values = ", ".join(names[:20])
+ if len(names) > 20:
  print_values += ", ..."
  msg = (
- "Loaded"
- f" {colors.purple(f'{n_name} {model.__name__} record{s} from Bionty')} that" # noqa
- f" matched {colors.purple(f'{field_name}')}: {print_values}"
+ "validated"
+ f" {colors.purple(f'{n_name} {model.__name__} record{s} from Bionty')}" # noqa
+ f" on {colors.purple(f'{field_name}')}: {print_values}"
  )
- logger.info(msg)
- # make sure that synonyms logging appears after the field logging
- if len(msg_syn) > 0:
- logger.info(msg_syn)
- # warning about multi matches
- if len(multi_msg) > 0:
- logger.warning(multi_msg)
+ logger.success(msg)
+
+ # warning about multi matches
+ if len(multi_msg) > 0:
+ logger.warning(multi_msg)

  # return the values that are not found in the bionty reference
  unmapped_values = iterable_idx.difference(mapped_values)
@@ -248,7 +211,7 @@ def index_iterable(iterable: Iterable) -> pd.Index:
  return idx[(idx != "") & (~idx.isnull())]


- def _filter_bionty_df_columns(model: ORM, bionty_object: Any) -> pd.DataFrame:
+ def _filter_bionty_df_columns(model: Registry, bionty_object: Any) -> pd.DataFrame:
  bionty_df = pd.DataFrame()
  if bionty_object is not None:
  model_field_names = {i.name for i in model._meta.fields}
@@ -297,18 +260,18 @@ def _bulk_create_dicts_from_df(
  dup = df.index[df.index.duplicated()].unique().tolist()
  if len(dup) > 0:
  s = "" if len(dup) == 1 else "s"
- print_values = ", ".join(dup[:5])
- if len(dup) > 5:
+ print_values = ", ".join(dup[:20])
+ if len(dup) > 20:
  print_values += ", ..."
  multi_msg = (
- f"Multiple matches found in Bionty for {len(dup)} record{s}:"
+ f"ambiguous validation in Bionty for {len(dup)} record{s}:"
  f" {print_values}"
  )

  return df.reset_index().to_dict(orient="records"), multi_msg


- def _has_species_field(orm: ORM) -> bool:
+ def _has_species_field(orm: Registry) -> bool:
  try:
  orm._meta.get_field("species")
  return True
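
Besides the `ORM` → `Registry` rename, this file drops synonym mapping from record creation, rewords the log messages around "validated"/"did not validate", and raises the preview truncation of printed values from 5 to 20. A hypothetical helper mirroring the truncation pattern that the module now repeats inline (the function name is invented for illustration):

    def preview_values(names: list) -> str:
        # show at most 20 values, then an ellipsis, as in the new logging code
        print_values = ", ".join(names[:20])
        if len(names) > 20:
            print_values += ", ..."
        return print_values

    print(preview_values([f"gene{i}" for i in range(25)]))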