lamindb 1.6.2__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (62)
  1. lamindb/__init__.py +1 -3
  2. lamindb/_finish.py +32 -16
  3. lamindb/base/types.py +6 -4
  4. lamindb/core/_context.py +127 -57
  5. lamindb/core/_mapped_collection.py +1 -1
  6. lamindb/core/_settings.py +44 -4
  7. lamindb/core/_track_environment.py +5 -2
  8. lamindb/core/loaders.py +1 -1
  9. lamindb/core/storage/_anndata_accessor.py +1 -1
  10. lamindb/core/storage/_tiledbsoma.py +14 -8
  11. lamindb/core/storage/_valid_suffixes.py +0 -1
  12. lamindb/core/storage/_zarr.py +1 -1
  13. lamindb/core/storage/objects.py +13 -8
  14. lamindb/core/storage/paths.py +9 -6
  15. lamindb/core/types.py +1 -1
  16. lamindb/curators/_legacy.py +2 -1
  17. lamindb/curators/core.py +106 -105
  18. lamindb/errors.py +9 -0
  19. lamindb/examples/fixtures/__init__.py +0 -0
  20. lamindb/examples/fixtures/sheets.py +224 -0
  21. lamindb/migrations/0103_remove_writelog_migration_state_and_more.py +1 -1
  22. lamindb/migrations/0105_record_unique_name.py +20 -0
  23. lamindb/migrations/0106_transfer_data_migration.py +25 -0
  24. lamindb/migrations/0107_add_schema_to_record.py +68 -0
  25. lamindb/migrations/0108_remove_record_sheet_remove_sheetproject_sheet_and_more.py +30 -0
  26. lamindb/migrations/0109_record_input_of_runs_alter_record_run_and_more.py +123 -0
  27. lamindb/migrations/0110_rename_values_artifacts_record_linked_artifacts.py +17 -0
  28. lamindb/migrations/0111_remove_record__sort_order.py +148 -0
  29. lamindb/migrations/0112_alter_recordartifact_feature_and_more.py +105 -0
  30. lamindb/migrations/0113_lower_case_branch_and_space_names.py +62 -0
  31. lamindb/migrations/0114_alter_run__status_code.py +24 -0
  32. lamindb/migrations/0115_alter_space_uid.py +52 -0
  33. lamindb/migrations/{0104_squashed.py → 0115_squashed.py} +261 -257
  34. lamindb/models/__init__.py +4 -3
  35. lamindb/models/_describe.py +88 -31
  36. lamindb/models/_feature_manager.py +627 -658
  37. lamindb/models/_label_manager.py +1 -3
  38. lamindb/models/artifact.py +214 -99
  39. lamindb/models/collection.py +7 -1
  40. lamindb/models/feature.py +288 -60
  41. lamindb/models/has_parents.py +3 -3
  42. lamindb/models/project.py +32 -15
  43. lamindb/models/query_manager.py +7 -1
  44. lamindb/models/query_set.py +118 -41
  45. lamindb/models/record.py +140 -94
  46. lamindb/models/run.py +42 -42
  47. lamindb/models/save.py +102 -16
  48. lamindb/models/schema.py +41 -8
  49. lamindb/models/sqlrecord.py +105 -40
  50. lamindb/models/storage.py +278 -0
  51. lamindb/models/transform.py +10 -2
  52. lamindb/models/ulabel.py +9 -1
  53. lamindb/py.typed +0 -0
  54. lamindb/setup/__init__.py +2 -1
  55. lamindb/setup/_switch.py +16 -0
  56. lamindb/setup/errors/__init__.py +4 -0
  57. lamindb/setup/types/__init__.py +4 -0
  58. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/METADATA +5 -5
  59. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/RECORD +61 -44
  60. lamindb/models/core.py +0 -135
  61. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/LICENSE +0 -0
  62. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/WHEEL +0 -0
@@ -14,9 +14,11 @@ from anndata import AnnData
  from django.contrib.postgres.aggregates import ArrayAgg
  from django.db import connections
  from django.db.models import Aggregate, ProtectedError, Subquery
+ from django.db.utils import IntegrityError
  from lamin_utils import logger
  from lamindb_setup.core.hashing import hash_set
  from lamindb_setup.core.upath import create_path
+ from lamindb_setup.errors import ModuleWasntConfigured
  from rich.table import Column, Table
  from rich.text import Text
 
@@ -31,6 +33,7 @@ from lamindb.models.save import save
  from lamindb.models.schema import DICT_KEYS_TYPE, Schema
  from lamindb.models.sqlrecord import (
  REGISTRY_UNIQUE_FIELD,
+ Registry,
  get_name_field,
  transfer_fk_to_default_db_bulk,
  transfer_to_default_db,
@@ -50,7 +53,6 @@ from ._relations import (
  dict_related_model_to_related_name,
  )
  from .feature import Feature, FeatureValue, parse_dtype
- from .run import FeatureManager, FeatureManagerRun, Run
  from .sqlrecord import SQLRecord
  from .ulabel import ULabel
 
@@ -65,11 +67,7 @@ if TYPE_CHECKING:
  )
  from lamindb.models.query_set import QuerySet
 
-
- class FeatureManagerArtifact(FeatureManager):
- """Feature manager."""
-
- pass
+ from .run import Run
 
 
  def get_accessor_by_registry_(host: Artifact | Collection) -> dict:
@@ -82,7 +80,7 @@ def get_accessor_by_registry_(host: Artifact | Collection) -> dict:
  return dictionary
 
 
- def get_schema_by_slot_(host: Artifact) -> dict:
+ def get_schema_by_slot_(host: Artifact) -> dict[str, Schema]:
  # if the host is not yet saved
  if host._state.adding:
  if hasattr(host, "_staged_feature_sets"):
@@ -325,7 +323,7 @@ def describe_features(
  feature_data[feature_name] = (slot, registry_str)
  schema_data.update(
  {
- slot: (schema, schema.n)
+ slot: (schema, schema.n) # type: ignore
  for slot, schema in get_schema_by_slot_(self).items()
  if slot not in schema_data
  }
@@ -569,40 +567,9 @@ def infer_feature_type_convert_json(
  return "?", value, message
 
 
- def __init__(self, host: Artifact | Collection | Run):
- self._host = host
- self._slots = None
- self._accessor_by_registry_ = None
-
-
- def __repr__(self) -> str:
- return describe(self, return_str=True) # type: ignore
-
-
- def describe(self, return_str: bool = False) -> str | None:
- tree = describe_features(self._host) # type: ignore
- return format_rich_tree(tree, fallback="no linked features", return_str=return_str)
-
-
- def get_values(self) -> dict[str, Any]:
- """Get feature values as a dictionary."""
- return describe_features(self._host, to_dict=True) # type: ignore
-
-
- @deprecated("slots[slot].members")
- def __getitem__(self, slot) -> QuerySet:
- if slot not in self.slots:
- raise ValueError(
- f"No linked feature set for slot: {slot}\nDid you get validation"
- " warnings? Only features that match registered features get validated"
- " and linked."
- )
- schema = self.slots[slot]
- orm_name = schema.itype
- return getattr(schema, self._accessor_by_registry[orm_name]).all()
-
-
- def filter_base(cls, _skip_validation: bool = True, **expression) -> QuerySet:
+ def filter_base(
+ registry: Registry, _skip_validation: bool = True, **expression
+ ) -> QuerySet:
  from .artifact import Artifact
 
  model = Feature
@@ -629,7 +596,7 @@ def filter_base(cls, _skip_validation: bool = True, **expression) -> QuerySet:
  "list[cat"
  ):
  if comparator == "__isnull":
- if cls == FeatureManagerArtifact:
+ if registry is Artifact:
  from .artifact import ArtifactFeatureValue
 
  if value: # True
@@ -659,7 +626,7 @@ def filter_base(cls, _skip_validation: bool = True, **expression) -> QuerySet:
  # categorical features
  elif isinstance(value, (str, SQLRecord, bool)):
  if comparator == "__isnull":
- if cls == FeatureManagerArtifact:
+ if registry is Artifact:
  result = parse_dtype(feature.dtype)[0]
  kwargs = {
  f"links_{result['registry'].__name__.lower()}__feature": feature
@@ -709,380 +676,10 @@ def filter_base(cls, _skip_validation: bool = True, **expression) -> QuerySet:
709
676
  # https://laminlabs.slack.com/archives/C04FPE8V01W/p1688328084810609
710
677
  if not (new_expression):
711
678
  raise NotImplementedError
712
- if cls == FeatureManagerArtifact:
713
- return Artifact.objects.filter(**new_expression)
714
- elif cls == FeatureManagerRun:
715
- return Run.objects.filter(**new_expression)
716
-
717
-
718
- @classmethod # type: ignore
719
- @deprecated("the filter() registry classmethod")
720
- def filter(cls, **expression) -> QuerySet:
721
- """Query artifacts by features."""
722
- return filter_base(cls, _skip_validation=False, **expression)
723
-
724
-
725
- @classmethod # type: ignore
726
- @deprecated("the filter() registry classmethod")
727
- def get(cls, **expression) -> SQLRecord:
728
- """Query a single artifact by feature."""
729
- return filter_base(cls, _skip_validation=False, **expression).one()
730
-
731
-
732
- @property # type: ignore
733
- def slots(self) -> dict[str, Schema]:
734
- """Schema by slot.
735
-
736
- Example::
737
-
738
- artifact.features.slots
739
- #> {'var': <Schema: var>, 'obs': <Schema: obs>}
740
- """
741
- if self._slots is None:
742
- self._slots = get_schema_by_slot_(self._host)
743
- return self._slots
744
-
745
-
746
- @property # type: ignore
747
- def _accessor_by_registry(self):
748
- """Accessor by ORM."""
749
- if self._accessor_by_registry_ is None:
750
- self._accessor_by_registry_ = get_accessor_by_registry_(self._host)
751
- return self._accessor_by_registry_
752
-
753
-
754
- def add_label_feature_links(
755
- self,
756
- features_labels,
757
- *,
758
- label_ref_is_name: bool | None = None,
759
- feature_ref_is_name: bool | None = None,
760
- ):
761
- if list(features_labels.keys()) != ["ULabel"]:
762
- related_names = dict_related_model_to_related_name(self._host.__class__)
763
- else:
764
- related_names = {"ULabel": "ulabels"}
765
- for class_name, registry_features_labels in features_labels.items():
766
- related_name = related_names[class_name] # e.g., "ulabels"
767
- IsLink = getattr(self._host, related_name).through
768
- field_name = f"{get_link_attr(IsLink, self._host)}_id" # e.g., ulabel_id
769
- links = [
770
- IsLink(
771
- **{
772
- "artifact_id": self._host.id,
773
- "feature_id": feature.id,
774
- field_name: label.id,
775
- "feature_ref_is_name": feature_ref_is_name,
776
- "label_ref_is_name": label_ref_is_name,
777
- }
778
- )
779
- for (feature, label) in registry_features_labels
780
- ]
781
- # a link might already exist
782
- try:
783
- save(links, ignore_conflicts=False)
784
- except Exception:
785
- save(links, ignore_conflicts=True)
786
- # now delete links that were previously saved without a feature
787
- IsLink.filter(
788
- **{
789
- "artifact_id": self._host.id,
790
- "feature_id": None,
791
- f"{field_name}__in": [l.id for _, l in registry_features_labels],
792
- }
793
- ).all().delete()
794
-
795
-
796
- def _add_values(
797
- self,
798
- values: dict[str, str | int | float | bool],
799
- feature_param_field: FieldAttr,
800
- str_as_ulabel: bool = True,
801
- ) -> None:
802
- """Curate artifact with features & values.
803
-
804
- Args:
805
- values: A dictionary of keys (features) & values (labels, numbers, booleans).
806
- feature_param_field: The field of a reference registry to map keys of the
807
- dictionary.
808
- """
809
- from .._tracked import get_current_tracked_run
810
-
811
- # rename to distinguish from the values inside the dict
812
- dictionary = values
813
- keys = dictionary.keys()
814
- if isinstance(keys, DICT_KEYS_TYPE):
815
- keys = list(keys) # type: ignore
816
- # deal with other cases later
817
- assert all(isinstance(key, str) for key in keys) # noqa: S101
818
- registry = feature_param_field.field.model
819
- value_model = FeatureValue
820
- model_name = "Feature"
821
- records = registry.from_values(keys, field=feature_param_field, mute=True)
822
- if len(records) != len(keys):
823
- not_validated_keys = [key for key in keys if key not in records.list("name")]
824
- not_validated_keys_dtype_message = [
825
- (key, infer_feature_type_convert_json(key, dictionary[key]))
826
- for key in not_validated_keys
827
- ]
828
- run = get_current_tracked_run()
829
- if run is not None:
830
- name = f"{run.transform.type}[{run.transform.key}]"
831
- type_hint = f""" {model_name.lower()}_type = ln.{model_name}(name='{name}', is_type=True).save()"""
832
- elements = [type_hint]
833
- type_kwarg = f", type={model_name.lower()}_type"
834
- else:
835
- elements = []
836
- type_kwarg = ""
837
- elements += [
838
- f" ln.{model_name}(name='{key}', dtype='{dtype}'{type_kwarg}).save(){message}"
839
- for key, (dtype, _, message) in not_validated_keys_dtype_message
840
- ]
841
- hint = "\n".join(elements)
842
- msg = (
843
- f"These keys could not be validated: {not_validated_keys}\n"
844
- f"Here is how to create a {model_name.lower()}:\n\n{hint}"
845
- )
846
- raise ValidationError(msg)
847
-
848
- # figure out which of the values go where
849
- features_labels = defaultdict(list)
850
- _feature_values = []
851
- not_validated_values = []
852
- for feature in records:
853
- value = dictionary[feature.name]
854
- inferred_type, converted_value, _ = infer_feature_type_convert_json(
855
- feature.name,
856
- value,
857
- mute=True,
858
- str_as_ulabel=str_as_ulabel,
859
- )
860
- if feature.dtype == "num":
861
- if inferred_type not in {"int", "float"}:
862
- raise TypeError(
863
- f"Value for feature '{feature.name}' with type {feature.dtype} must be a number"
864
- )
865
- elif feature.dtype.startswith("cat"):
866
- if inferred_type != "?":
867
- if not (
868
- inferred_type.startswith("cat") or isinstance(value, SQLRecord)
869
- ):
870
- raise TypeError(
871
- f"Value for feature '{feature.name}' with type '{feature.dtype}' must be a string or record."
872
- )
873
- elif (feature.dtype == "str" and feature.dtype not in inferred_type) or (
874
- feature.dtype != "str" and feature.dtype != inferred_type
875
- ):
876
- raise ValidationError(
877
- f"Expected dtype for '{feature.name}' is '{feature.dtype}', got '{inferred_type}'"
878
- )
879
- if not feature.dtype.startswith("cat"):
880
- filter_kwargs = {model_name.lower(): feature, "value": converted_value}
881
- feature_value, _ = value_model.get_or_create(**filter_kwargs)
882
- _feature_values.append(feature_value)
883
- else:
884
- if isinstance(value, SQLRecord) or (
885
- isinstance(value, Iterable) and isinstance(next(iter(value)), SQLRecord)
886
- ):
887
- if isinstance(value, SQLRecord):
888
- label_records = [value]
889
- else:
890
- label_records = value # type: ignore
891
- for record in label_records:
892
- if record._state.adding:
893
- raise ValidationError(
894
- f"Please save {record} before annotation."
895
- )
896
- features_labels[record.__class__.__get_name_with_module__()].append(
897
- (feature, record)
898
- )
899
- else:
900
- if isinstance(value, str):
901
- values = [value] # type: ignore
902
- else:
903
- values = value # type: ignore
904
- if "ULabel" not in feature.dtype:
905
- feature.dtype += "[ULabel]"
906
- feature.save()
907
- validated = ULabel.validate(values, field=ULabel.name, mute=True)
908
- values_array = np.array(values)
909
- validated_values = values_array[validated]
910
- if validated.sum() != len(values):
911
- not_validated_values += values_array[~validated].tolist()
912
- label_records = ULabel.from_values(
913
- validated_values, field=ULabel.name, mute=True
914
- ) # type: ignore
915
- features_labels["ULabel"] += [
916
- (feature, label_record) for label_record in label_records
917
- ]
918
- if not_validated_values:
919
- not_validated_values.sort()
920
- hint = f" ulabels = ln.ULabel.from_values({not_validated_values}, create=True).save()\n"
921
- msg = (
922
- f"These values could not be validated: {not_validated_values}\n"
923
- f"Here is how to create ulabels for them:\n\n{hint}"
924
- )
925
- raise ValidationError(msg)
926
- # TODO: create an explicit version of this
927
- # if not is_param:
928
- # # check if _expect_many is false for _all_ records
929
- # if any(record._expect_many for record in records):
930
- # updated_features = []
931
- # for record in records:
932
- # if record._expect_many:
933
- # record._expect_many = False
934
- # record.save()
935
- # updated_features.append(record.name)
936
- # if any(updated_features):
937
- # logger.important(
938
- # f"changed observational unit to Artifact for features: {', '.join(updated_features)}"
939
- # )
940
- # bulk add all links
941
- if features_labels:
942
- add_label_feature_links(self, features_labels)
943
- if _feature_values:
944
- to_insert_feature_values = [
945
- record for record in _feature_values if record._state.adding
946
- ]
947
- if to_insert_feature_values:
948
- save(to_insert_feature_values)
949
- dict_typed_features = [
950
- getattr(record, model_name.lower())
951
- for record in _feature_values
952
- if getattr(record, model_name.lower()).dtype == "dict"
953
- ]
954
- IsLink = self._host._feature_values.through
955
- valuefield_id = "featurevalue_id"
956
- host_class_lower = self._host.__class__.__get_name_with_module__().lower()
957
- if dict_typed_features:
958
- # delete all previously existing anotations with dictionaries
959
- kwargs = {
960
- f"links_{host_class_lower}__{host_class_lower}_id": self._host.id,
961
- f"{model_name.lower()}__in": dict_typed_features,
962
- }
963
- try:
964
- value_model.filter(**kwargs).all().delete()
965
- except ProtectedError:
966
- pass
967
- # add new feature links
968
- links = [
969
- IsLink(
970
- **{
971
- f"{host_class_lower}_id": self._host.id,
972
- valuefield_id: feature_value.id,
973
- }
974
- )
975
- for feature_value in _feature_values
976
- ]
977
- # a link might already exist, to avoid raising a unique constraint
978
- # error, ignore_conflicts
979
- save(links, ignore_conflicts=True)
980
-
981
-
982
- def add_values_features(
983
- self,
984
- values: dict[str, str | int | float | bool],
985
- feature_field: FieldAttr = Feature.name,
986
- str_as_ulabel: bool = True,
987
- ) -> None:
988
- """Curate artifact with features & values.
989
-
990
- Args:
991
- values: A dictionary of keys (features) & values (labels, numbers, booleans).
992
- feature_field: The field of a reference registry to map keys of the
993
- dictionary.
994
- str_as_ulabel: Whether to interpret string values as ulabels.
995
- """
996
- _add_values(self, values, feature_field, str_as_ulabel=str_as_ulabel)
997
-
998
-
999
- def remove_values(
1000
- self,
1001
- feature: str | Feature,
1002
- *,
1003
- value: Any | None = None,
1004
- ):
1005
- """Remove value annotations for a given feature.
1006
-
1007
- Args:
1008
- feature: The feature for which to remove values.
1009
- value: An optional value to restrict removal to a single value.
1010
-
1011
- """
1012
- from .artifact import Artifact
1013
-
1014
- if isinstance(feature, str):
1015
- feature = Feature.get(name=feature)
1016
- filter_kwargs = {"feature": feature}
1017
- if feature.dtype.startswith("cat["): # type: ignore
1018
- feature_registry = feature.dtype.replace("cat[", "").replace("]", "") # type: ignore
1019
- if value is not None:
1020
- assert isinstance(value, SQLRecord) # noqa: S101
1021
- # the below uses our convention for field names in link models
1022
- link_name = (
1023
- feature_registry.split(".")[1]
1024
- if "." in feature_registry
1025
- else feature_registry
1026
- ).lower()
1027
- filter_kwargs[link_name] = value
1028
- if feature_registry == "ULabel":
1029
- link_attribute = "links_ulabel"
1030
- else:
1031
- link_models_on_models = {
1032
- getattr(
1033
- Artifact, obj.related_name
1034
- ).through.__get_name_with_module__(): obj.related_model.__get_name_with_module__()
1035
- for obj in Artifact._meta.related_objects
1036
- if obj.related_model.__get_name_with_module__() == feature_registry
1037
- }
1038
- link_attribute = {
1039
- obj.related_name
1040
- for obj in Artifact._meta.related_objects
1041
- if obj.related_model.__get_name_with_module__() in link_models_on_models
1042
- }.pop()
1043
- getattr(self._host, link_attribute).filter(**filter_kwargs).all().delete()
1044
- else:
1045
- if value is not None:
1046
- filter_kwargs["value"] = value
1047
- feature_values = self._host._feature_values.filter(**filter_kwargs)
1048
- self._host._feature_values.remove(*feature_values)
1049
- # this might leave a dangling feature_value record
1050
- # but we don't want to pay the price of making another query just to remove this annotation
1051
- # we can clean the FeatureValue registry periodically if we want to
1052
-
1053
-
1054
- def _add_schema(self, schema: Schema, slot: str) -> None:
1055
- """Annotate artifact with a schema.
1056
-
1057
- Args:
1058
- schema: `Schema` A schema record.
1059
- slot: `str` The slot that marks where the schema is stored in
1060
- the artifact.
1061
- """
1062
- # TODO: deprecate as soon as we have the Schema-based curators
1063
- if self._host._state.adding:
1064
- raise ValueError(
1065
- "Please save the artifact or collection before adding a feature set!"
1066
- )
1067
- host_db = self._host._state.db
1068
- schema.save(using=host_db)
1069
- kwargs = {
1070
- "artifact_id": self._host.id,
1071
- "schema": schema,
1072
- "slot": slot,
1073
- }
1074
- link_record = (
1075
- self._host.feature_sets.through.objects.using(host_db)
1076
- .filter(**kwargs)
1077
- .one_or_none()
1078
- )
1079
- if link_record is None:
1080
- self._host.feature_sets.through(**kwargs).save(using=host_db)
1081
- if slot in self.slots:
1082
- logger.debug(f"replaced existing {slot} feature set")
1083
- self._slots[slot] = schema # type: ignore
679
+ return registry.objects.filter(**new_expression)
1084
680
 
1085
681
 
682
+ # for deprecated functionality
1086
683
  def _unify_staged_feature_sets_by_hash(
1087
684
  feature_sets: MutableMapping[str, Schema],
1088
685
  ):
@@ -1098,114 +695,7 @@ def _unify_staged_feature_sets_by_hash(
1098
695
  return feature_sets
1099
696
 
1100
697
 
1101
- def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
1102
- """Transfer features from a artifact or collection."""
1103
- # This only covers feature sets
1104
- if transfer_logs is None:
1105
- transfer_logs = {"mapped": [], "transferred": [], "run": None}
1106
- from lamindb import settings
1107
-
1108
- using_key = settings._using_key
1109
- for slot, schema in data.features.slots.items(): # type: ignore
1110
- members = schema.members
1111
- if len(members) == 0:
1112
- continue
1113
- registry = members[0].__class__
1114
- # note here the features are transferred based on an unique field
1115
- field = REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), "uid")
1116
- # this will be e.g. be a list of ontology_ids or uids
1117
- member_uids = list(members.values_list(field, flat=True))
1118
- validated = registry.validate(member_uids, field=field, mute=True)
1119
- new_members_uids = list(compress(member_uids, ~validated))
1120
- new_members = members.filter(**{f"{field}__in": new_members_uids}).all()
1121
- n_new_members = len(new_members)
1122
- if n_new_members > 0:
1123
- # transfer foreign keys needs to be run before transfer to default db
1124
- transfer_fk_to_default_db_bulk(
1125
- new_members, using_key, transfer_logs=transfer_logs
1126
- )
1127
- for feature in new_members:
1128
- # not calling save=True here as in labels, because want to
1129
- # bulk save below
1130
- # transfer_fk is set to False because they are already transferred
1131
- # in the previous step transfer_fk_to_default_db_bulk
1132
- transfer_to_default_db(
1133
- feature, using_key, transfer_fk=False, transfer_logs=transfer_logs
1134
- )
1135
- logger.info(f"saving {n_new_members} new {registry.__name__} records")
1136
- save(
1137
- new_members, ignore_conflicts=True
1138
- ) # conflicts arising from existing records are ignored
1139
-
1140
- # create a new feature set from feature values using the same uid
1141
- schema_self = Schema.from_values(member_uids, field=getattr(registry, field))
1142
- if schema_self is None:
1143
- if hasattr(registry, "organism_id"):
1144
- logger.warning(
1145
- f"Schema is not transferred, check if organism is set correctly: {schema}"
1146
- )
1147
- continue
1148
- # make sure the uid matches if schema is composed of same features
1149
- if schema_self.hash == schema.hash:
1150
- schema_self.uid = schema.uid
1151
- logger.info(f"saving {slot} schema: {schema_self}")
1152
- self._host.features._add_schema(schema_self, slot)
1153
-
1154
-
1155
- def make_external(self, feature: Feature) -> None:
1156
- """Make a feature external, aka, remove feature from feature sets.
1157
-
1158
- Args:
1159
- feature: `Feature` A feature record.
1160
-
1161
- """
1162
- if not isinstance(feature, Feature):
1163
- raise TypeError("feature must be a Feature record!")
1164
- feature_sets = Schema.filter(features=feature).all()
1165
- for fs in feature_sets:
1166
- f = Feature.filter(uid=feature.uid).all()
1167
- features_updated = fs.members.difference(f)
1168
- if len(features_updated) > 0:
1169
- # re-compute the hash of feature sets based on the updated members
1170
- features_hash = hash_set({feature.uid for feature in features_updated})
1171
- fs.hash = features_hash
1172
- fs.n = len(features_updated)
1173
- fs.save()
1174
- # delete the link between the feature and the feature set
1175
- Schema.features.through.objects.filter(
1176
- feature_id=feature.id, schema_id=fs.id
1177
- ).delete()
1178
- # if no members are left in the schema, delete it
1179
- if len(features_updated) == 0:
1180
- logger.warning(f"deleting empty feature set: {fs}")
1181
- fs.artifacts.set([])
1182
- fs.delete()
1183
-
1184
-
1185
- @deprecated("_add_schema")
1186
- def add_schema(self, schema: Schema, slot: str) -> None:
1187
- return self._add_schema(schema, slot)
1188
-
1189
-
1190
- @deprecated("_add_schema")
1191
- def add_feature_set(self, schema: Schema, slot: str) -> None:
1192
- return self._add_schema(schema, slot)
1193
-
1194
-
1195
- @property
1196
- @deprecated("slots")
1197
- def _schema_by_slot(self):
1198
- return self.slots
1199
-
1200
-
1201
- @property
1202
- def _feature_set_by_slot(self):
1203
- return self.slots
1204
-
1205
-
1206
- # deprecated: feature set parsing
1207
-
1208
-
698
+ # for deprecated functionality
1209
699
  def parse_staged_feature_sets_from_anndata(
1210
700
  adata: AnnData,
1211
701
  var_field: FieldAttr | None = None,
@@ -1259,153 +749,632 @@ def parse_staged_feature_sets_from_anndata(
1259
749
  return feature_sets
1260
750
 
1261
751
 
1262
- # no longer called from within curator
1263
- # might deprecate in the future?
1264
- def _add_set_from_df(
1265
- self,
1266
- field: FieldAttr = Feature.name,
1267
- organism: str | None = None,
1268
- mute: bool = False,
1269
- ):
1270
- """Add feature set corresponding to column names of DataFrame."""
1271
- assert self._host.otype == "DataFrame" # noqa: S101
1272
- df = self._host.load(is_run_input=False)
1273
- schema = Schema.from_df(
1274
- df=df,
1275
- field=field,
1276
- mute=mute,
1277
- organism=organism,
1278
- )
1279
- self._host._staged_feature_sets = {"columns": schema}
1280
- self._host.save()
752
+ class FeatureManager:
753
+ """Feature manager."""
1281
754
 
755
+ def __init__(self, host: Artifact | Run):
756
+ self._host = host
757
+ self._slots: dict[str, Schema] | None = None
758
+ self._accessor_by_registry_ = None
1282
759
 
1283
- def _add_set_from_anndata(
1284
- self,
1285
- var_field: FieldAttr | None = None,
1286
- obs_field: FieldAttr | None = Feature.name,
1287
- uns_field: FieldAttr | None = None,
1288
- mute: bool = False,
1289
- organism: str | SQLRecord | None = None,
1290
- ):
1291
- """Add features from AnnData."""
1292
- assert self._host.otype == "AnnData" # noqa: S101
1293
-
1294
- # parse and register features
1295
- adata = self._host.load(is_run_input=False)
1296
- feature_sets = parse_staged_feature_sets_from_anndata(
1297
- adata,
1298
- var_field=var_field,
1299
- obs_field=obs_field,
1300
- uns_field=uns_field,
1301
- mute=mute,
1302
- organism=organism,
1303
- )
760
+ def __repr__(self) -> str:
761
+ return self.describe(return_str=True) # type: ignore
1304
762
 
1305
- # link feature sets
1306
- self._host._staged_feature_sets = feature_sets
1307
- self._host.save()
763
+ def describe(self, return_str: bool = False) -> str | None:
764
+ tree = describe_features(self._host) # type: ignore
765
+ return format_rich_tree(
766
+ tree, fallback="no linked features", return_str=return_str
767
+ )
1308
768
 
769
+ def get_values(self) -> dict[str, Any]:
770
+ """Get feature values as a dictionary."""
771
+ return describe_features(self._host, to_dict=True) # type: ignore
772
+
773
+ @deprecated("slots[slot].members")
774
+ def __getitem__(self, slot) -> QuerySet:
775
+ if slot not in self.slots:
776
+ raise ValueError(
777
+ f"No linked feature set for slot: {slot}\nDid you get validation"
778
+ " warnings? Only features that match registered features get validated"
779
+ " and linked."
780
+ )
781
+ schema = self.slots[slot]
782
+ orm_name = schema.itype
783
+ return getattr(schema, self._accessor_by_registry[orm_name]).all()
784
+
785
+ @property
786
+ def slots(self) -> dict[str, Schema]:
787
+ """Schema by slot.
788
+
789
+ Example::
790
+
791
+ artifact.features.slots
792
+ #> {'var': <Schema: var>, 'obs': <Schema: obs>}
793
+ """
794
+ if self._slots is None:
795
+ self._slots = get_schema_by_slot_(self._host)
796
+ return self._slots
797
+
798
+ @property
799
+ def _accessor_by_registry(self):
800
+ """Accessor by registry."""
801
+ if self._accessor_by_registry_ is None:
802
+ self._accessor_by_registry_ = get_accessor_by_registry_(self._host)
803
+ return self._accessor_by_registry_
804
+
805
+ def _add_label_feature_links(
806
+ self,
807
+ features_labels,
808
+ *,
809
+ label_ref_is_name: bool | None = None,
810
+ feature_ref_is_name: bool | None = None,
811
+ ):
812
+ if list(features_labels.keys()) != ["ULabel"]:
813
+ related_names = dict_related_model_to_related_name(self._host.__class__)
814
+ else:
815
+ related_names = {"ULabel": "ulabels"}
816
+ for class_name, registry_features_labels in features_labels.items():
817
+ related_name = related_names[class_name] # e.g., "ulabels"
818
+ IsLink = getattr(self._host, related_name).through
819
+ field_name = f"{get_link_attr(IsLink, self._host)}_id" # e.g., ulabel_id
820
+ links = [
821
+ IsLink(
822
+ **{
823
+ "artifact_id": self._host.id,
824
+ "feature_id": feature.id,
825
+ field_name: label.id,
826
+ "feature_ref_is_name": feature_ref_is_name,
827
+ "label_ref_is_name": label_ref_is_name,
828
+ }
829
+ )
830
+ for (feature, label) in registry_features_labels
831
+ ]
832
+ # a link might already exist
833
+ try:
834
+ save(links, ignore_conflicts=False)
835
+ except Exception:
836
+ save(links, ignore_conflicts=True)
837
+ # now delete links that were previously saved without a feature
838
+ IsLink.filter(
839
+ **{
840
+ "artifact_id": self._host.id,
841
+ "feature_id": None,
842
+ f"{field_name}__in": [l.id for _, l in registry_features_labels],
843
+ }
844
+ ).all().delete()
1309
845
 
1310
- def _add_set_from_mudata(
1311
- self,
1312
- var_fields: dict[str, FieldAttr] | None = None,
1313
- obs_fields: dict[str, FieldAttr] | None = None,
1314
- mute: bool = False,
1315
- organism: str | SQLRecord | None = None,
1316
- ):
1317
- """Add features from MuData."""
1318
- if obs_fields is None:
1319
- obs_fields = {}
1320
- assert self._host.otype == "MuData" # noqa: S101
846
+ def add_values(
847
+ self,
848
+ values: dict[str, str | int | float | bool],
849
+ feature_field: FieldAttr = Feature.name,
850
+ str_as_ulabel: bool = True,
851
+ ) -> None:
852
+ """Curate artifact with features & values.
853
+
854
+ Args:
855
+ values: A dictionary of keys (features) & values (labels, numbers, booleans).
856
+ feature_field: The field of a reference registry to map keys of the
857
+ dictionary.
858
+ str_as_ulabel: Whether to interpret string values as ulabels.
859
+ """
860
+ from .._tracked import get_current_tracked_run
861
+
862
+ # rename to distinguish from the values inside the dict
863
+ dictionary = values
864
+ keys = dictionary.keys()
865
+ if isinstance(keys, DICT_KEYS_TYPE):
866
+ keys = list(keys) # type: ignore
867
+ # deal with other cases later
868
+ assert all(isinstance(key, str) for key in keys) # noqa: S101
869
+ registry = feature_field.field.model
870
+ value_model = FeatureValue
871
+ model_name = "Feature"
872
+ records = registry.from_values(keys, field=feature_field, mute=True)
873
+ if len(records) != len(keys):
874
+ not_validated_keys = [
875
+ key for key in keys if key not in records.list("name")
876
+ ]
877
+ not_validated_keys_dtype_message = [
878
+ (key, infer_feature_type_convert_json(key, dictionary[key]))
879
+ for key in not_validated_keys
880
+ ]
881
+ run = get_current_tracked_run()
882
+ if run is not None:
883
+ name = f"{run.transform.type}[{run.transform.key}]"
884
+ type_hint = f""" {model_name.lower()}_type = ln.{model_name}(name='{name}', is_type=True).save()"""
885
+ elements = [type_hint]
886
+ type_kwarg = f", type={model_name.lower()}_type"
887
+ else:
888
+ elements = []
889
+ type_kwarg = ""
890
+ elements += [
891
+ f" ln.{model_name}(name='{key}', dtype='{dtype}'{type_kwarg}).save(){message}"
892
+ for key, (dtype, _, message) in not_validated_keys_dtype_message
893
+ ]
894
+ hint = "\n".join(elements)
895
+ msg = (
896
+ f"These keys could not be validated: {not_validated_keys}\n"
897
+ f"Here is how to create a {model_name.lower()}:\n\n{hint}"
898
+ )
899
+ raise ValidationError(msg)
900
+
901
+ # figure out which of the values go where
902
+ features_labels = defaultdict(list)
903
+ _feature_values = []
904
+ not_validated_values = []
905
+ for feature in records:
906
+ value = dictionary[feature.name]
907
+ inferred_type, converted_value, _ = infer_feature_type_convert_json(
908
+ feature.name,
909
+ value,
910
+ mute=True,
911
+ str_as_ulabel=str_as_ulabel,
912
+ )
913
+ if feature.dtype == "num":
914
+ if inferred_type not in {"int", "float"}:
915
+ raise TypeError(
916
+ f"Value for feature '{feature.name}' with type {feature.dtype} must be a number"
917
+ )
918
+ elif feature.dtype.startswith("cat"):
919
+ if inferred_type != "?":
920
+ if not (
921
+ inferred_type.startswith("cat") or isinstance(value, SQLRecord)
922
+ ):
923
+ raise TypeError(
924
+ f"Value for feature '{feature.name}' with type '{feature.dtype}' must be a string or record."
925
+ )
926
+ elif (feature.dtype == "str" and feature.dtype not in inferred_type) or (
927
+ feature.dtype != "str" and feature.dtype != inferred_type
928
+ ):
929
+ raise ValidationError(
930
+ f"Expected dtype for '{feature.name}' is '{feature.dtype}', got '{inferred_type}'"
931
+ )
932
+ if not feature.dtype.startswith("cat"):
933
+ filter_kwargs = {model_name.lower(): feature, "value": converted_value}
934
+ feature_value, _ = value_model.get_or_create(**filter_kwargs)
935
+ _feature_values.append(feature_value)
936
+ else:
937
+ if isinstance(value, SQLRecord) or (
938
+ isinstance(value, Iterable)
939
+ and isinstance(next(iter(value)), SQLRecord)
940
+ ):
941
+ if isinstance(value, SQLRecord):
942
+ label_records = [value]
943
+ else:
944
+ label_records = value # type: ignore
945
+ for record in label_records:
946
+ if record._state.adding:
947
+ raise ValidationError(
948
+ f"Please save {record} before annotation."
949
+ )
950
+ features_labels[
951
+ record.__class__.__get_name_with_module__()
952
+ ].append((feature, record))
953
+ else:
954
+ if isinstance(value, str):
955
+ values = [value] # type: ignore
956
+ else:
957
+ values = value # type: ignore
958
+ if "ULabel" not in feature.dtype:
959
+ feature.dtype += "[ULabel]"
960
+ feature.save()
961
+ validated = ULabel.validate(values, field=ULabel.name, mute=True)
962
+ values_array = np.array(values)
963
+ validated_values = values_array[validated]
964
+ if validated.sum() != len(values):
965
+ not_validated_values += values_array[~validated].tolist()
966
+ label_records = ULabel.from_values(
967
+ validated_values, field=ULabel.name, mute=True
968
+ ) # type: ignore
969
+ features_labels["ULabel"] += [
970
+ (feature, label_record) for label_record in label_records
971
+ ]
972
+ if not_validated_values:
973
+ not_validated_values.sort()
974
+ hint = f" ulabels = ln.ULabel.from_values({not_validated_values}, create=True).save()\n"
975
+ msg = (
976
+ f"These values could not be validated: {not_validated_values}\n"
977
+ f"Here is how to create ulabels for them:\n\n{hint}"
978
+ )
979
+ raise ValidationError(msg)
980
+ # TODO: create an explicit version of this
981
+ # if not is_param:
982
+ # # check if _expect_many is false for _all_ records
983
+ # if any(record._expect_many for record in records):
984
+ # updated_features = []
985
+ # for record in records:
986
+ # if record._expect_many:
987
+ # record._expect_many = False
988
+ # record.save()
989
+ # updated_features.append(record.name)
990
+ # if any(updated_features):
991
+ # logger.important(
992
+ # f"changed observational unit to Artifact for features: {', '.join(updated_features)}"
993
+ # )
994
+ # bulk add all links
995
+ if features_labels:
996
+ self._add_label_feature_links(features_labels)
997
+ if _feature_values:
998
+ to_insert_feature_values = [
999
+ record for record in _feature_values if record._state.adding
1000
+ ]
1001
+ if to_insert_feature_values:
1002
+ save(to_insert_feature_values)
1003
+ dict_typed_features = [
1004
+ getattr(record, model_name.lower())
1005
+ for record in _feature_values
1006
+ if getattr(record, model_name.lower()).dtype == "dict"
1007
+ ]
1008
+ IsLink = self._host._feature_values.through
1009
+ valuefield_id = "featurevalue_id"
1010
+ host_class_lower = self._host.__class__.__get_name_with_module__().lower()
1011
+ if dict_typed_features:
1012
+ # delete all previously existing anotations with dictionaries
1013
+ kwargs = {
1014
+ f"links_{host_class_lower}__{host_class_lower}_id": self._host.id,
1015
+ f"{model_name.lower()}__in": dict_typed_features,
1016
+ }
1017
+ try:
1018
+ value_model.filter(**kwargs).all().delete()
1019
+ except ProtectedError:
1020
+ pass
1021
+ # add new feature links
1022
+ links = [
1023
+ IsLink(
1024
+ **{
1025
+ f"{host_class_lower}_id": self._host.id,
1026
+ valuefield_id: feature_value.id,
1027
+ }
1028
+ )
1029
+ for feature_value in _feature_values
1030
+ ]
1031
+ # a link might already exist, to avoid raising a unique constraint
1032
+ # error, ignore_conflicts
1033
+ save(links, ignore_conflicts=True)
1321
1034
 
1322
- # parse and register features
1323
- mdata = self._host.load(is_run_input=False)
1324
- feature_sets = {}
1035
+ def remove_values(
1036
+ self,
1037
+ feature: str | Feature,
1038
+ *,
1039
+ value: Any | None = None,
1040
+ ):
1041
+ """Remove value annotations for a given feature.
1042
+
1043
+ Args:
1044
+ feature: The feature for which to remove values.
1045
+ value: An optional value to restrict removal to a single value.
1046
+
1047
+ """
1048
+ from .artifact import Artifact
1049
+
1050
+ if isinstance(feature, str):
1051
+ feature = Feature.get(name=feature)
1052
+ filter_kwargs = {"feature": feature}
1053
+ if feature.dtype.startswith("cat["): # type: ignore
1054
+ feature_registry = feature.dtype.replace("cat[", "").replace("]", "") # type: ignore
1055
+ if value is not None:
1056
+ assert isinstance(value, SQLRecord) # noqa: S101
1057
+ # the below uses our convention for field names in link models
1058
+ link_name = (
1059
+ feature_registry.split(".")[1]
1060
+ if "." in feature_registry
1061
+ else feature_registry
1062
+ ).lower()
1063
+ filter_kwargs[link_name] = value
1064
+ if feature_registry == "ULabel":
1065
+ link_attribute = "links_ulabel"
1066
+ else:
1067
+ link_models_on_models = {
1068
+ getattr(
1069
+ Artifact, obj.related_name
1070
+ ).through.__get_name_with_module__(): obj.related_model.__get_name_with_module__()
1071
+ for obj in Artifact._meta.related_objects
1072
+ if obj.related_model.__get_name_with_module__() == feature_registry
1073
+ }
1074
+ link_attribute = {
1075
+ obj.related_name
1076
+ for obj in Artifact._meta.related_objects
1077
+ if obj.related_model.__get_name_with_module__()
1078
+ in link_models_on_models
1079
+ }.pop()
1080
+ getattr(self._host, link_attribute).filter(**filter_kwargs).all().delete()
1081
+ else:
1082
+ if value is not None:
1083
+ filter_kwargs["value"] = value
1084
+ feature_values = self._host._feature_values.filter(**filter_kwargs)
1085
+ self._host._feature_values.remove(*feature_values)
1086
+ # this might leave a dangling feature_value record
1087
+ # but we don't want to pay the price of making another query just to remove this annotation
1088
+ # we can clean the FeatureValue registry periodically if we want to
1089
+
1090
+ def _add_schema(self, schema: Schema, slot: str) -> None:
1091
+ """Annotate artifact with a schema.
1092
+
1093
+ Args:
1094
+ schema: `Schema` A schema record.
1095
+ slot: `str` The slot that marks where the schema is stored in
1096
+ the artifact.
1097
+ """
1098
+ # TODO: deprecate as soon as we have the Schema-based curators
1099
+ if self._host._state.adding:
1100
+ raise ValueError(
1101
+ "Please save the artifact or collection before adding a feature set!"
1102
+ )
1103
+ host_db = self._host._state.db
1104
+ schema.save(using=host_db)
1105
+ kwargs = {
1106
+ "artifact_id": self._host.id,
1107
+ "schema": schema,
1108
+ "slot": slot,
1109
+ }
1110
+ link_record = (
1111
+ self._host.feature_sets.through.objects.using(host_db)
1112
+ .filter(**kwargs)
1113
+ .one_or_none()
1114
+ )
1115
+ if link_record is None:
1116
+ self._host.feature_sets.through(**kwargs).save(using=host_db)
1117
+ if slot in self.slots:
1118
+ logger.debug(f"replaced existing {slot} feature set")
1119
+ self._slots[slot] = schema # type: ignore
1120
+
1121
+ def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
1122
+ """Transfer features from a artifact or collection."""
1123
+ # This only covers feature sets
1124
+ if transfer_logs is None:
1125
+ transfer_logs = {"mapped": [], "transferred": [], "run": None}
1126
+ from lamindb import settings
1127
+
1128
+ using_key = settings._using_key
1129
+ for slot, schema in data.features.slots.items(): # type: ignore
1130
+ try:
1131
+ members = schema.members
1132
+ except ModuleWasntConfigured as err:
1133
+ logger.warning(f"skipping transfer of {slot} schema because {err}")
1134
+ continue
1135
+ if len(members) == 0:
1136
+ continue
1137
+ if len(members) > settings.annotation.n_max_records:
1138
+ logger.warning(
1139
+ f"skipping creating {len(members)} > {settings.annotation.n_max_records} new {members[0].__class__.__name__} records"
1140
+ )
1141
+ schema_self = schema
1142
+ schema_exists = Schema.filter(hash=schema_self.hash).one_or_none()
1143
+ if schema_exists is not None:
1144
+ schema_self = schema_exists
1145
+ else:
1146
+ schema_self.save()
1147
+ else:
1148
+ registry = members[0].__class__
1149
+ # note here the features are transferred based on an unique field
1150
+ field = REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), "uid")
1151
+ # this will be e.g. be a list of ontology_ids or uids
1152
+ member_uids = list(members.values_list(field, flat=True))
1153
+ validated = registry.validate(member_uids, field=field, mute=True)
1154
+ new_members_uids = list(compress(member_uids, ~validated))
1155
+ new_members = members.filter(**{f"{field}__in": new_members_uids}).all()
1156
+ n_new_members = len(new_members)
1157
+ if len(members) > settings.annotation.n_max_records:
1158
+ logger.warning(
1159
+ f"skipping creating {n_new_members} > {settings.annotation.n_max_records} new {registry.__name__} records"
1160
+ )
1161
+ if n_new_members > 0:
1162
+ # transfer foreign keys needs to be run before transfer to default db
1163
+ transfer_fk_to_default_db_bulk(
1164
+ new_members, using_key, transfer_logs=transfer_logs
1165
+ )
1166
+ for feature in new_members:
1167
+ # not calling save=True here as in labels, because want to
1168
+ # bulk save below
1169
+ # transfer_fk is set to False because they are already transferred
1170
+ # in the previous step transfer_fk_to_default_db_bulk
1171
+ transfer_to_default_db(
1172
+ feature,
1173
+ using_key,
1174
+ transfer_fk=False,
1175
+ transfer_logs=transfer_logs,
1176
+ )
1177
+ save(
1178
+ new_members, ignore_conflicts=True
1179
+ ) # conflicts arising from existing records are ignored
1325
1180
 
1326
- obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
1327
- if len(obs_features) > 0:
1328
- feature_sets["obs"] = Schema(features=obs_features)
1329
- for modality, field in var_fields.items():
1330
- modality_fs = parse_staged_feature_sets_from_anndata(
1331
- mdata[modality],
1332
- var_field=field,
1333
- obs_field=obs_fields.get(modality, Feature.name),
1181
+ # create a new feature set from feature values using the same uid
1182
+ schema_self = Schema.from_values(
1183
+ member_uids, field=getattr(registry, field)
1184
+ )
1185
+ if schema_self is None:
1186
+ if hasattr(registry, "organism_id"):
1187
+ logger.warning(
1188
+ f"Schema is not transferred, check if organism is set correctly: {schema}"
1189
+ )
1190
+ continue
1191
+ # make sure the uid matches if schema is composed of same features
1192
+ if schema_self.hash == schema.hash:
1193
+ schema_self.uid = schema.uid
1194
+ logger.info(f"saving {slot} schema: {schema_self}")
1195
+ try:
1196
+ self._host.features._add_schema(schema_self, slot)
1197
+ except IntegrityError:
1198
+ logger.warning(
1199
+ f"updating annotation of artifact {self._host.uid} with feature set for slot: {slot}"
1200
+ )
1201
+ self._host.feature_sets.through.objects.get(
1202
+ artifact_id=self._host.id, slot=slot
1203
+ ).delete()
1204
+ self._host.features._add_schema(schema_self, slot)
1205
+
1206
+ def make_external(self, feature: Feature) -> None:
1207
+ """Make a feature external, aka, remove feature from feature sets.
1208
+
1209
+ Args:
1210
+ feature: `Feature` A feature record.
1211
+
1212
+ """
1213
+ if not isinstance(feature, Feature):
1214
+ raise TypeError("feature must be a Feature record!")
1215
+ feature_sets = Schema.filter(features=feature).all()
1216
+ for fs in feature_sets:
1217
+ f = Feature.filter(uid=feature.uid).all()
1218
+ features_updated = fs.members.difference(f)
1219
+ if len(features_updated) > 0:
1220
+ # re-compute the hash of feature sets based on the updated members
1221
+ features_hash = hash_set({feature.uid for feature in features_updated})
1222
+ fs.hash = features_hash
1223
+ fs.n = len(features_updated)
1224
+ fs.save()
1225
+ # delete the link between the feature and the feature set
1226
+ Schema.features.through.objects.filter(
1227
+ feature_id=feature.id, schema_id=fs.id
1228
+ ).delete()
1229
+ # if no members are left in the schema, delete it
1230
+ if len(features_updated) == 0:
1231
+ logger.warning(f"deleting empty feature set: {fs}")
1232
+ fs.artifacts.set([])
1233
+ fs.delete()
1234
+
1235
+ @deprecated("_add_schema")
1236
+ def add_schema(self, schema: Schema, slot: str) -> None:
1237
+ return self._add_schema(schema, slot)
1238
+
1239
+ @deprecated("_add_schema")
1240
+ def add_feature_set(self, schema: Schema, slot: str) -> None:
1241
+ return self._add_schema(schema, slot)
1242
+
1243
+ @property
1244
+ @deprecated("slots")
1245
+ def _schema_by_slot(self):
1246
+ return self.slots
1247
+
1248
+ @property
1249
+ def _feature_set_by_slot(self):
1250
+ return self.slots
1251
+
1252
+ # no longer called from within curator
1253
+ # deprecated
1254
+ def _add_set_from_df(
1255
+ self,
1256
+ field: FieldAttr = Feature.name,
1257
+ organism: str | None = None,
1258
+ mute: bool = False,
1259
+ ):
1260
+ """Add feature set corresponding to column names of DataFrame."""
1261
+ assert self._host.otype == "DataFrame" # noqa: S101
1262
+ df = self._host.load(is_run_input=False)
1263
+ schema = Schema.from_df(
1264
+ df=df,
1265
+ field=field,
1334
1266
  mute=mute,
1335
1267
  organism=organism,
1336
1268
  )
1337
- for k, v in modality_fs.items():
1338
- feature_sets[f"['{modality}'].{k}"] = v
1269
+ self._host._staged_feature_sets = {"columns": schema}
1270
+ self._host.save()
1339
1271
 
1340
- # link feature sets
1341
- self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1342
- self._host.save()
1272
+ # deprecated
1273
+ def _add_set_from_anndata(
1274
+ self,
1275
+ var_field: FieldAttr | None = None,
1276
+ obs_field: FieldAttr | None = Feature.name,
1277
+ uns_field: FieldAttr | None = None,
1278
+ mute: bool = False,
1279
+ organism: str | SQLRecord | None = None,
1280
+ ):
1281
+ """Add features from AnnData."""
1282
+ assert self._host.otype == "AnnData" # noqa: S101
1283
+
1284
+ # parse and register features
1285
+ adata = self._host.load(is_run_input=False)
1286
+ feature_sets = parse_staged_feature_sets_from_anndata(
1287
+ adata,
1288
+ var_field=var_field,
1289
+ obs_field=obs_field,
1290
+ uns_field=uns_field,
1291
+ mute=mute,
1292
+ organism=organism,
1293
+ )
1343
1294
 
1295
+ # link feature sets
1296
+ self._host._staged_feature_sets = feature_sets
1297
+ self._host.save()
1344
1298
 
1345
- def _add_set_from_spatialdata(
1346
- self,
1347
- sample_metadata_key: str,
1348
- sample_metadata_field: FieldAttr = Feature.name,
1349
- var_fields: dict[str, FieldAttr] | None = None,
1350
- obs_fields: dict[str, FieldAttr] | None = None,
1351
- mute: bool = False,
1352
- organism: str | SQLRecord | None = None,
1353
- ):
1354
- """Add features from SpatialData."""
1355
- obs_fields, var_fields = obs_fields or {}, var_fields or {}
1356
- assert self._host.otype == "SpatialData" # noqa: S101
1299
+ # deprecated
1300
+ def _add_set_from_mudata(
1301
+ self,
1302
+ var_fields: dict[str, FieldAttr] | None = None,
1303
+ obs_fields: dict[str, FieldAttr] | None = None,
1304
+ mute: bool = False,
1305
+ organism: str | SQLRecord | None = None,
1306
+ ):
1307
+ """Add features from MuData."""
1308
+ if obs_fields is None:
1309
+ obs_fields = {}
1310
+ assert self._host.otype == "MuData" # noqa: S101
1311
+
1312
+ # parse and register features
1313
+ mdata = self._host.load(is_run_input=False)
1314
+ feature_sets = {}
1315
+
1316
+ obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
1317
+ if len(obs_features) > 0:
1318
+ feature_sets["obs"] = Schema(features=obs_features)
1319
+ for modality, field in var_fields.items():
1320
+ modality_fs = parse_staged_feature_sets_from_anndata(
1321
+ mdata[modality],
1322
+ var_field=field,
1323
+ obs_field=obs_fields.get(modality, Feature.name),
1324
+ mute=mute,
1325
+ organism=organism,
1326
+ )
1327
+ for k, v in modality_fs.items():
1328
+ feature_sets[f"['{modality}'].{k}"] = v
1357
1329
 
1358
- # parse and register features
1359
- sdata = self._host.load(is_run_input=False)
1360
- feature_sets = {}
1330
+ # link feature sets
1331
+ self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
1332
+ feature_sets
1333
+ )
1334
+ self._host.save()
1361
1335
 
1362
- # sample features
1363
- sample_features = Feature.from_values(
1364
- sdata.get_attrs(key=sample_metadata_key, return_as="df", flatten=True).columns,
1365
- field=sample_metadata_field,
1366
- ) # type: ignore
1367
- if len(sample_features) > 0:
1368
- feature_sets[sample_metadata_key] = Schema(features=sample_features)
1369
-
1370
- # table features
1371
- for table, field in var_fields.items():
1372
- table_fs = parse_staged_feature_sets_from_anndata(
1373
- sdata[table],
1374
- var_field=field,
1375
- obs_field=obs_fields.get(table, Feature.name),
1376
- mute=mute,
1377
- organism=organism,
1336
+ # deprecated
1337
+ def _add_set_from_spatialdata(
1338
+ self,
1339
+ sample_metadata_key: str,
1340
+ sample_metadata_field: FieldAttr = Feature.name,
1341
+ var_fields: dict[str, FieldAttr] | None = None,
1342
+ obs_fields: dict[str, FieldAttr] | None = None,
1343
+ mute: bool = False,
1344
+ organism: str | SQLRecord | None = None,
1345
+ ):
1346
+ """Add features from SpatialData."""
1347
+ obs_fields, var_fields = obs_fields or {}, var_fields or {}
1348
+ assert self._host.otype == "SpatialData" # noqa: S101
1349
+
1350
+ # parse and register features
1351
+ sdata = self._host.load(is_run_input=False)
1352
+ feature_sets = {}
1353
+
1354
+ # sample features
1355
+ sample_features = Feature.from_values(
1356
+ sdata.get_attrs(
1357
+ key=sample_metadata_key, return_as="df", flatten=True
1358
+ ).columns,
1359
+ field=sample_metadata_field,
1360
+ ) # type: ignore
1361
+ if len(sample_features) > 0:
1362
+ feature_sets[sample_metadata_key] = Schema(features=sample_features)
1363
+
1364
+ # table features
1365
+ for table, field in var_fields.items():
1366
+ table_fs = parse_staged_feature_sets_from_anndata(
1367
+ sdata[table],
1368
+ var_field=field,
1369
+ obs_field=obs_fields.get(table, Feature.name),
1370
+ mute=mute,
1371
+ organism=organism,
1372
+ )
1373
+ for k, v in table_fs.items():
1374
+ feature_sets[f"['{table}'].{k}"] = v
1375
+
1376
+ # link feature sets
1377
+ self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
1378
+ feature_sets
1378
1379
  )
1379
- for k, v in table_fs.items():
1380
- feature_sets[f"['{table}'].{k}"] = v
1381
-
1382
- # link feature sets
1383
- self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1384
- self._host.save()
1385
-
1386
-
1387
- # mypy: ignore-errors
1388
- FeatureManager.__init__ = __init__
1389
- FeatureManager.__repr__ = __repr__
1390
- FeatureManager.describe = describe
1391
- FeatureManager.__getitem__ = __getitem__
1392
- FeatureManager.get_values = get_values
1393
- FeatureManager.slots = slots
1394
- FeatureManager.add_values = add_values_features
1395
- FeatureManager._add_schema = _add_schema
1396
- FeatureManager._accessor_by_registry = _accessor_by_registry
1397
- FeatureManager._add_from = _add_from
1398
- FeatureManager.filter = filter
1399
- FeatureManager.get = get
1400
- FeatureManager.make_external = make_external
1401
- FeatureManager.remove_values = remove_values
1402
-
1403
- # deprecated
1404
- FeatureManager._add_set_from_df = _add_set_from_df
1405
- FeatureManager._add_set_from_anndata = _add_set_from_anndata
1406
- FeatureManager._add_set_from_mudata = _add_set_from_mudata
1407
- FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
1408
- FeatureManager.add_schema = add_schema
1409
- FeatureManager.add_feature_set = add_feature_set
1410
- FeatureManager._schema_by_slot = _schema_by_slot
1411
- FeatureManager._feature_set_by_slot = _feature_set_by_slot
1380
+ self._host.save()
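Taken together, this file moves from module-level functions monkey-patched onto FeatureManager (the deleted FeatureManager.add_values = add_values_features block) to a regular class whose methods carry the same behavior. A minimal annotation sketch against that API; the file path and feature name are placeholders and this is an illustration, not an excerpt from the package:

    import lamindb as ln

    ln.Feature(name="temperature", dtype="float").save()  # placeholder feature
    af = ln.Artifact("./measurements.parquet", key="measurements.parquet").save()  # placeholder file

    af.features.add_values({"temperature": 21.6})  # formerly module-level add_values_features
    af.features.get_values()                       # -> {"temperature": 21.6}
    af.features.slots                              # schema by slot, e.g. {"columns": <Schema>}
    af.features.describe()                         # rich tree; falls back to "no linked features"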