lamindb: 1.6.2-py3-none-any.whl → 1.7.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry and is provided for informational purposes only.
- lamindb/__init__.py +1 -3
- lamindb/_finish.py +32 -16
- lamindb/base/types.py +6 -4
- lamindb/core/_context.py +127 -57
- lamindb/core/_mapped_collection.py +1 -1
- lamindb/core/_settings.py +44 -4
- lamindb/core/_track_environment.py +5 -2
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_anndata_accessor.py +1 -1
- lamindb/core/storage/_tiledbsoma.py +14 -8
- lamindb/core/storage/_valid_suffixes.py +0 -1
- lamindb/core/storage/_zarr.py +1 -1
- lamindb/core/storage/objects.py +13 -8
- lamindb/core/storage/paths.py +9 -6
- lamindb/core/types.py +1 -1
- lamindb/curators/_legacy.py +2 -1
- lamindb/curators/core.py +106 -105
- lamindb/errors.py +9 -0
- lamindb/examples/fixtures/__init__.py +0 -0
- lamindb/examples/fixtures/sheets.py +224 -0
- lamindb/migrations/0103_remove_writelog_migration_state_and_more.py +1 -1
- lamindb/migrations/0105_record_unique_name.py +20 -0
- lamindb/migrations/0106_transfer_data_migration.py +25 -0
- lamindb/migrations/0107_add_schema_to_record.py +68 -0
- lamindb/migrations/0108_remove_record_sheet_remove_sheetproject_sheet_and_more.py +30 -0
- lamindb/migrations/0109_record_input_of_runs_alter_record_run_and_more.py +123 -0
- lamindb/migrations/0110_rename_values_artifacts_record_linked_artifacts.py +17 -0
- lamindb/migrations/0111_remove_record__sort_order.py +148 -0
- lamindb/migrations/0112_alter_recordartifact_feature_and_more.py +105 -0
- lamindb/migrations/0113_lower_case_branch_and_space_names.py +62 -0
- lamindb/migrations/0114_alter_run__status_code.py +24 -0
- lamindb/migrations/0115_alter_space_uid.py +52 -0
- lamindb/migrations/{0104_squashed.py → 0115_squashed.py} +261 -257
- lamindb/models/__init__.py +4 -3
- lamindb/models/_describe.py +88 -31
- lamindb/models/_feature_manager.py +627 -658
- lamindb/models/_label_manager.py +1 -3
- lamindb/models/artifact.py +214 -99
- lamindb/models/collection.py +7 -1
- lamindb/models/feature.py +288 -60
- lamindb/models/has_parents.py +3 -3
- lamindb/models/project.py +32 -15
- lamindb/models/query_manager.py +7 -1
- lamindb/models/query_set.py +118 -41
- lamindb/models/record.py +140 -94
- lamindb/models/run.py +42 -42
- lamindb/models/save.py +102 -16
- lamindb/models/schema.py +41 -8
- lamindb/models/sqlrecord.py +105 -40
- lamindb/models/storage.py +278 -0
- lamindb/models/transform.py +10 -2
- lamindb/models/ulabel.py +9 -1
- lamindb/py.typed +0 -0
- lamindb/setup/__init__.py +2 -1
- lamindb/setup/_switch.py +16 -0
- lamindb/setup/errors/__init__.py +4 -0
- lamindb/setup/types/__init__.py +4 -0
- {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/METADATA +5 -5
- {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/RECORD +61 -44
- lamindb/models/core.py +0 -135
- {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/LICENSE +0 -0
- {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/WHEEL +0 -0
lamindb/models/_feature_manager.py
@@ -14,9 +14,11 @@ from anndata import AnnData
 from django.contrib.postgres.aggregates import ArrayAgg
 from django.db import connections
 from django.db.models import Aggregate, ProtectedError, Subquery
+from django.db.utils import IntegrityError
 from lamin_utils import logger
 from lamindb_setup.core.hashing import hash_set
 from lamindb_setup.core.upath import create_path
+from lamindb_setup.errors import ModuleWasntConfigured
 from rich.table import Column, Table
 from rich.text import Text
 
@@ -31,6 +33,7 @@ from lamindb.models.save import save
 from lamindb.models.schema import DICT_KEYS_TYPE, Schema
 from lamindb.models.sqlrecord import (
     REGISTRY_UNIQUE_FIELD,
+    Registry,
     get_name_field,
     transfer_fk_to_default_db_bulk,
     transfer_to_default_db,
@@ -50,7 +53,6 @@ from ._relations import (
     dict_related_model_to_related_name,
 )
 from .feature import Feature, FeatureValue, parse_dtype
-from .run import FeatureManager, FeatureManagerRun, Run
 from .sqlrecord import SQLRecord
 from .ulabel import ULabel
 
@@ -65,11 +67,7 @@ if TYPE_CHECKING:
     )
     from lamindb.models.query_set import QuerySet
 
-
-class FeatureManagerArtifact(FeatureManager):
-    """Feature manager."""
-
-    pass
+    from .run import Run
 
 
 def get_accessor_by_registry_(host: Artifact | Collection) -> dict:
@@ -82,7 +80,7 @@ def get_accessor_by_registry_(host: Artifact | Collection) -> dict:
     return dictionary
 
 
-def get_schema_by_slot_(host: Artifact) -> dict:
+def get_schema_by_slot_(host: Artifact) -> dict[str, Schema]:
     # if the host is not yet saved
     if host._state.adding:
         if hasattr(host, "_staged_feature_sets"):
@@ -325,7 +323,7 @@ def describe_features(
             feature_data[feature_name] = (slot, registry_str)
     schema_data.update(
         {
-            slot: (schema, schema.n)
+            slot: (schema, schema.n)  # type: ignore
             for slot, schema in get_schema_by_slot_(self).items()
             if slot not in schema_data
         }
@@ -569,40 +567,9 @@ def infer_feature_type_convert_json(
         return "?", value, message
 
 
-def
-
-
-    self._accessor_by_registry_ = None
-
-
-def __repr__(self) -> str:
-    return describe(self, return_str=True)  # type: ignore
-
-
-def describe(self, return_str: bool = False) -> str | None:
-    tree = describe_features(self._host)  # type: ignore
-    return format_rich_tree(tree, fallback="no linked features", return_str=return_str)
-
-
-def get_values(self) -> dict[str, Any]:
-    """Get feature values as a dictionary."""
-    return describe_features(self._host, to_dict=True)  # type: ignore
-
-
-@deprecated("slots[slot].members")
-def __getitem__(self, slot) -> QuerySet:
-    if slot not in self.slots:
-        raise ValueError(
-            f"No linked feature set for slot: {slot}\nDid you get validation"
-            " warnings? Only features that match registered features get validated"
-            " and linked."
-        )
-    schema = self.slots[slot]
-    orm_name = schema.itype
-    return getattr(schema, self._accessor_by_registry[orm_name]).all()
-
-
-def filter_base(cls, _skip_validation: bool = True, **expression) -> QuerySet:
+def filter_base(
+    registry: Registry, _skip_validation: bool = True, **expression
+) -> QuerySet:
     from .artifact import Artifact
 
     model = Feature
@@ -629,7 +596,7 @@ def filter_base(cls, _skip_validation: bool = True, **expression) -> QuerySet:
             "list[cat"
         ):
             if comparator == "__isnull":
-                if
+                if registry is Artifact:
                     from .artifact import ArtifactFeatureValue
 
                     if value:  # True
@@ -659,7 +626,7 @@ def filter_base(cls, _skip_validation: bool = True, **expression) -> QuerySet:
         # categorical features
         elif isinstance(value, (str, SQLRecord, bool)):
             if comparator == "__isnull":
-                if
+                if registry is Artifact:
                     result = parse_dtype(feature.dtype)[0]
                     kwargs = {
                         f"links_{result['registry'].__name__.lower()}__feature": feature
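The three hunks above rework feature-based query dispatch: `filter_base` now receives the target registry directly instead of a manager `cls` and branches on `registry is Artifact`, returning `registry.objects.filter(...)`. A minimal sketch of the query pattern this serves, through the registry-level `filter()` that the deprecated `FeatureManager.filter`/`get` point to; the feature names and values are hypothetical:

```python
import lamindb as ln

# feature-based querying goes through the registry classmethod;
# "perturbation" and "temperature" stand in for registered ln.Feature records
artifacts = ln.Artifact.filter(perturbation="IFNG", temperature=21.6)
artifact = artifacts.one()  # if exactly one match is expected
```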
@@ -709,380 +676,10 @@ def filter_base(cls, _skip_validation: bool = True, **expression) -> QuerySet:
     # https://laminlabs.slack.com/archives/C04FPE8V01W/p1688328084810609
     if not (new_expression):
         raise NotImplementedError
-
-        return Artifact.objects.filter(**new_expression)
-    elif cls == FeatureManagerRun:
-        return Run.objects.filter(**new_expression)
-
-
-@classmethod  # type: ignore
-@deprecated("the filter() registry classmethod")
-def filter(cls, **expression) -> QuerySet:
-    """Query artifacts by features."""
-    return filter_base(cls, _skip_validation=False, **expression)
-
-
-@classmethod  # type: ignore
-@deprecated("the filter() registry classmethod")
-def get(cls, **expression) -> SQLRecord:
-    """Query a single artifact by feature."""
-    return filter_base(cls, _skip_validation=False, **expression).one()
-
-
-@property  # type: ignore
-def slots(self) -> dict[str, Schema]:
-    """Schema by slot.
-
-    Example::
-
-        artifact.features.slots
-        #> {'var': <Schema: var>, 'obs': <Schema: obs>}
-    """
-    if self._slots is None:
-        self._slots = get_schema_by_slot_(self._host)
-    return self._slots
-
-
-@property  # type: ignore
-def _accessor_by_registry(self):
-    """Accessor by ORM."""
-    if self._accessor_by_registry_ is None:
-        self._accessor_by_registry_ = get_accessor_by_registry_(self._host)
-    return self._accessor_by_registry_
-
-
-def add_label_feature_links(
-    self,
-    features_labels,
-    *,
-    label_ref_is_name: bool | None = None,
-    feature_ref_is_name: bool | None = None,
-):
-    if list(features_labels.keys()) != ["ULabel"]:
-        related_names = dict_related_model_to_related_name(self._host.__class__)
-    else:
-        related_names = {"ULabel": "ulabels"}
-    for class_name, registry_features_labels in features_labels.items():
-        related_name = related_names[class_name]  # e.g., "ulabels"
-        IsLink = getattr(self._host, related_name).through
-        field_name = f"{get_link_attr(IsLink, self._host)}_id"  # e.g., ulabel_id
-        links = [
-            IsLink(
-                **{
-                    "artifact_id": self._host.id,
-                    "feature_id": feature.id,
-                    field_name: label.id,
-                    "feature_ref_is_name": feature_ref_is_name,
-                    "label_ref_is_name": label_ref_is_name,
-                }
-            )
-            for (feature, label) in registry_features_labels
-        ]
-        # a link might already exist
-        try:
-            save(links, ignore_conflicts=False)
-        except Exception:
-            save(links, ignore_conflicts=True)
-        # now delete links that were previously saved without a feature
-        IsLink.filter(
-            **{
-                "artifact_id": self._host.id,
-                "feature_id": None,
-                f"{field_name}__in": [l.id for _, l in registry_features_labels],
-            }
-        ).all().delete()
-
-
-def _add_values(
-    self,
-    values: dict[str, str | int | float | bool],
-    feature_param_field: FieldAttr,
-    str_as_ulabel: bool = True,
-) -> None:
-    """Curate artifact with features & values.
-
-    Args:
-        values: A dictionary of keys (features) & values (labels, numbers, booleans).
-        feature_param_field: The field of a reference registry to map keys of the
-            dictionary.
-    """
-    from .._tracked import get_current_tracked_run
-
-    # rename to distinguish from the values inside the dict
-    dictionary = values
-    keys = dictionary.keys()
-    if isinstance(keys, DICT_KEYS_TYPE):
-        keys = list(keys)  # type: ignore
-    # deal with other cases later
-    assert all(isinstance(key, str) for key in keys)  # noqa: S101
-    registry = feature_param_field.field.model
-    value_model = FeatureValue
-    model_name = "Feature"
-    records = registry.from_values(keys, field=feature_param_field, mute=True)
-    if len(records) != len(keys):
-        not_validated_keys = [key for key in keys if key not in records.list("name")]
-        not_validated_keys_dtype_message = [
-            (key, infer_feature_type_convert_json(key, dictionary[key]))
-            for key in not_validated_keys
-        ]
-        run = get_current_tracked_run()
-        if run is not None:
-            name = f"{run.transform.type}[{run.transform.key}]"
-            type_hint = f""" {model_name.lower()}_type = ln.{model_name}(name='{name}', is_type=True).save()"""
-            elements = [type_hint]
-            type_kwarg = f", type={model_name.lower()}_type"
-        else:
-            elements = []
-            type_kwarg = ""
-        elements += [
-            f" ln.{model_name}(name='{key}', dtype='{dtype}'{type_kwarg}).save(){message}"
-            for key, (dtype, _, message) in not_validated_keys_dtype_message
-        ]
-        hint = "\n".join(elements)
-        msg = (
-            f"These keys could not be validated: {not_validated_keys}\n"
-            f"Here is how to create a {model_name.lower()}:\n\n{hint}"
-        )
-        raise ValidationError(msg)
-
-    # figure out which of the values go where
-    features_labels = defaultdict(list)
-    _feature_values = []
-    not_validated_values = []
-    for feature in records:
-        value = dictionary[feature.name]
-        inferred_type, converted_value, _ = infer_feature_type_convert_json(
-            feature.name,
-            value,
-            mute=True,
-            str_as_ulabel=str_as_ulabel,
-        )
-        if feature.dtype == "num":
-            if inferred_type not in {"int", "float"}:
-                raise TypeError(
-                    f"Value for feature '{feature.name}' with type {feature.dtype} must be a number"
-                )
-        elif feature.dtype.startswith("cat"):
-            if inferred_type != "?":
-                if not (
-                    inferred_type.startswith("cat") or isinstance(value, SQLRecord)
-                ):
-                    raise TypeError(
-                        f"Value for feature '{feature.name}' with type '{feature.dtype}' must be a string or record."
-                    )
-        elif (feature.dtype == "str" and feature.dtype not in inferred_type) or (
-            feature.dtype != "str" and feature.dtype != inferred_type
-        ):
-            raise ValidationError(
-                f"Expected dtype for '{feature.name}' is '{feature.dtype}', got '{inferred_type}'"
-            )
-        if not feature.dtype.startswith("cat"):
-            filter_kwargs = {model_name.lower(): feature, "value": converted_value}
-            feature_value, _ = value_model.get_or_create(**filter_kwargs)
-            _feature_values.append(feature_value)
-        else:
-            if isinstance(value, SQLRecord) or (
-                isinstance(value, Iterable) and isinstance(next(iter(value)), SQLRecord)
-            ):
-                if isinstance(value, SQLRecord):
-                    label_records = [value]
-                else:
-                    label_records = value  # type: ignore
-                for record in label_records:
-                    if record._state.adding:
-                        raise ValidationError(
-                            f"Please save {record} before annotation."
-                        )
-                    features_labels[record.__class__.__get_name_with_module__()].append(
-                        (feature, record)
-                    )
-            else:
-                if isinstance(value, str):
-                    values = [value]  # type: ignore
-                else:
-                    values = value  # type: ignore
-                if "ULabel" not in feature.dtype:
-                    feature.dtype += "[ULabel]"
-                    feature.save()
-                validated = ULabel.validate(values, field=ULabel.name, mute=True)
-                values_array = np.array(values)
-                validated_values = values_array[validated]
-                if validated.sum() != len(values):
-                    not_validated_values += values_array[~validated].tolist()
-                label_records = ULabel.from_values(
-                    validated_values, field=ULabel.name, mute=True
-                )  # type: ignore
-                features_labels["ULabel"] += [
-                    (feature, label_record) for label_record in label_records
-                ]
-    if not_validated_values:
-        not_validated_values.sort()
-        hint = f" ulabels = ln.ULabel.from_values({not_validated_values}, create=True).save()\n"
-        msg = (
-            f"These values could not be validated: {not_validated_values}\n"
-            f"Here is how to create ulabels for them:\n\n{hint}"
-        )
-        raise ValidationError(msg)
-    # TODO: create an explicit version of this
-    # if not is_param:
-    #     # check if _expect_many is false for _all_ records
-    #     if any(record._expect_many for record in records):
-    #         updated_features = []
-    #         for record in records:
-    #             if record._expect_many:
-    #                 record._expect_many = False
-    #                 record.save()
-    #                 updated_features.append(record.name)
-    #         if any(updated_features):
-    #             logger.important(
-    #                 f"changed observational unit to Artifact for features: {', '.join(updated_features)}"
-    #             )
-    # bulk add all links
-    if features_labels:
-        add_label_feature_links(self, features_labels)
-    if _feature_values:
-        to_insert_feature_values = [
-            record for record in _feature_values if record._state.adding
-        ]
-        if to_insert_feature_values:
-            save(to_insert_feature_values)
-        dict_typed_features = [
-            getattr(record, model_name.lower())
-            for record in _feature_values
-            if getattr(record, model_name.lower()).dtype == "dict"
-        ]
-        IsLink = self._host._feature_values.through
-        valuefield_id = "featurevalue_id"
-        host_class_lower = self._host.__class__.__get_name_with_module__().lower()
-        if dict_typed_features:
-            # delete all previously existing anotations with dictionaries
-            kwargs = {
-                f"links_{host_class_lower}__{host_class_lower}_id": self._host.id,
-                f"{model_name.lower()}__in": dict_typed_features,
-            }
-            try:
-                value_model.filter(**kwargs).all().delete()
-            except ProtectedError:
-                pass
-        # add new feature links
-        links = [
-            IsLink(
-                **{
-                    f"{host_class_lower}_id": self._host.id,
-                    valuefield_id: feature_value.id,
-                }
-            )
-            for feature_value in _feature_values
-        ]
-        # a link might already exist, to avoid raising a unique constraint
-        # error, ignore_conflicts
-        save(links, ignore_conflicts=True)
-
-
-def add_values_features(
-    self,
-    values: dict[str, str | int | float | bool],
-    feature_field: FieldAttr = Feature.name,
-    str_as_ulabel: bool = True,
-) -> None:
-    """Curate artifact with features & values.
-
-    Args:
-        values: A dictionary of keys (features) & values (labels, numbers, booleans).
-        feature_field: The field of a reference registry to map keys of the
-            dictionary.
-        str_as_ulabel: Whether to interpret string values as ulabels.
-    """
-    _add_values(self, values, feature_field, str_as_ulabel=str_as_ulabel)
-
-
-def remove_values(
-    self,
-    feature: str | Feature,
-    *,
-    value: Any | None = None,
-):
-    """Remove value annotations for a given feature.
-
-    Args:
-        feature: The feature for which to remove values.
-        value: An optional value to restrict removal to a single value.
-
-    """
-    from .artifact import Artifact
-
-    if isinstance(feature, str):
-        feature = Feature.get(name=feature)
-    filter_kwargs = {"feature": feature}
-    if feature.dtype.startswith("cat["):  # type: ignore
-        feature_registry = feature.dtype.replace("cat[", "").replace("]", "")  # type: ignore
-        if value is not None:
-            assert isinstance(value, SQLRecord)  # noqa: S101
-            # the below uses our convention for field names in link models
-            link_name = (
-                feature_registry.split(".")[1]
-                if "." in feature_registry
-                else feature_registry
-            ).lower()
-            filter_kwargs[link_name] = value
-        if feature_registry == "ULabel":
-            link_attribute = "links_ulabel"
-        else:
-            link_models_on_models = {
-                getattr(
-                    Artifact, obj.related_name
-                ).through.__get_name_with_module__(): obj.related_model.__get_name_with_module__()
-                for obj in Artifact._meta.related_objects
-                if obj.related_model.__get_name_with_module__() == feature_registry
-            }
-            link_attribute = {
-                obj.related_name
-                for obj in Artifact._meta.related_objects
-                if obj.related_model.__get_name_with_module__() in link_models_on_models
-            }.pop()
-        getattr(self._host, link_attribute).filter(**filter_kwargs).all().delete()
-    else:
-        if value is not None:
-            filter_kwargs["value"] = value
-        feature_values = self._host._feature_values.filter(**filter_kwargs)
-        self._host._feature_values.remove(*feature_values)
-        # this might leave a dangling feature_value record
-        # but we don't want to pay the price of making another query just to remove this annotation
-        # we can clean the FeatureValue registry periodically if we want to
-
-
-def _add_schema(self, schema: Schema, slot: str) -> None:
-    """Annotate artifact with a schema.
-
-    Args:
-        schema: `Schema` A schema record.
-        slot: `str` The slot that marks where the schema is stored in
-            the artifact.
-    """
-    # TODO: deprecate as soon as we have the Schema-based curators
-    if self._host._state.adding:
-        raise ValueError(
-            "Please save the artifact or collection before adding a feature set!"
-        )
-    host_db = self._host._state.db
-    schema.save(using=host_db)
-    kwargs = {
-        "artifact_id": self._host.id,
-        "schema": schema,
-        "slot": slot,
-    }
-    link_record = (
-        self._host.feature_sets.through.objects.using(host_db)
-        .filter(**kwargs)
-        .one_or_none()
-    )
-    if link_record is None:
-        self._host.feature_sets.through(**kwargs).save(using=host_db)
-        if slot in self.slots:
-            logger.debug(f"replaced existing {slot} feature set")
-        self._slots[slot] = schema  # type: ignore
+    return registry.objects.filter(**new_expression)
 
 
+# for deprecated functionality
 def _unify_staged_feature_sets_by_hash(
     feature_sets: MutableMapping[str, Schema],
 ):
@@ -1098,114 +695,7 @@ def _unify_staged_feature_sets_by_hash(
     return feature_sets
 
 
-
-    """Transfer features from a artifact or collection."""
-    # This only covers feature sets
-    if transfer_logs is None:
-        transfer_logs = {"mapped": [], "transferred": [], "run": None}
-    from lamindb import settings
-
-    using_key = settings._using_key
-    for slot, schema in data.features.slots.items():  # type: ignore
-        members = schema.members
-        if len(members) == 0:
-            continue
-        registry = members[0].__class__
-        # note here the features are transferred based on an unique field
-        field = REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), "uid")
-        # this will be e.g. be a list of ontology_ids or uids
-        member_uids = list(members.values_list(field, flat=True))
-        validated = registry.validate(member_uids, field=field, mute=True)
-        new_members_uids = list(compress(member_uids, ~validated))
-        new_members = members.filter(**{f"{field}__in": new_members_uids}).all()
-        n_new_members = len(new_members)
-        if n_new_members > 0:
-            # transfer foreign keys needs to be run before transfer to default db
-            transfer_fk_to_default_db_bulk(
-                new_members, using_key, transfer_logs=transfer_logs
-            )
-            for feature in new_members:
-                # not calling save=True here as in labels, because want to
-                # bulk save below
-                # transfer_fk is set to False because they are already transferred
-                # in the previous step transfer_fk_to_default_db_bulk
-                transfer_to_default_db(
-                    feature, using_key, transfer_fk=False, transfer_logs=transfer_logs
-                )
-            logger.info(f"saving {n_new_members} new {registry.__name__} records")
-            save(
-                new_members, ignore_conflicts=True
-            )  # conflicts arising from existing records are ignored
-
-        # create a new feature set from feature values using the same uid
-        schema_self = Schema.from_values(member_uids, field=getattr(registry, field))
-        if schema_self is None:
-            if hasattr(registry, "organism_id"):
-                logger.warning(
-                    f"Schema is not transferred, check if organism is set correctly: {schema}"
-                )
-            continue
-        # make sure the uid matches if schema is composed of same features
-        if schema_self.hash == schema.hash:
-            schema_self.uid = schema.uid
-        logger.info(f"saving {slot} schema: {schema_self}")
-        self._host.features._add_schema(schema_self, slot)
-
-
-def make_external(self, feature: Feature) -> None:
-    """Make a feature external, aka, remove feature from feature sets.
-
-    Args:
-        feature: `Feature` A feature record.
-
-    """
-    if not isinstance(feature, Feature):
-        raise TypeError("feature must be a Feature record!")
-    feature_sets = Schema.filter(features=feature).all()
-    for fs in feature_sets:
-        f = Feature.filter(uid=feature.uid).all()
-        features_updated = fs.members.difference(f)
-        if len(features_updated) > 0:
-            # re-compute the hash of feature sets based on the updated members
-            features_hash = hash_set({feature.uid for feature in features_updated})
-            fs.hash = features_hash
-            fs.n = len(features_updated)
-            fs.save()
-        # delete the link between the feature and the feature set
-        Schema.features.through.objects.filter(
-            feature_id=feature.id, schema_id=fs.id
-        ).delete()
-        # if no members are left in the schema, delete it
-        if len(features_updated) == 0:
-            logger.warning(f"deleting empty feature set: {fs}")
-            fs.artifacts.set([])
-            fs.delete()
-
-
-@deprecated("_add_schema")
-def add_schema(self, schema: Schema, slot: str) -> None:
-    return self._add_schema(schema, slot)
-
-
-@deprecated("_add_schema")
-def add_feature_set(self, schema: Schema, slot: str) -> None:
-    return self._add_schema(schema, slot)
-
-
-@property
-@deprecated("slots")
-def _schema_by_slot(self):
-    return self.slots
-
-
-@property
-def _feature_set_by_slot(self):
-    return self.slots
-
-
-# deprecated: feature set parsing
-
-
+# for deprecated functionality
 def parse_staged_feature_sets_from_anndata(
     adata: AnnData,
     var_field: FieldAttr | None = None,
@@ -1259,153 +749,632 @@ def parse_staged_feature_sets_from_anndata(
     return feature_sets
 
 
-
-
-def _add_set_from_df(
-    self,
-    field: FieldAttr = Feature.name,
-    organism: str | None = None,
-    mute: bool = False,
-):
-    """Add feature set corresponding to column names of DataFrame."""
-    assert self._host.otype == "DataFrame"  # noqa: S101
-    df = self._host.load(is_run_input=False)
-    schema = Schema.from_df(
-        df=df,
-        field=field,
-        mute=mute,
-        organism=organism,
-    )
-    self._host._staged_feature_sets = {"columns": schema}
-    self._host.save()
+class FeatureManager:
+    """Feature manager."""
 
+    def __init__(self, host: Artifact | Run):
+        self._host = host
+        self._slots: dict[str, Schema] | None = None
+        self._accessor_by_registry_ = None
 
-def
-
-    var_field: FieldAttr | None = None,
-    obs_field: FieldAttr | None = Feature.name,
-    uns_field: FieldAttr | None = None,
-    mute: bool = False,
-    organism: str | SQLRecord | None = None,
-):
-    """Add features from AnnData."""
-    assert self._host.otype == "AnnData"  # noqa: S101
-
-    # parse and register features
-    adata = self._host.load(is_run_input=False)
-    feature_sets = parse_staged_feature_sets_from_anndata(
-        adata,
-        var_field=var_field,
-        obs_field=obs_field,
-        uns_field=uns_field,
-        mute=mute,
-        organism=organism,
-    )
+    def __repr__(self) -> str:
+        return self.describe(return_str=True)  # type: ignore
 
-
-
-
+    def describe(self, return_str: bool = False) -> str | None:
+        tree = describe_features(self._host)  # type: ignore
+        return format_rich_tree(
+            tree, fallback="no linked features", return_str=return_str
+        )
 
+    def get_values(self) -> dict[str, Any]:
+        """Get feature values as a dictionary."""
+        return describe_features(self._host, to_dict=True)  # type: ignore
+
+    @deprecated("slots[slot].members")
+    def __getitem__(self, slot) -> QuerySet:
+        if slot not in self.slots:
+            raise ValueError(
+                f"No linked feature set for slot: {slot}\nDid you get validation"
+                " warnings? Only features that match registered features get validated"
+                " and linked."
+            )
+        schema = self.slots[slot]
+        orm_name = schema.itype
+        return getattr(schema, self._accessor_by_registry[orm_name]).all()
+
+    @property
+    def slots(self) -> dict[str, Schema]:
+        """Schema by slot.
+
+        Example::
+
+            artifact.features.slots
+            #> {'var': <Schema: var>, 'obs': <Schema: obs>}
+        """
+        if self._slots is None:
+            self._slots = get_schema_by_slot_(self._host)
+        return self._slots
+
+    @property
+    def _accessor_by_registry(self):
+        """Accessor by registry."""
+        if self._accessor_by_registry_ is None:
+            self._accessor_by_registry_ = get_accessor_by_registry_(self._host)
+        return self._accessor_by_registry_
+
+    def _add_label_feature_links(
+        self,
+        features_labels,
+        *,
+        label_ref_is_name: bool | None = None,
+        feature_ref_is_name: bool | None = None,
+    ):
+        if list(features_labels.keys()) != ["ULabel"]:
+            related_names = dict_related_model_to_related_name(self._host.__class__)
+        else:
+            related_names = {"ULabel": "ulabels"}
+        for class_name, registry_features_labels in features_labels.items():
+            related_name = related_names[class_name]  # e.g., "ulabels"
+            IsLink = getattr(self._host, related_name).through
+            field_name = f"{get_link_attr(IsLink, self._host)}_id"  # e.g., ulabel_id
+            links = [
+                IsLink(
+                    **{
+                        "artifact_id": self._host.id,
+                        "feature_id": feature.id,
+                        field_name: label.id,
+                        "feature_ref_is_name": feature_ref_is_name,
+                        "label_ref_is_name": label_ref_is_name,
+                    }
+                )
+                for (feature, label) in registry_features_labels
+            ]
+            # a link might already exist
+            try:
+                save(links, ignore_conflicts=False)
+            except Exception:
+                save(links, ignore_conflicts=True)
+            # now delete links that were previously saved without a feature
+            IsLink.filter(
+                **{
+                    "artifact_id": self._host.id,
+                    "feature_id": None,
+                    f"{field_name}__in": [l.id for _, l in registry_features_labels],
+                }
+            ).all().delete()
 
-def
-
-
-
-
-
-
-
-
-
+    def add_values(
+        self,
+        values: dict[str, str | int | float | bool],
+        feature_field: FieldAttr = Feature.name,
+        str_as_ulabel: bool = True,
+    ) -> None:
+        """Curate artifact with features & values.
+
+        Args:
+            values: A dictionary of keys (features) & values (labels, numbers, booleans).
+            feature_field: The field of a reference registry to map keys of the
+                dictionary.
+            str_as_ulabel: Whether to interpret string values as ulabels.
+        """
+        from .._tracked import get_current_tracked_run
+
+        # rename to distinguish from the values inside the dict
+        dictionary = values
+        keys = dictionary.keys()
+        if isinstance(keys, DICT_KEYS_TYPE):
+            keys = list(keys)  # type: ignore
+        # deal with other cases later
+        assert all(isinstance(key, str) for key in keys)  # noqa: S101
+        registry = feature_field.field.model
+        value_model = FeatureValue
+        model_name = "Feature"
+        records = registry.from_values(keys, field=feature_field, mute=True)
+        if len(records) != len(keys):
+            not_validated_keys = [
+                key for key in keys if key not in records.list("name")
+            ]
+            not_validated_keys_dtype_message = [
+                (key, infer_feature_type_convert_json(key, dictionary[key]))
+                for key in not_validated_keys
+            ]
+            run = get_current_tracked_run()
+            if run is not None:
+                name = f"{run.transform.type}[{run.transform.key}]"
+                type_hint = f""" {model_name.lower()}_type = ln.{model_name}(name='{name}', is_type=True).save()"""
+                elements = [type_hint]
+                type_kwarg = f", type={model_name.lower()}_type"
+            else:
+                elements = []
+                type_kwarg = ""
+            elements += [
+                f" ln.{model_name}(name='{key}', dtype='{dtype}'{type_kwarg}).save(){message}"
+                for key, (dtype, _, message) in not_validated_keys_dtype_message
+            ]
+            hint = "\n".join(elements)
+            msg = (
+                f"These keys could not be validated: {not_validated_keys}\n"
+                f"Here is how to create a {model_name.lower()}:\n\n{hint}"
+            )
+            raise ValidationError(msg)
+
+        # figure out which of the values go where
+        features_labels = defaultdict(list)
+        _feature_values = []
+        not_validated_values = []
+        for feature in records:
+            value = dictionary[feature.name]
+            inferred_type, converted_value, _ = infer_feature_type_convert_json(
+                feature.name,
+                value,
+                mute=True,
+                str_as_ulabel=str_as_ulabel,
+            )
+            if feature.dtype == "num":
+                if inferred_type not in {"int", "float"}:
+                    raise TypeError(
+                        f"Value for feature '{feature.name}' with type {feature.dtype} must be a number"
+                    )
+            elif feature.dtype.startswith("cat"):
+                if inferred_type != "?":
+                    if not (
+                        inferred_type.startswith("cat") or isinstance(value, SQLRecord)
+                    ):
+                        raise TypeError(
+                            f"Value for feature '{feature.name}' with type '{feature.dtype}' must be a string or record."
+                        )
+            elif (feature.dtype == "str" and feature.dtype not in inferred_type) or (
+                feature.dtype != "str" and feature.dtype != inferred_type
+            ):
+                raise ValidationError(
+                    f"Expected dtype for '{feature.name}' is '{feature.dtype}', got '{inferred_type}'"
+                )
+            if not feature.dtype.startswith("cat"):
+                filter_kwargs = {model_name.lower(): feature, "value": converted_value}
+                feature_value, _ = value_model.get_or_create(**filter_kwargs)
+                _feature_values.append(feature_value)
+            else:
+                if isinstance(value, SQLRecord) or (
+                    isinstance(value, Iterable)
+                    and isinstance(next(iter(value)), SQLRecord)
+                ):
+                    if isinstance(value, SQLRecord):
+                        label_records = [value]
+                    else:
+                        label_records = value  # type: ignore
+                    for record in label_records:
+                        if record._state.adding:
+                            raise ValidationError(
+                                f"Please save {record} before annotation."
+                            )
+                        features_labels[
+                            record.__class__.__get_name_with_module__()
+                        ].append((feature, record))
+                else:
+                    if isinstance(value, str):
+                        values = [value]  # type: ignore
+                    else:
+                        values = value  # type: ignore
+                    if "ULabel" not in feature.dtype:
+                        feature.dtype += "[ULabel]"
+                        feature.save()
+                    validated = ULabel.validate(values, field=ULabel.name, mute=True)
+                    values_array = np.array(values)
+                    validated_values = values_array[validated]
+                    if validated.sum() != len(values):
+                        not_validated_values += values_array[~validated].tolist()
+                    label_records = ULabel.from_values(
+                        validated_values, field=ULabel.name, mute=True
+                    )  # type: ignore
+                    features_labels["ULabel"] += [
+                        (feature, label_record) for label_record in label_records
+                    ]
+        if not_validated_values:
+            not_validated_values.sort()
+            hint = f" ulabels = ln.ULabel.from_values({not_validated_values}, create=True).save()\n"
+            msg = (
+                f"These values could not be validated: {not_validated_values}\n"
+                f"Here is how to create ulabels for them:\n\n{hint}"
+            )
+            raise ValidationError(msg)
+        # TODO: create an explicit version of this
+        # if not is_param:
+        #     # check if _expect_many is false for _all_ records
+        #     if any(record._expect_many for record in records):
+        #         updated_features = []
+        #         for record in records:
+        #             if record._expect_many:
+        #                 record._expect_many = False
+        #                 record.save()
+        #                 updated_features.append(record.name)
+        #         if any(updated_features):
+        #             logger.important(
+        #                 f"changed observational unit to Artifact for features: {', '.join(updated_features)}"
+        #             )
+        # bulk add all links
+        if features_labels:
+            self._add_label_feature_links(features_labels)
+        if _feature_values:
+            to_insert_feature_values = [
+                record for record in _feature_values if record._state.adding
+            ]
+            if to_insert_feature_values:
+                save(to_insert_feature_values)
+            dict_typed_features = [
+                getattr(record, model_name.lower())
+                for record in _feature_values
+                if getattr(record, model_name.lower()).dtype == "dict"
+            ]
+            IsLink = self._host._feature_values.through
+            valuefield_id = "featurevalue_id"
+            host_class_lower = self._host.__class__.__get_name_with_module__().lower()
+            if dict_typed_features:
+                # delete all previously existing anotations with dictionaries
+                kwargs = {
+                    f"links_{host_class_lower}__{host_class_lower}_id": self._host.id,
+                    f"{model_name.lower()}__in": dict_typed_features,
+                }
+                try:
+                    value_model.filter(**kwargs).all().delete()
+                except ProtectedError:
+                    pass
+            # add new feature links
+            links = [
+                IsLink(
+                    **{
+                        f"{host_class_lower}_id": self._host.id,
+                        valuefield_id: feature_value.id,
+                    }
+                )
+                for feature_value in _feature_values
+            ]
+            # a link might already exist, to avoid raising a unique constraint
+            # error, ignore_conflicts
+            save(links, ignore_conflicts=True)
 
-
-
-
+    def remove_values(
+        self,
+        feature: str | Feature,
+        *,
+        value: Any | None = None,
+    ):
+        """Remove value annotations for a given feature.
+
+        Args:
+            feature: The feature for which to remove values.
+            value: An optional value to restrict removal to a single value.
+
+        """
+        from .artifact import Artifact
+
+        if isinstance(feature, str):
+            feature = Feature.get(name=feature)
+        filter_kwargs = {"feature": feature}
+        if feature.dtype.startswith("cat["):  # type: ignore
+            feature_registry = feature.dtype.replace("cat[", "").replace("]", "")  # type: ignore
+            if value is not None:
+                assert isinstance(value, SQLRecord)  # noqa: S101
+                # the below uses our convention for field names in link models
+                link_name = (
+                    feature_registry.split(".")[1]
+                    if "." in feature_registry
+                    else feature_registry
+                ).lower()
+                filter_kwargs[link_name] = value
+            if feature_registry == "ULabel":
+                link_attribute = "links_ulabel"
+            else:
+                link_models_on_models = {
+                    getattr(
+                        Artifact, obj.related_name
+                    ).through.__get_name_with_module__(): obj.related_model.__get_name_with_module__()
+                    for obj in Artifact._meta.related_objects
+                    if obj.related_model.__get_name_with_module__() == feature_registry
+                }
+                link_attribute = {
+                    obj.related_name
+                    for obj in Artifact._meta.related_objects
+                    if obj.related_model.__get_name_with_module__()
+                    in link_models_on_models
+                }.pop()
+            getattr(self._host, link_attribute).filter(**filter_kwargs).all().delete()
+        else:
+            if value is not None:
+                filter_kwargs["value"] = value
+            feature_values = self._host._feature_values.filter(**filter_kwargs)
+            self._host._feature_values.remove(*feature_values)
+            # this might leave a dangling feature_value record
+            # but we don't want to pay the price of making another query just to remove this annotation
+            # we can clean the FeatureValue registry periodically if we want to
+
+    def _add_schema(self, schema: Schema, slot: str) -> None:
+        """Annotate artifact with a schema.
+
+        Args:
+            schema: `Schema` A schema record.
+            slot: `str` The slot that marks where the schema is stored in
+                the artifact.
+        """
+        # TODO: deprecate as soon as we have the Schema-based curators
+        if self._host._state.adding:
+            raise ValueError(
+                "Please save the artifact or collection before adding a feature set!"
+            )
+        host_db = self._host._state.db
+        schema.save(using=host_db)
+        kwargs = {
+            "artifact_id": self._host.id,
+            "schema": schema,
+            "slot": slot,
+        }
+        link_record = (
+            self._host.feature_sets.through.objects.using(host_db)
+            .filter(**kwargs)
+            .one_or_none()
+        )
+        if link_record is None:
+            self._host.feature_sets.through(**kwargs).save(using=host_db)
+            if slot in self.slots:
+                logger.debug(f"replaced existing {slot} feature set")
+            self._slots[slot] = schema  # type: ignore
+
+    def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
+        """Transfer features from a artifact or collection."""
+        # This only covers feature sets
+        if transfer_logs is None:
+            transfer_logs = {"mapped": [], "transferred": [], "run": None}
+        from lamindb import settings
+
+        using_key = settings._using_key
+        for slot, schema in data.features.slots.items():  # type: ignore
+            try:
+                members = schema.members
+            except ModuleWasntConfigured as err:
+                logger.warning(f"skipping transfer of {slot} schema because {err}")
+                continue
+            if len(members) == 0:
+                continue
+            if len(members) > settings.annotation.n_max_records:
+                logger.warning(
+                    f"skipping creating {len(members)} > {settings.annotation.n_max_records} new {members[0].__class__.__name__} records"
+                )
+                schema_self = schema
+                schema_exists = Schema.filter(hash=schema_self.hash).one_or_none()
+                if schema_exists is not None:
+                    schema_self = schema_exists
+                else:
+                    schema_self.save()
+            else:
+                registry = members[0].__class__
+                # note here the features are transferred based on an unique field
+                field = REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), "uid")
+                # this will be e.g. be a list of ontology_ids or uids
+                member_uids = list(members.values_list(field, flat=True))
+                validated = registry.validate(member_uids, field=field, mute=True)
+                new_members_uids = list(compress(member_uids, ~validated))
+                new_members = members.filter(**{f"{field}__in": new_members_uids}).all()
+                n_new_members = len(new_members)
+                if len(members) > settings.annotation.n_max_records:
+                    logger.warning(
+                        f"skipping creating {n_new_members} > {settings.annotation.n_max_records} new {registry.__name__} records"
+                    )
+                if n_new_members > 0:
+                    # transfer foreign keys needs to be run before transfer to default db
+                    transfer_fk_to_default_db_bulk(
+                        new_members, using_key, transfer_logs=transfer_logs
+                    )
+                    for feature in new_members:
+                        # not calling save=True here as in labels, because want to
+                        # bulk save below
+                        # transfer_fk is set to False because they are already transferred
+                        # in the previous step transfer_fk_to_default_db_bulk
+                        transfer_to_default_db(
+                            feature,
+                            using_key,
+                            transfer_fk=False,
+                            transfer_logs=transfer_logs,
+                        )
+                    save(
+                        new_members, ignore_conflicts=True
+                    )  # conflicts arising from existing records are ignored
 
-
-
-
-
-
-
-
-
+                # create a new feature set from feature values using the same uid
+                schema_self = Schema.from_values(
+                    member_uids, field=getattr(registry, field)
+                )
+                if schema_self is None:
+                    if hasattr(registry, "organism_id"):
+                        logger.warning(
+                            f"Schema is not transferred, check if organism is set correctly: {schema}"
+                        )
+                    continue
+                # make sure the uid matches if schema is composed of same features
+                if schema_self.hash == schema.hash:
+                    schema_self.uid = schema.uid
+            logger.info(f"saving {slot} schema: {schema_self}")
+            try:
+                self._host.features._add_schema(schema_self, slot)
+            except IntegrityError:
+                logger.warning(
+                    f"updating annotation of artifact {self._host.uid} with feature set for slot: {slot}"
+                )
+                self._host.feature_sets.through.objects.get(
+                    artifact_id=self._host.id, slot=slot
+                ).delete()
+                self._host.features._add_schema(schema_self, slot)
+
+    def make_external(self, feature: Feature) -> None:
+        """Make a feature external, aka, remove feature from feature sets.
+
+        Args:
+            feature: `Feature` A feature record.
+
+        """
+        if not isinstance(feature, Feature):
+            raise TypeError("feature must be a Feature record!")
+        feature_sets = Schema.filter(features=feature).all()
+        for fs in feature_sets:
+            f = Feature.filter(uid=feature.uid).all()
+            features_updated = fs.members.difference(f)
+            if len(features_updated) > 0:
+                # re-compute the hash of feature sets based on the updated members
+                features_hash = hash_set({feature.uid for feature in features_updated})
+                fs.hash = features_hash
+                fs.n = len(features_updated)
+                fs.save()
+            # delete the link between the feature and the feature set
+            Schema.features.through.objects.filter(
+                feature_id=feature.id, schema_id=fs.id
+            ).delete()
+            # if no members are left in the schema, delete it
+            if len(features_updated) == 0:
+                logger.warning(f"deleting empty feature set: {fs}")
+                fs.artifacts.set([])
+                fs.delete()
+
+    @deprecated("_add_schema")
+    def add_schema(self, schema: Schema, slot: str) -> None:
+        return self._add_schema(schema, slot)
+
+    @deprecated("_add_schema")
+    def add_feature_set(self, schema: Schema, slot: str) -> None:
+        return self._add_schema(schema, slot)
+
+    @property
+    @deprecated("slots")
+    def _schema_by_slot(self):
+        return self.slots
+
+    @property
+    def _feature_set_by_slot(self):
+        return self.slots
+
+    # no longer called from within curator
+    # deprecated
+    def _add_set_from_df(
+        self,
+        field: FieldAttr = Feature.name,
+        organism: str | None = None,
+        mute: bool = False,
+    ):
+        """Add feature set corresponding to column names of DataFrame."""
+        assert self._host.otype == "DataFrame"  # noqa: S101
+        df = self._host.load(is_run_input=False)
+        schema = Schema.from_df(
+            df=df,
+            field=field,
             mute=mute,
             organism=organism,
         )
-
-
+        self._host._staged_feature_sets = {"columns": schema}
+        self._host.save()
 
-#
-
-
+    # deprecated
+    def _add_set_from_anndata(
+        self,
+        var_field: FieldAttr | None = None,
+        obs_field: FieldAttr | None = Feature.name,
+        uns_field: FieldAttr | None = None,
+        mute: bool = False,
+        organism: str | SQLRecord | None = None,
+    ):
+        """Add features from AnnData."""
+        assert self._host.otype == "AnnData"  # noqa: S101
+
+        # parse and register features
+        adata = self._host.load(is_run_input=False)
+        feature_sets = parse_staged_feature_sets_from_anndata(
+            adata,
+            var_field=var_field,
+            obs_field=obs_field,
+            uns_field=uns_field,
+            mute=mute,
+            organism=organism,
+        )
 
+        # link feature sets
+        self._host._staged_feature_sets = feature_sets
+        self._host.save()
 
-
-
-
-
-
-
-
-
-
-
-
-
+    # deprecated
+    def _add_set_from_mudata(
+        self,
+        var_fields: dict[str, FieldAttr] | None = None,
+        obs_fields: dict[str, FieldAttr] | None = None,
+        mute: bool = False,
+        organism: str | SQLRecord | None = None,
+    ):
+        """Add features from MuData."""
+        if obs_fields is None:
+            obs_fields = {}
+        assert self._host.otype == "MuData"  # noqa: S101
+
+        # parse and register features
+        mdata = self._host.load(is_run_input=False)
+        feature_sets = {}
+
+        obs_features = Feature.from_values(mdata.obs.columns)  # type: ignore
+        if len(obs_features) > 0:
+            feature_sets["obs"] = Schema(features=obs_features)
+        for modality, field in var_fields.items():
+            modality_fs = parse_staged_feature_sets_from_anndata(
+                mdata[modality],
+                var_field=field,
+                obs_field=obs_fields.get(modality, Feature.name),
+                mute=mute,
+                organism=organism,
+            )
+            for k, v in modality_fs.items():
+                feature_sets[f"['{modality}'].{k}"] = v
 
-
-
-
+        # link feature sets
+        self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
+            feature_sets
+        )
+        self._host.save()
 
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # deprecated
+    def _add_set_from_spatialdata(
+        self,
+        sample_metadata_key: str,
+        sample_metadata_field: FieldAttr = Feature.name,
+        var_fields: dict[str, FieldAttr] | None = None,
+        obs_fields: dict[str, FieldAttr] | None = None,
+        mute: bool = False,
+        organism: str | SQLRecord | None = None,
+    ):
+        """Add features from SpatialData."""
+        obs_fields, var_fields = obs_fields or {}, var_fields or {}
+        assert self._host.otype == "SpatialData"  # noqa: S101
+
+        # parse and register features
+        sdata = self._host.load(is_run_input=False)
+        feature_sets = {}
+
+        # sample features
+        sample_features = Feature.from_values(
+            sdata.get_attrs(
+                key=sample_metadata_key, return_as="df", flatten=True
+            ).columns,
+            field=sample_metadata_field,
+        )  # type: ignore
+        if len(sample_features) > 0:
+            feature_sets[sample_metadata_key] = Schema(features=sample_features)
+
+        # table features
+        for table, field in var_fields.items():
+            table_fs = parse_staged_feature_sets_from_anndata(
+                sdata[table],
+                var_field=field,
+                obs_field=obs_fields.get(table, Feature.name),
+                mute=mute,
+                organism=organism,
+            )
+            for k, v in table_fs.items():
+                feature_sets[f"['{table}'].{k}"] = v
+
+        # link feature sets
+        self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
+            feature_sets
         )
-
-        feature_sets[f"['{table}'].{k}"] = v
-
-    # link feature sets
-    self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
-    self._host.save()
-
-
-# mypy: ignore-errors
-FeatureManager.__init__ = __init__
-FeatureManager.__repr__ = __repr__
-FeatureManager.describe = describe
-FeatureManager.__getitem__ = __getitem__
-FeatureManager.get_values = get_values
-FeatureManager.slots = slots
-FeatureManager.add_values = add_values_features
-FeatureManager._add_schema = _add_schema
-FeatureManager._accessor_by_registry = _accessor_by_registry
-FeatureManager._add_from = _add_from
-FeatureManager.filter = filter
-FeatureManager.get = get
-FeatureManager.make_external = make_external
-FeatureManager.remove_values = remove_values
-
-# deprecated
-FeatureManager._add_set_from_df = _add_set_from_df
-FeatureManager._add_set_from_anndata = _add_set_from_anndata
-FeatureManager._add_set_from_mudata = _add_set_from_mudata
-FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
-FeatureManager.add_schema = add_schema
-FeatureManager.add_feature_set = add_feature_set
-FeatureManager._schema_by_slot = _schema_by_slot
-FeatureManager._feature_set_by_slot = _feature_set_by_slot
+        self._host.save()
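The consolidated `FeatureManager` class added in the last hunk keeps the public annotation API (`add_values`, `get_values`, `remove_values`, `slots`, `describe`) in one place instead of monkey-patching module-level functions onto the class. A short usage sketch against an already-saved artifact; the artifact key and the feature names/values are illustrative only:

```python
import lamindb as ln

artifact = ln.Artifact.get(key="examples/example.h5ad")  # hypothetical existing artifact

# keys must correspond to registered ln.Feature records
artifact.features.add_values({"species": "human", "temperature": 21.6})

artifact.features.get_values()   # -> {"species": "human", "temperature": 21.6}
artifact.features.slots          # -> e.g. {"var": <Schema: var>, "obs": <Schema: obs>}
artifact.features.describe()     # rich tree of linked features
artifact.features.remove_values("temperature", value=21.6)
```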