lamindb 1.4.0__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +52 -36
- lamindb/_finish.py +17 -10
- lamindb/_tracked.py +1 -1
- lamindb/base/__init__.py +3 -1
- lamindb/base/fields.py +40 -22
- lamindb/base/ids.py +1 -94
- lamindb/base/types.py +2 -0
- lamindb/base/uids.py +117 -0
- lamindb/core/_context.py +203 -102
- lamindb/core/_settings.py +38 -25
- lamindb/core/datasets/__init__.py +11 -4
- lamindb/core/datasets/_core.py +5 -5
- lamindb/core/datasets/_small.py +0 -93
- lamindb/core/datasets/mini_immuno.py +172 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_backed_access.py +100 -6
- lamindb/core/storage/_polars_lazy_df.py +51 -0
- lamindb/core/storage/_pyarrow_dataset.py +15 -30
- lamindb/core/storage/_tiledbsoma.py +29 -13
- lamindb/core/storage/objects.py +6 -0
- lamindb/core/subsettings/__init__.py +2 -0
- lamindb/core/subsettings/_annotation_settings.py +11 -0
- lamindb/curators/__init__.py +7 -3349
- lamindb/curators/_legacy.py +2056 -0
- lamindb/curators/core.py +1534 -0
- lamindb/errors.py +11 -0
- lamindb/examples/__init__.py +27 -0
- lamindb/examples/schemas/__init__.py +12 -0
- lamindb/examples/schemas/_anndata.py +25 -0
- lamindb/examples/schemas/_simple.py +19 -0
- lamindb/integrations/_vitessce.py +8 -5
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
- lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
- lamindb/migrations/0093_alter_schemacomponent_unique_together.py +16 -0
- lamindb/models/__init__.py +4 -1
- lamindb/models/_describe.py +21 -4
- lamindb/models/_feature_manager.py +382 -287
- lamindb/models/_label_manager.py +8 -2
- lamindb/models/artifact.py +177 -106
- lamindb/models/artifact_set.py +122 -0
- lamindb/models/collection.py +73 -52
- lamindb/models/core.py +1 -1
- lamindb/models/feature.py +51 -17
- lamindb/models/has_parents.py +69 -14
- lamindb/models/project.py +1 -1
- lamindb/models/query_manager.py +221 -22
- lamindb/models/query_set.py +247 -172
- lamindb/models/record.py +65 -247
- lamindb/models/run.py +4 -4
- lamindb/models/save.py +8 -2
- lamindb/models/schema.py +456 -184
- lamindb/models/transform.py +2 -2
- lamindb/models/ulabel.py +8 -5
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/METADATA +6 -6
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/RECORD +57 -43
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/LICENSE +0 -0
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/WHEEL +0 -0
@@ -13,7 +13,7 @@ import pandas as pd
|
|
13
13
|
from anndata import AnnData
|
14
14
|
from django.contrib.postgres.aggregates import ArrayAgg
|
15
15
|
from django.db import connections
|
16
|
-
from django.db.models import Aggregate
|
16
|
+
from django.db.models import Aggregate, ProtectedError, Subquery
|
17
17
|
from lamin_utils import logger
|
18
18
|
from lamindb_setup.core.hashing import hash_set
|
19
19
|
from lamindb_setup.core.upath import create_path
|
@@ -42,14 +42,14 @@ from ._describe import (
|
|
42
42
|
TYPE_WIDTH,
|
43
43
|
VALUES_WIDTH,
|
44
44
|
describe_header,
|
45
|
-
|
45
|
+
format_rich_tree,
|
46
46
|
)
|
47
47
|
from ._django import get_artifact_with_related
|
48
48
|
from ._label_manager import _get_labels, describe_labels
|
49
49
|
from ._relations import (
|
50
50
|
dict_related_model_to_related_name,
|
51
51
|
)
|
52
|
-
from .feature import Feature, FeatureValue
|
52
|
+
from .feature import Feature, FeatureValue, parse_dtype
|
53
53
|
from .record import Record
|
54
54
|
from .run import Param, ParamManager, ParamManagerRun, ParamValue, Run
|
55
55
|
from .ulabel import ULabel
|
@@ -320,15 +320,27 @@ def describe_features(
|
|
320
320
|
schema_data[slot] = (schema, feature_names)
|
321
321
|
for feature_name in feature_names:
|
322
322
|
feature_data[feature_name] = (slot, registry_str)
|
323
|
+
schema_data.update(
|
324
|
+
{
|
325
|
+
slot: (schema, schema.n)
|
326
|
+
for slot, schema in get_schema_by_slot_(self).items()
|
327
|
+
if slot not in schema_data
|
328
|
+
}
|
329
|
+
)
|
323
330
|
else:
|
324
331
|
for slot, schema in get_schema_by_slot_(self).items():
|
325
332
|
features = schema.members
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
333
|
+
if features.exists():
|
334
|
+
# features.first() is a lot slower than features[0] here
|
335
|
+
name_field = get_name_field(features[0])
|
336
|
+
feature_names = list(
|
337
|
+
features.values_list(name_field, flat=True)[:20]
|
338
|
+
)
|
339
|
+
schema_data[slot] = (schema, feature_names)
|
340
|
+
for feature_name in feature_names:
|
341
|
+
feature_data[feature_name] = (slot, schema.itype)
|
342
|
+
else:
|
343
|
+
schema_data[slot] = (schema, schema.n)
|
332
344
|
|
333
345
|
internal_feature_names: dict[str, str] = {}
|
334
346
|
if isinstance(self, Artifact):
|
@@ -400,38 +412,44 @@ def describe_features(
|
|
400
412
|
internal_feature_labels_slot.setdefault(slot, []).append(feature_row)
|
401
413
|
|
402
414
|
int_features_tree_children = []
|
403
|
-
for slot, (schema,
|
404
|
-
if
|
405
|
-
|
406
|
-
feature_rows = internal_feature_labels_slot[slot]
|
407
|
-
# add internal Feature features without labels
|
408
|
-
feature_rows += [
|
409
|
-
(
|
410
|
-
feature_name,
|
411
|
-
Text(str(internal_feature_names.get(feature_name)), style="dim"),
|
412
|
-
"",
|
413
|
-
)
|
414
|
-
for feature_name in feature_names
|
415
|
-
if feature_name and feature_name not in internal_feature_labels
|
416
|
-
]
|
415
|
+
for slot, (schema, feature_names_or_n) in schema_data.items():
|
416
|
+
if isinstance(feature_names_or_n, int):
|
417
|
+
feature_rows = []
|
417
418
|
else:
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
419
|
+
feature_names = feature_names_or_n
|
420
|
+
if slot in internal_feature_labels_slot:
|
421
|
+
# add internal Feature features with labels
|
422
|
+
feature_rows = internal_feature_labels_slot[slot]
|
423
|
+
# add internal Feature features without labels
|
424
|
+
feature_rows += [
|
425
|
+
(
|
426
|
+
feature_name,
|
427
|
+
Text(
|
428
|
+
str(internal_feature_names.get(feature_name)), style="dim"
|
427
429
|
),
|
428
|
-
|
429
|
-
)
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
430
|
+
"",
|
431
|
+
)
|
432
|
+
for feature_name in feature_names
|
433
|
+
if feature_name and feature_name not in internal_feature_labels
|
434
|
+
]
|
435
|
+
else:
|
436
|
+
# add internal non-Feature features without labels
|
437
|
+
feature_rows = [
|
438
|
+
(
|
439
|
+
feature_name,
|
440
|
+
Text(
|
441
|
+
str(
|
442
|
+
internal_feature_names.get(feature_name)
|
443
|
+
if feature_name in internal_feature_names
|
444
|
+
else schema.dtype
|
445
|
+
),
|
446
|
+
style="dim",
|
447
|
+
),
|
448
|
+
"",
|
449
|
+
)
|
450
|
+
for feature_name in feature_names
|
451
|
+
if feature_name
|
452
|
+
]
|
435
453
|
int_features_tree_children.append(
|
436
454
|
_create_feature_table(
|
437
455
|
Text.assemble(
|
@@ -482,59 +500,6 @@ def describe_features(
|
|
482
500
|
return tree
|
483
501
|
|
484
502
|
|
485
|
-
def parse_staged_feature_sets_from_anndata(
|
486
|
-
adata: AnnData,
|
487
|
-
var_field: FieldAttr | None = None,
|
488
|
-
obs_field: FieldAttr = Feature.name,
|
489
|
-
uns_field: FieldAttr | None = None,
|
490
|
-
mute: bool = False,
|
491
|
-
organism: str | Record | None = None,
|
492
|
-
) -> dict:
|
493
|
-
data_parse = adata
|
494
|
-
if not isinstance(adata, AnnData): # is a path
|
495
|
-
filepath = create_path(adata) # returns Path for local
|
496
|
-
if not isinstance(filepath, LocalPathClasses):
|
497
|
-
from lamindb import settings
|
498
|
-
from lamindb.core.storage._backed_access import backed_access
|
499
|
-
|
500
|
-
using_key = settings._using_key
|
501
|
-
data_parse = backed_access(filepath, using_key=using_key)
|
502
|
-
else:
|
503
|
-
data_parse = ad.read_h5ad(filepath, backed="r")
|
504
|
-
type = "float"
|
505
|
-
else:
|
506
|
-
type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
|
507
|
-
feature_sets = {}
|
508
|
-
if var_field is not None:
|
509
|
-
schema_var = Schema.from_values(
|
510
|
-
data_parse.var.index,
|
511
|
-
var_field,
|
512
|
-
type=type,
|
513
|
-
mute=mute,
|
514
|
-
organism=organism,
|
515
|
-
raise_validation_error=False,
|
516
|
-
)
|
517
|
-
if schema_var is not None:
|
518
|
-
feature_sets["var"] = schema_var
|
519
|
-
if obs_field is not None and len(data_parse.obs.columns) > 0:
|
520
|
-
schema_obs = Schema.from_df(
|
521
|
-
df=data_parse.obs,
|
522
|
-
field=obs_field,
|
523
|
-
mute=mute,
|
524
|
-
organism=organism,
|
525
|
-
)
|
526
|
-
if schema_obs is not None:
|
527
|
-
feature_sets["obs"] = schema_obs
|
528
|
-
if uns_field is not None and len(data_parse.uns) > 0:
|
529
|
-
validated_features = Feature.from_values( # type: ignore
|
530
|
-
data_parse.uns.keys(), field=uns_field, organism=organism
|
531
|
-
)
|
532
|
-
if len(validated_features) > 0:
|
533
|
-
schema_uns = Schema(validated_features, dtype=None, otype="dict")
|
534
|
-
feature_sets["uns"] = schema_uns
|
535
|
-
return feature_sets
|
536
|
-
|
537
|
-
|
538
503
|
def is_valid_datetime_str(date_string: str) -> bool | str:
|
539
504
|
try:
|
540
505
|
dt = datetime.fromisoformat(date_string)
|
@@ -625,8 +590,12 @@ def __init__(self, host: Artifact | Collection | Run):
|
|
625
590
|
|
626
591
|
|
627
592
|
def __repr__(self) -> str:
|
593
|
+
return describe(self, return_str=True) # type: ignore
|
594
|
+
|
595
|
+
|
596
|
+
def describe(self, return_str: bool = False) -> str | None:
|
628
597
|
tree = describe_features(self._host, print_params=(self.__class__ == ParamManager)) # type: ignore
|
629
|
-
return
|
598
|
+
return format_rich_tree(tree, fallback="no linked features", return_str=return_str)
|
630
599
|
|
631
600
|
|
632
601
|
def get_values(self) -> dict[str, Any]:
|
@@ -676,50 +645,86 @@ def filter_base(cls, _skip_validation: bool = True, **expression) -> QuerySet:
|
|
676
645
|
comparator = f"__{split_key[1]}"
|
677
646
|
feature = features.get(name=normalized_key)
|
678
647
|
if not feature.dtype.startswith("cat"):
|
648
|
+
if comparator == "__isnull":
|
649
|
+
if cls == FeatureManager:
|
650
|
+
from .artifact import ArtifactFeatureValue
|
651
|
+
|
652
|
+
if value: # True
|
653
|
+
return Artifact.objects.exclude(
|
654
|
+
id__in=Subquery(
|
655
|
+
ArtifactFeatureValue.objects.filter(
|
656
|
+
featurevalue__feature=feature
|
657
|
+
).values("artifact_id")
|
658
|
+
)
|
659
|
+
)
|
660
|
+
else:
|
661
|
+
return Artifact.objects.exclude(
|
662
|
+
id__in=Subquery(
|
663
|
+
ArtifactFeatureValue.objects.filter(
|
664
|
+
featurevalue__feature=feature
|
665
|
+
).values("artifact_id")
|
666
|
+
)
|
667
|
+
)
|
668
|
+
if comparator in {"__startswith", "__contains"}:
|
669
|
+
logger.important(
|
670
|
+
f"currently not supporting `{comparator}`, using `__icontains` instead"
|
671
|
+
)
|
672
|
+
comparator = "__icontains"
|
679
673
|
expression = {feature_param: feature, f"value{comparator}": value}
|
680
|
-
|
681
|
-
new_expression[f"_{feature_param}
|
682
|
-
elif isinstance(value, (str, Record)):
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
if len(labels) == 0:
|
694
|
-
raise DoesNotExist(
|
695
|
-
f"Did not find a ULabel matching `name{comparator}={value}`"
|
696
|
-
)
|
697
|
-
elif len(labels) == 1:
|
698
|
-
label = labels[0]
|
699
|
-
elif isinstance(value, Record):
|
700
|
-
label = value
|
701
|
-
label_registry = (
|
702
|
-
label.__class__ if label is not None else labels[0].__class__
|
703
|
-
)
|
704
|
-
accessor_name = (
|
705
|
-
label_registry.artifacts.through.artifact.field._related_name
|
706
|
-
)
|
707
|
-
new_expression[f"{accessor_name}__feature"] = feature
|
708
|
-
if label is not None:
|
709
|
-
# simplified query if we have exactly one label
|
710
|
-
new_expression[
|
711
|
-
f"{accessor_name}__{label_registry.__name__.lower()}"
|
712
|
-
] = label
|
674
|
+
feature_values = value_model.filter(**expression)
|
675
|
+
new_expression[f"_{feature_param}_values__id__in"] = feature_values
|
676
|
+
elif isinstance(value, (str, Record, bool)):
|
677
|
+
if comparator == "__isnull":
|
678
|
+
if cls == FeatureManager:
|
679
|
+
result = parse_dtype(feature.dtype)[0]
|
680
|
+
kwargs = {
|
681
|
+
f"links_{result['registry'].__name__.lower()}__feature": feature
|
682
|
+
}
|
683
|
+
if value: # True
|
684
|
+
return Artifact.objects.exclude(**kwargs)
|
685
|
+
else:
|
686
|
+
return Artifact.objects.filter(**kwargs)
|
713
687
|
else:
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
|
688
|
+
# because SQL is sensitive to whether querying with __in or not
|
689
|
+
# and might return multiple equivalent records for the latter
|
690
|
+
# we distinguish cases in which we have multiple label matches vs. one
|
691
|
+
label = None
|
692
|
+
labels = None
|
693
|
+
if isinstance(value, str):
|
694
|
+
# we need the comparator here because users might query like so
|
695
|
+
# ln.Artifact.filter(experiment__contains="Experi")
|
696
|
+
expression = {f"name{comparator}": value}
|
697
|
+
labels = ULabel.filter(**expression).all()
|
698
|
+
if len(labels) == 0:
|
699
|
+
raise DoesNotExist(
|
700
|
+
f"Did not find a ULabel matching `name{comparator}={value}`"
|
701
|
+
)
|
702
|
+
elif len(labels) == 1:
|
703
|
+
label = labels[0]
|
704
|
+
elif isinstance(value, Record):
|
705
|
+
label = value
|
706
|
+
label_registry = (
|
707
|
+
label.__class__ if label is not None else labels[0].__class__
|
708
|
+
)
|
709
|
+
accessor_name = (
|
710
|
+
label_registry.artifacts.through.artifact.field._related_name
|
711
|
+
)
|
712
|
+
new_expression[f"{accessor_name}__feature"] = feature
|
713
|
+
if label is not None:
|
714
|
+
# simplified query if we have exactly one label
|
715
|
+
new_expression[
|
716
|
+
f"{accessor_name}__{label_registry.__name__.lower()}"
|
717
|
+
] = label
|
718
|
+
else:
|
719
|
+
new_expression[
|
720
|
+
f"{accessor_name}__{label_registry.__name__.lower()}__in"
|
721
|
+
] = labels
|
718
722
|
# if passing a list of records, we want to
|
719
723
|
# find artifacts that are annotated by all of them at the same
|
720
724
|
# time; hence, we don't want the __in construct that we use to match strings
|
721
725
|
# https://laminlabs.slack.com/archives/C04FPE8V01W/p1688328084810609
|
722
|
-
|
726
|
+
if not (new_expression):
|
727
|
+
raise NotImplementedError
|
723
728
|
if cls == FeatureManager or cls == ParamManagerArtifact:
|
724
729
|
return Artifact.objects.filter(**new_expression)
|
725
730
|
elif cls == ParamManagerRun:
|
@@ -821,15 +826,14 @@ def _add_values(
|
|
821
826
|
from .artifact import Artifact
|
822
827
|
|
823
828
|
# rename to distinguish from the values inside the dict
|
824
|
-
|
825
|
-
keys =
|
829
|
+
dictionary = values
|
830
|
+
keys = dictionary.keys()
|
826
831
|
if isinstance(keys, DICT_KEYS_TYPE):
|
827
832
|
keys = list(keys) # type: ignore
|
828
833
|
# deal with other cases later
|
829
834
|
assert all(isinstance(key, str) for key in keys) # noqa: S101
|
830
835
|
registry = feature_param_field.field.model
|
831
836
|
is_param = registry == Param
|
832
|
-
model = Param if is_param else Feature
|
833
837
|
value_model = ParamValue if is_param else FeatureValue
|
834
838
|
model_name = "Param" if is_param else "Feature"
|
835
839
|
if is_param:
|
@@ -842,13 +846,11 @@ def _add_values(
|
|
842
846
|
raise ValidationError(
|
843
847
|
"Can only set features for dataset-like artifacts."
|
844
848
|
)
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
if validated.sum() != len(keys):
|
849
|
-
not_validated_keys = keys_array[~validated]
|
849
|
+
records = registry.from_values(keys, field=feature_param_field, mute=True)
|
850
|
+
if len(records) != len(keys):
|
851
|
+
not_validated_keys = [key for key in keys if key not in records.list("name")]
|
850
852
|
not_validated_keys_dtype_message = [
|
851
|
-
(key, infer_feature_type_convert_json(key,
|
853
|
+
(key, infer_feature_type_convert_json(key, dictionary[key]))
|
852
854
|
for key in not_validated_keys
|
853
855
|
]
|
854
856
|
run = get_current_tracked_run()
|
@@ -866,7 +868,7 @@ def _add_values(
|
|
866
868
|
]
|
867
869
|
hint = "\n".join(elements)
|
868
870
|
msg = (
|
869
|
-
f"These keys could not be validated: {not_validated_keys
|
871
|
+
f"These keys could not be validated: {not_validated_keys}\n"
|
870
872
|
f"Here is how to create a {model_name.lower()}:\n\n{hint}"
|
871
873
|
)
|
872
874
|
raise ValidationError(msg)
|
@@ -875,10 +877,10 @@ def _add_values(
|
|
875
877
|
features_labels = defaultdict(list)
|
876
878
|
_feature_values = []
|
877
879
|
not_validated_values = []
|
878
|
-
for
|
879
|
-
|
880
|
+
for feature in records:
|
881
|
+
value = dictionary[feature.name]
|
880
882
|
inferred_type, converted_value, _ = infer_feature_type_convert_json(
|
881
|
-
|
883
|
+
feature.name,
|
882
884
|
value,
|
883
885
|
mute=True,
|
884
886
|
str_as_ulabel=str_as_ulabel,
|
@@ -886,25 +888,23 @@ def _add_values(
|
|
886
888
|
if feature.dtype == "num":
|
887
889
|
if inferred_type not in {"int", "float"}:
|
888
890
|
raise TypeError(
|
889
|
-
f"Value for feature '{
|
891
|
+
f"Value for feature '{feature.name}' with type {feature.dtype} must be a number"
|
890
892
|
)
|
891
893
|
elif feature.dtype.startswith("cat"):
|
892
894
|
if inferred_type != "?":
|
893
895
|
if not (inferred_type.startswith("cat") or isinstance(value, Record)):
|
894
896
|
raise TypeError(
|
895
|
-
f"Value for feature '{
|
897
|
+
f"Value for feature '{feature.name}' with type '{feature.dtype}' must be a string or record."
|
896
898
|
)
|
897
899
|
elif (feature.dtype == "str" and feature.dtype not in inferred_type) or (
|
898
900
|
feature.dtype != "str" and feature.dtype != inferred_type
|
899
901
|
):
|
900
902
|
raise ValidationError(
|
901
|
-
f"Expected dtype for '{
|
903
|
+
f"Expected dtype for '{feature.name}' is '{feature.dtype}', got '{inferred_type}'"
|
902
904
|
)
|
903
905
|
if not feature.dtype.startswith("cat"):
|
904
906
|
filter_kwargs = {model_name.lower(): feature, "value": converted_value}
|
905
|
-
feature_value = value_model.
|
906
|
-
if feature_value is None:
|
907
|
-
feature_value = value_model(**filter_kwargs)
|
907
|
+
feature_value, _ = value_model.get_or_create(**filter_kwargs)
|
908
908
|
_feature_values.append(feature_value)
|
909
909
|
else:
|
910
910
|
if isinstance(value, Record) or (
|
@@ -942,30 +942,63 @@ def _add_values(
|
|
942
942
|
(feature, label_record) for label_record in label_records
|
943
943
|
]
|
944
944
|
if not_validated_values:
|
945
|
-
|
946
|
-
|
947
|
-
f" ln.save(ulabels)"
|
948
|
-
)
|
945
|
+
not_validated_values.sort()
|
946
|
+
hint = f" ulabels = ln.ULabel.from_values({not_validated_values}, create=True).save()\n"
|
949
947
|
msg = (
|
950
948
|
f"These values could not be validated: {not_validated_values}\n"
|
951
949
|
f"Here is how to create ulabels for them:\n\n{hint}"
|
952
950
|
)
|
953
951
|
raise ValidationError(msg)
|
952
|
+
# TODO: create an explicit version of this
|
953
|
+
# if not is_param:
|
954
|
+
# # check if _expect_many is false for _all_ records
|
955
|
+
# if any(record._expect_many for record in records):
|
956
|
+
# updated_features = []
|
957
|
+
# for record in records:
|
958
|
+
# if record._expect_many:
|
959
|
+
# record._expect_many = False
|
960
|
+
# record.save()
|
961
|
+
# updated_features.append(record.name)
|
962
|
+
# if any(updated_features):
|
963
|
+
# logger.important(
|
964
|
+
# f"changed observational unit to Artifact for features: {', '.join(updated_features)}"
|
965
|
+
# )
|
954
966
|
# bulk add all links
|
955
967
|
if features_labels:
|
956
968
|
add_label_feature_links(self, features_labels)
|
957
969
|
if _feature_values:
|
958
|
-
|
970
|
+
to_insert_feature_values = [
|
971
|
+
record for record in _feature_values if record._state.adding
|
972
|
+
]
|
973
|
+
if to_insert_feature_values:
|
974
|
+
save(to_insert_feature_values)
|
975
|
+
dict_typed_features = [
|
976
|
+
getattr(record, model_name.lower())
|
977
|
+
for record in _feature_values
|
978
|
+
if getattr(record, model_name.lower()).dtype == "dict"
|
979
|
+
]
|
959
980
|
if is_param:
|
960
981
|
LinkORM = self._host._param_values.through
|
961
982
|
valuefield_id = "paramvalue_id"
|
962
983
|
else:
|
963
984
|
LinkORM = self._host._feature_values.through
|
964
985
|
valuefield_id = "featurevalue_id"
|
986
|
+
host_class_lower = self._host.__class__.__get_name_with_module__().lower()
|
987
|
+
if dict_typed_features:
|
988
|
+
# delete all previously existing anotations with dictionaries
|
989
|
+
kwargs = {
|
990
|
+
f"links_{host_class_lower}__{host_class_lower}_id": self._host.id,
|
991
|
+
f"{model_name.lower()}__in": dict_typed_features,
|
992
|
+
}
|
993
|
+
try:
|
994
|
+
value_model.filter(**kwargs).all().delete()
|
995
|
+
except ProtectedError:
|
996
|
+
pass
|
997
|
+
# add new feature links
|
965
998
|
links = [
|
966
999
|
LinkORM(
|
967
1000
|
**{
|
968
|
-
f"{
|
1001
|
+
f"{host_class_lower}_id": self._host.id,
|
969
1002
|
valuefield_id: feature_value.id,
|
970
1003
|
}
|
971
1004
|
)
|
@@ -1092,52 +1125,6 @@ def _add_schema(self, schema: Schema, slot: str) -> None:
|
|
1092
1125
|
self._slots[slot] = schema # type: ignore
|
1093
1126
|
|
1094
1127
|
|
1095
|
-
def _add_set_from_df(
|
1096
|
-
self,
|
1097
|
-
field: FieldAttr = Feature.name,
|
1098
|
-
organism: str | None = None,
|
1099
|
-
mute: bool = False,
|
1100
|
-
):
|
1101
|
-
"""Add feature set corresponding to column names of DataFrame."""
|
1102
|
-
assert self._host.otype == "DataFrame" # noqa: S101
|
1103
|
-
df = self._host.load(is_run_input=False)
|
1104
|
-
schema = Schema.from_df(
|
1105
|
-
df=df,
|
1106
|
-
field=field,
|
1107
|
-
mute=mute,
|
1108
|
-
organism=organism,
|
1109
|
-
)
|
1110
|
-
self._host._staged_feature_sets = {"columns": schema}
|
1111
|
-
self._host.save()
|
1112
|
-
|
1113
|
-
|
1114
|
-
def _add_set_from_anndata(
|
1115
|
-
self,
|
1116
|
-
var_field: FieldAttr | None = None,
|
1117
|
-
obs_field: FieldAttr | None = Feature.name,
|
1118
|
-
uns_field: FieldAttr | None = None,
|
1119
|
-
mute: bool = False,
|
1120
|
-
organism: str | Record | None = None,
|
1121
|
-
):
|
1122
|
-
"""Add features from AnnData."""
|
1123
|
-
assert self._host.otype == "AnnData" # noqa: S101
|
1124
|
-
|
1125
|
-
# parse and register features
|
1126
|
-
adata = self._host.load(is_run_input=False)
|
1127
|
-
feature_sets = parse_staged_feature_sets_from_anndata(
|
1128
|
-
adata,
|
1129
|
-
var_field=var_field,
|
1130
|
-
obs_field=obs_field,
|
1131
|
-
uns_field=uns_field,
|
1132
|
-
mute=mute,
|
1133
|
-
organism=organism,
|
1134
|
-
)
|
1135
|
-
|
1136
|
-
# link feature sets
|
1137
|
-
self._host._staged_feature_sets = feature_sets
|
1138
|
-
self._host.save()
|
1139
|
-
|
1140
|
-
|
1141
1128
|
def _unify_staged_feature_sets_by_hash(
|
1142
1129
|
feature_sets: MutableMapping[str, Schema],
|
1143
1130
|
):
|
@@ -1153,83 +1140,6 @@ def _unify_staged_feature_sets_by_hash(
|
|
1153
1140
|
return feature_sets
|
1154
1141
|
|
1155
1142
|
|
1156
|
-
def _add_set_from_mudata(
|
1157
|
-
self,
|
1158
|
-
var_fields: dict[str, FieldAttr] | None = None,
|
1159
|
-
obs_fields: dict[str, FieldAttr] | None = None,
|
1160
|
-
mute: bool = False,
|
1161
|
-
organism: str | Record | None = None,
|
1162
|
-
):
|
1163
|
-
"""Add features from MuData."""
|
1164
|
-
if obs_fields is None:
|
1165
|
-
obs_fields = {}
|
1166
|
-
assert self._host.otype == "MuData" # noqa: S101
|
1167
|
-
|
1168
|
-
# parse and register features
|
1169
|
-
mdata = self._host.load(is_run_input=False)
|
1170
|
-
feature_sets = {}
|
1171
|
-
|
1172
|
-
obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
|
1173
|
-
if len(obs_features) > 0:
|
1174
|
-
feature_sets["obs"] = Schema(features=obs_features)
|
1175
|
-
for modality, field in var_fields.items():
|
1176
|
-
modality_fs = parse_staged_feature_sets_from_anndata(
|
1177
|
-
mdata[modality],
|
1178
|
-
var_field=field,
|
1179
|
-
obs_field=obs_fields.get(modality, Feature.name),
|
1180
|
-
mute=mute,
|
1181
|
-
organism=organism,
|
1182
|
-
)
|
1183
|
-
for k, v in modality_fs.items():
|
1184
|
-
feature_sets[f"['{modality}'].{k}"] = v
|
1185
|
-
|
1186
|
-
# link feature sets
|
1187
|
-
self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
|
1188
|
-
self._host.save()
|
1189
|
-
|
1190
|
-
|
1191
|
-
def _add_set_from_spatialdata(
|
1192
|
-
self,
|
1193
|
-
sample_metadata_key: str,
|
1194
|
-
sample_metadata_field: FieldAttr = Feature.name,
|
1195
|
-
var_fields: dict[str, FieldAttr] | None = None,
|
1196
|
-
obs_fields: dict[str, FieldAttr] | None = None,
|
1197
|
-
mute: bool = False,
|
1198
|
-
organism: str | Record | None = None,
|
1199
|
-
):
|
1200
|
-
"""Add features from SpatialData."""
|
1201
|
-
obs_fields, var_fields = obs_fields or {}, var_fields or {}
|
1202
|
-
assert self._host.otype == "SpatialData" # noqa: S101
|
1203
|
-
|
1204
|
-
# parse and register features
|
1205
|
-
sdata = self._host.load(is_run_input=False)
|
1206
|
-
feature_sets = {}
|
1207
|
-
|
1208
|
-
# sample features
|
1209
|
-
sample_features = Feature.from_values(
|
1210
|
-
sdata.get_attrs(key=sample_metadata_key, return_as="df", flatten=True).columns,
|
1211
|
-
field=sample_metadata_field,
|
1212
|
-
) # type: ignore
|
1213
|
-
if len(sample_features) > 0:
|
1214
|
-
feature_sets[sample_metadata_key] = Schema(features=sample_features)
|
1215
|
-
|
1216
|
-
# table features
|
1217
|
-
for table, field in var_fields.items():
|
1218
|
-
table_fs = parse_staged_feature_sets_from_anndata(
|
1219
|
-
sdata[table],
|
1220
|
-
var_field=field,
|
1221
|
-
obs_field=obs_fields.get(table, Feature.name),
|
1222
|
-
mute=mute,
|
1223
|
-
organism=organism,
|
1224
|
-
)
|
1225
|
-
for k, v in table_fs.items():
|
1226
|
-
feature_sets[f"['{table}'].{k}"] = v
|
1227
|
-
|
1228
|
-
# link feature sets
|
1229
|
-
self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
|
1230
|
-
self._host.save()
|
1231
|
-
|
1232
|
-
|
1233
1143
|
def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
|
1234
1144
|
"""Transfer features from a artifact or collection."""
|
1235
1145
|
# This only covers feature sets
|
@@ -1340,25 +1250,200 @@ def _feature_set_by_slot(self):
|
|
1340
1250
|
return self.slots
|
1341
1251
|
|
1342
1252
|
|
1253
|
+
# deprecated: feature set parsing
|
1254
|
+
|
1255
|
+
|
1256
|
+
def parse_staged_feature_sets_from_anndata(
|
1257
|
+
adata: AnnData,
|
1258
|
+
var_field: FieldAttr | None = None,
|
1259
|
+
obs_field: FieldAttr = Feature.name,
|
1260
|
+
uns_field: FieldAttr | None = None,
|
1261
|
+
mute: bool = False,
|
1262
|
+
organism: str | Record | None = None,
|
1263
|
+
) -> dict:
|
1264
|
+
data_parse = adata
|
1265
|
+
if not isinstance(adata, AnnData): # is a path
|
1266
|
+
filepath = create_path(adata) # returns Path for local
|
1267
|
+
if not isinstance(filepath, LocalPathClasses):
|
1268
|
+
from lamindb import settings
|
1269
|
+
from lamindb.core.storage._backed_access import backed_access
|
1270
|
+
|
1271
|
+
using_key = settings._using_key
|
1272
|
+
data_parse = backed_access(filepath, using_key=using_key)
|
1273
|
+
else:
|
1274
|
+
data_parse = ad.read_h5ad(filepath, backed="r")
|
1275
|
+
type = "float"
|
1276
|
+
else:
|
1277
|
+
type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
|
1278
|
+
feature_sets = {}
|
1279
|
+
if var_field is not None:
|
1280
|
+
schema_var = Schema.from_values(
|
1281
|
+
data_parse.var.index,
|
1282
|
+
var_field,
|
1283
|
+
type=type,
|
1284
|
+
mute=mute,
|
1285
|
+
organism=organism,
|
1286
|
+
raise_validation_error=False,
|
1287
|
+
)
|
1288
|
+
if schema_var is not None:
|
1289
|
+
feature_sets["var"] = schema_var
|
1290
|
+
if obs_field is not None and len(data_parse.obs.columns) > 0:
|
1291
|
+
schema_obs = Schema.from_df(
|
1292
|
+
df=data_parse.obs,
|
1293
|
+
field=obs_field,
|
1294
|
+
mute=mute,
|
1295
|
+
organism=organism,
|
1296
|
+
)
|
1297
|
+
if schema_obs is not None:
|
1298
|
+
feature_sets["obs"] = schema_obs
|
1299
|
+
if uns_field is not None and len(data_parse.uns) > 0:
|
1300
|
+
validated_features = Feature.from_values( # type: ignore
|
1301
|
+
data_parse.uns.keys(), field=uns_field, organism=organism
|
1302
|
+
)
|
1303
|
+
if len(validated_features) > 0:
|
1304
|
+
schema_uns = Schema(validated_features, dtype=None, otype="dict")
|
1305
|
+
feature_sets["uns"] = schema_uns
|
1306
|
+
return feature_sets
|
1307
|
+
|
1308
|
+
|
1309
|
+
# no longer called from within curator
|
1310
|
+
# might deprecate in the future?
|
1311
|
+
def _add_set_from_df(
|
1312
|
+
self,
|
1313
|
+
field: FieldAttr = Feature.name,
|
1314
|
+
organism: str | None = None,
|
1315
|
+
mute: bool = False,
|
1316
|
+
):
|
1317
|
+
"""Add feature set corresponding to column names of DataFrame."""
|
1318
|
+
assert self._host.otype == "DataFrame" # noqa: S101
|
1319
|
+
df = self._host.load(is_run_input=False)
|
1320
|
+
schema = Schema.from_df(
|
1321
|
+
df=df,
|
1322
|
+
field=field,
|
1323
|
+
mute=mute,
|
1324
|
+
organism=organism,
|
1325
|
+
)
|
1326
|
+
self._host._staged_feature_sets = {"columns": schema}
|
1327
|
+
self._host.save()
|
1328
|
+
|
1329
|
+
|
1330
|
+
def _add_set_from_anndata(
|
1331
|
+
self,
|
1332
|
+
var_field: FieldAttr | None = None,
|
1333
|
+
obs_field: FieldAttr | None = Feature.name,
|
1334
|
+
uns_field: FieldAttr | None = None,
|
1335
|
+
mute: bool = False,
|
1336
|
+
organism: str | Record | None = None,
|
1337
|
+
):
|
1338
|
+
"""Add features from AnnData."""
|
1339
|
+
assert self._host.otype == "AnnData" # noqa: S101
|
1340
|
+
|
1341
|
+
# parse and register features
|
1342
|
+
adata = self._host.load(is_run_input=False)
|
1343
|
+
feature_sets = parse_staged_feature_sets_from_anndata(
|
1344
|
+
adata,
|
1345
|
+
var_field=var_field,
|
1346
|
+
obs_field=obs_field,
|
1347
|
+
uns_field=uns_field,
|
1348
|
+
mute=mute,
|
1349
|
+
organism=organism,
|
1350
|
+
)
|
1351
|
+
|
1352
|
+
# link feature sets
|
1353
|
+
self._host._staged_feature_sets = feature_sets
|
1354
|
+
self._host.save()
|
1355
|
+
|
1356
|
+
|
1357
|
+
def _add_set_from_mudata(
|
1358
|
+
self,
|
1359
|
+
var_fields: dict[str, FieldAttr] | None = None,
|
1360
|
+
obs_fields: dict[str, FieldAttr] | None = None,
|
1361
|
+
mute: bool = False,
|
1362
|
+
organism: str | Record | None = None,
|
1363
|
+
):
|
1364
|
+
"""Add features from MuData."""
|
1365
|
+
if obs_fields is None:
|
1366
|
+
obs_fields = {}
|
1367
|
+
assert self._host.otype == "MuData" # noqa: S101
|
1368
|
+
|
1369
|
+
# parse and register features
|
1370
|
+
mdata = self._host.load(is_run_input=False)
|
1371
|
+
feature_sets = {}
|
1372
|
+
|
1373
|
+
obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
|
1374
|
+
if len(obs_features) > 0:
|
1375
|
+
feature_sets["obs"] = Schema(features=obs_features)
|
1376
|
+
for modality, field in var_fields.items():
|
1377
|
+
modality_fs = parse_staged_feature_sets_from_anndata(
|
1378
|
+
mdata[modality],
|
1379
|
+
var_field=field,
|
1380
|
+
obs_field=obs_fields.get(modality, Feature.name),
|
1381
|
+
mute=mute,
|
1382
|
+
organism=organism,
|
1383
|
+
)
|
1384
|
+
for k, v in modality_fs.items():
|
1385
|
+
feature_sets[f"['{modality}'].{k}"] = v
|
1386
|
+
|
1387
|
+
# link feature sets
|
1388
|
+
self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
|
1389
|
+
self._host.save()
|
1390
|
+
|
1391
|
+
|
1392
|
+
def _add_set_from_spatialdata(
|
1393
|
+
self,
|
1394
|
+
sample_metadata_key: str,
|
1395
|
+
sample_metadata_field: FieldAttr = Feature.name,
|
1396
|
+
var_fields: dict[str, FieldAttr] | None = None,
|
1397
|
+
obs_fields: dict[str, FieldAttr] | None = None,
|
1398
|
+
mute: bool = False,
|
1399
|
+
organism: str | Record | None = None,
|
1400
|
+
):
|
1401
|
+
"""Add features from SpatialData."""
|
1402
|
+
obs_fields, var_fields = obs_fields or {}, var_fields or {}
|
1403
|
+
assert self._host.otype == "SpatialData" # noqa: S101
|
1404
|
+
|
1405
|
+
# parse and register features
|
1406
|
+
sdata = self._host.load(is_run_input=False)
|
1407
|
+
feature_sets = {}
|
1408
|
+
|
1409
|
+
# sample features
|
1410
|
+
sample_features = Feature.from_values(
|
1411
|
+
sdata.get_attrs(key=sample_metadata_key, return_as="df", flatten=True).columns,
|
1412
|
+
field=sample_metadata_field,
|
1413
|
+
) # type: ignore
|
1414
|
+
if len(sample_features) > 0:
|
1415
|
+
feature_sets[sample_metadata_key] = Schema(features=sample_features)
|
1416
|
+
|
1417
|
+
# table features
|
1418
|
+
for table, field in var_fields.items():
|
1419
|
+
table_fs = parse_staged_feature_sets_from_anndata(
|
1420
|
+
sdata[table],
|
1421
|
+
var_field=field,
|
1422
|
+
obs_field=obs_fields.get(table, Feature.name),
|
1423
|
+
mute=mute,
|
1424
|
+
organism=organism,
|
1425
|
+
)
|
1426
|
+
for k, v in table_fs.items():
|
1427
|
+
feature_sets[f"['{table}'].{k}"] = v
|
1428
|
+
|
1429
|
+
# link feature sets
|
1430
|
+
self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
|
1431
|
+
self._host.save()
|
1432
|
+
|
1433
|
+
|
1343
1434
|
# mypy: ignore-errors
|
1344
1435
|
FeatureManager.__init__ = __init__
|
1345
1436
|
ParamManager.__init__ = __init__
|
1346
1437
|
FeatureManager.__repr__ = __repr__
|
1347
1438
|
ParamManager.__repr__ = __repr__
|
1439
|
+
FeatureManager.describe = describe
|
1440
|
+
ParamManager.describe = describe
|
1348
1441
|
FeatureManager.__getitem__ = __getitem__
|
1349
1442
|
FeatureManager.get_values = get_values
|
1350
1443
|
FeatureManager.slots = slots
|
1351
1444
|
FeatureManager.add_values = add_values_features
|
1352
1445
|
FeatureManager._add_schema = _add_schema
|
1353
|
-
FeatureManager.add_schema = add_schema # deprecated
|
1354
|
-
FeatureManager.add_feature_set = add_feature_set # deprecated
|
1355
|
-
FeatureManager._schema_by_slot = _schema_by_slot # deprecated
|
1356
|
-
FeatureManager._feature_set_by_slot = _feature_set_by_slot # deprecated
|
1357
1446
|
FeatureManager._accessor_by_registry = _accessor_by_registry
|
1358
|
-
FeatureManager._add_set_from_df = _add_set_from_df
|
1359
|
-
FeatureManager._add_set_from_anndata = _add_set_from_anndata
|
1360
|
-
FeatureManager._add_set_from_mudata = _add_set_from_mudata
|
1361
|
-
FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
|
1362
1447
|
FeatureManager._add_from = _add_from
|
1363
1448
|
FeatureManager.filter = filter
|
1364
1449
|
FeatureManager.get = get
|
@@ -1367,3 +1452,13 @@ FeatureManager.remove_values = remove_values
|
|
1367
1452
|
ParamManager.add_values = add_values_params
|
1368
1453
|
ParamManager.get_values = get_values
|
1369
1454
|
ParamManager.filter = filter
|
1455
|
+
|
1456
|
+
# deprecated
|
1457
|
+
FeatureManager._add_set_from_df = _add_set_from_df
|
1458
|
+
FeatureManager._add_set_from_anndata = _add_set_from_anndata
|
1459
|
+
FeatureManager._add_set_from_mudata = _add_set_from_mudata
|
1460
|
+
FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
|
1461
|
+
FeatureManager.add_schema = add_schema
|
1462
|
+
FeatureManager.add_feature_set = add_feature_set
|
1463
|
+
FeatureManager._schema_by_slot = _schema_by_slot
|
1464
|
+
FeatureManager._feature_set_by_slot = _feature_set_by_slot
|