lamindb 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +52 -36
- lamindb/_finish.py +17 -10
- lamindb/_tracked.py +1 -1
- lamindb/base/__init__.py +3 -1
- lamindb/base/fields.py +40 -22
- lamindb/base/ids.py +1 -94
- lamindb/base/types.py +2 -0
- lamindb/base/uids.py +117 -0
- lamindb/core/_context.py +177 -89
- lamindb/core/_settings.py +38 -25
- lamindb/core/datasets/__init__.py +11 -4
- lamindb/core/datasets/_core.py +5 -5
- lamindb/core/datasets/_small.py +0 -93
- lamindb/core/datasets/mini_immuno.py +172 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_backed_access.py +100 -6
- lamindb/core/storage/_polars_lazy_df.py +51 -0
- lamindb/core/storage/_pyarrow_dataset.py +15 -30
- lamindb/core/storage/objects.py +6 -0
- lamindb/core/subsettings/__init__.py +2 -0
- lamindb/core/subsettings/_annotation_settings.py +11 -0
- lamindb/curators/__init__.py +7 -3349
- lamindb/curators/_legacy.py +2056 -0
- lamindb/curators/core.py +1546 -0
- lamindb/errors.py +11 -0
- lamindb/examples/__init__.py +27 -0
- lamindb/examples/schemas/__init__.py +12 -0
- lamindb/examples/schemas/_anndata.py +25 -0
- lamindb/examples/schemas/_simple.py +19 -0
- lamindb/integrations/_vitessce.py +8 -5
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
- lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
- lamindb/models/__init__.py +4 -1
- lamindb/models/_describe.py +21 -4
- lamindb/models/_feature_manager.py +365 -286
- lamindb/models/_label_manager.py +8 -2
- lamindb/models/artifact.py +173 -95
- lamindb/models/artifact_set.py +122 -0
- lamindb/models/collection.py +73 -52
- lamindb/models/core.py +1 -1
- lamindb/models/feature.py +51 -17
- lamindb/models/has_parents.py +2 -2
- lamindb/models/project.py +1 -1
- lamindb/models/query_manager.py +221 -22
- lamindb/models/query_set.py +245 -171
- lamindb/models/record.py +62 -243
- lamindb/models/run.py +4 -4
- lamindb/models/save.py +8 -2
- lamindb/models/schema.py +458 -181
- lamindb/models/transform.py +2 -2
- lamindb/models/ulabel.py +8 -5
- {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/METADATA +6 -6
- {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/RECORD +55 -42
- {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
- {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0
@@ -13,7 +13,7 @@ import pandas as pd
|
|
13
13
|
from anndata import AnnData
|
14
14
|
from django.contrib.postgres.aggregates import ArrayAgg
|
15
15
|
from django.db import connections
|
16
|
-
from django.db.models import Aggregate
|
16
|
+
from django.db.models import Aggregate, ProtectedError, Subquery
|
17
17
|
from lamin_utils import logger
|
18
18
|
from lamindb_setup.core.hashing import hash_set
|
19
19
|
from lamindb_setup.core.upath import create_path
|
@@ -42,7 +42,7 @@ from ._describe import (
|
|
42
42
|
TYPE_WIDTH,
|
43
43
|
VALUES_WIDTH,
|
44
44
|
describe_header,
|
45
|
-
|
45
|
+
format_rich_tree,
|
46
46
|
)
|
47
47
|
from ._django import get_artifact_with_related
|
48
48
|
from ._label_manager import _get_labels, describe_labels
|
@@ -320,15 +320,27 @@ def describe_features(
|
|
320
320
|
schema_data[slot] = (schema, feature_names)
|
321
321
|
for feature_name in feature_names:
|
322
322
|
feature_data[feature_name] = (slot, registry_str)
|
323
|
+
schema_data.update(
|
324
|
+
{
|
325
|
+
slot: (schema, schema.n)
|
326
|
+
for slot, schema in get_schema_by_slot_(self).items()
|
327
|
+
if slot not in schema_data
|
328
|
+
}
|
329
|
+
)
|
323
330
|
else:
|
324
331
|
for slot, schema in get_schema_by_slot_(self).items():
|
325
332
|
features = schema.members
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
333
|
+
if features.exists():
|
334
|
+
# features.first() is a lot slower than features[0] here
|
335
|
+
name_field = get_name_field(features[0])
|
336
|
+
feature_names = list(
|
337
|
+
features.values_list(name_field, flat=True)[:20]
|
338
|
+
)
|
339
|
+
schema_data[slot] = (schema, feature_names)
|
340
|
+
for feature_name in feature_names:
|
341
|
+
feature_data[feature_name] = (slot, schema.itype)
|
342
|
+
else:
|
343
|
+
schema_data[slot] = (schema, schema.n)
|
332
344
|
|
333
345
|
internal_feature_names: dict[str, str] = {}
|
334
346
|
if isinstance(self, Artifact):
|
@@ -400,38 +412,44 @@ def describe_features(
|
|
400
412
|
internal_feature_labels_slot.setdefault(slot, []).append(feature_row)
|
401
413
|
|
402
414
|
int_features_tree_children = []
|
403
|
-
for slot, (schema,
|
404
|
-
if
|
405
|
-
|
406
|
-
feature_rows = internal_feature_labels_slot[slot]
|
407
|
-
# add internal Feature features without labels
|
408
|
-
feature_rows += [
|
409
|
-
(
|
410
|
-
feature_name,
|
411
|
-
Text(str(internal_feature_names.get(feature_name)), style="dim"),
|
412
|
-
"",
|
413
|
-
)
|
414
|
-
for feature_name in feature_names
|
415
|
-
if feature_name and feature_name not in internal_feature_labels
|
416
|
-
]
|
415
|
+
for slot, (schema, feature_names_or_n) in schema_data.items():
|
416
|
+
if isinstance(feature_names_or_n, int):
|
417
|
+
feature_rows = []
|
417
418
|
else:
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
419
|
+
feature_names = feature_names_or_n
|
420
|
+
if slot in internal_feature_labels_slot:
|
421
|
+
# add internal Feature features with labels
|
422
|
+
feature_rows = internal_feature_labels_slot[slot]
|
423
|
+
# add internal Feature features without labels
|
424
|
+
feature_rows += [
|
425
|
+
(
|
426
|
+
feature_name,
|
427
|
+
Text(
|
428
|
+
str(internal_feature_names.get(feature_name)), style="dim"
|
427
429
|
),
|
428
|
-
|
429
|
-
)
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
430
|
+
"",
|
431
|
+
)
|
432
|
+
for feature_name in feature_names
|
433
|
+
if feature_name and feature_name not in internal_feature_labels
|
434
|
+
]
|
435
|
+
else:
|
436
|
+
# add internal non-Feature features without labels
|
437
|
+
feature_rows = [
|
438
|
+
(
|
439
|
+
feature_name,
|
440
|
+
Text(
|
441
|
+
str(
|
442
|
+
internal_feature_names.get(feature_name)
|
443
|
+
if feature_name in internal_feature_names
|
444
|
+
else schema.dtype
|
445
|
+
),
|
446
|
+
style="dim",
|
447
|
+
),
|
448
|
+
"",
|
449
|
+
)
|
450
|
+
for feature_name in feature_names
|
451
|
+
if feature_name
|
452
|
+
]
|
435
453
|
int_features_tree_children.append(
|
436
454
|
_create_feature_table(
|
437
455
|
Text.assemble(
|
@@ -482,59 +500,6 @@ def describe_features(
|
|
482
500
|
return tree
|
483
501
|
|
484
502
|
|
485
|
-
def parse_staged_feature_sets_from_anndata(
|
486
|
-
adata: AnnData,
|
487
|
-
var_field: FieldAttr | None = None,
|
488
|
-
obs_field: FieldAttr = Feature.name,
|
489
|
-
uns_field: FieldAttr | None = None,
|
490
|
-
mute: bool = False,
|
491
|
-
organism: str | Record | None = None,
|
492
|
-
) -> dict:
|
493
|
-
data_parse = adata
|
494
|
-
if not isinstance(adata, AnnData): # is a path
|
495
|
-
filepath = create_path(adata) # returns Path for local
|
496
|
-
if not isinstance(filepath, LocalPathClasses):
|
497
|
-
from lamindb import settings
|
498
|
-
from lamindb.core.storage._backed_access import backed_access
|
499
|
-
|
500
|
-
using_key = settings._using_key
|
501
|
-
data_parse = backed_access(filepath, using_key=using_key)
|
502
|
-
else:
|
503
|
-
data_parse = ad.read_h5ad(filepath, backed="r")
|
504
|
-
type = "float"
|
505
|
-
else:
|
506
|
-
type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
|
507
|
-
feature_sets = {}
|
508
|
-
if var_field is not None:
|
509
|
-
schema_var = Schema.from_values(
|
510
|
-
data_parse.var.index,
|
511
|
-
var_field,
|
512
|
-
type=type,
|
513
|
-
mute=mute,
|
514
|
-
organism=organism,
|
515
|
-
raise_validation_error=False,
|
516
|
-
)
|
517
|
-
if schema_var is not None:
|
518
|
-
feature_sets["var"] = schema_var
|
519
|
-
if obs_field is not None and len(data_parse.obs.columns) > 0:
|
520
|
-
schema_obs = Schema.from_df(
|
521
|
-
df=data_parse.obs,
|
522
|
-
field=obs_field,
|
523
|
-
mute=mute,
|
524
|
-
organism=organism,
|
525
|
-
)
|
526
|
-
if schema_obs is not None:
|
527
|
-
feature_sets["obs"] = schema_obs
|
528
|
-
if uns_field is not None and len(data_parse.uns) > 0:
|
529
|
-
validated_features = Feature.from_values( # type: ignore
|
530
|
-
data_parse.uns.keys(), field=uns_field, organism=organism
|
531
|
-
)
|
532
|
-
if len(validated_features) > 0:
|
533
|
-
schema_uns = Schema(validated_features, dtype=None, otype="dict")
|
534
|
-
feature_sets["uns"] = schema_uns
|
535
|
-
return feature_sets
|
536
|
-
|
537
|
-
|
538
503
|
def is_valid_datetime_str(date_string: str) -> bool | str:
|
539
504
|
try:
|
540
505
|
dt = datetime.fromisoformat(date_string)
|
@@ -625,8 +590,12 @@ def __init__(self, host: Artifact | Collection | Run):
|
|
625
590
|
|
626
591
|
|
627
592
|
def __repr__(self) -> str:
|
593
|
+
return describe(self, return_str=True) # type: ignore
|
594
|
+
|
595
|
+
|
596
|
+
def describe(self, return_str: bool = False) -> str | None:
|
628
597
|
tree = describe_features(self._host, print_params=(self.__class__ == ParamManager)) # type: ignore
|
629
|
-
return
|
598
|
+
return format_rich_tree(tree, fallback="no linked features", return_str=return_str)
|
630
599
|
|
631
600
|
|
632
601
|
def get_values(self) -> dict[str, Any]:
|
@@ -676,50 +645,70 @@ def filter_base(cls, _skip_validation: bool = True, **expression) -> QuerySet:
|
|
676
645
|
comparator = f"__{split_key[1]}"
|
677
646
|
feature = features.get(name=normalized_key)
|
678
647
|
if not feature.dtype.startswith("cat"):
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
# we need the comparator here because users might query like so
|
690
|
-
# ln.Artifact.features.filter(experiment__contains="Experi")
|
691
|
-
expression = {f"name{comparator}": value}
|
692
|
-
labels = ULabel.filter(**expression).all()
|
693
|
-
if len(labels) == 0:
|
694
|
-
raise DoesNotExist(
|
695
|
-
f"Did not find a ULabel matching `name{comparator}={value}`"
|
648
|
+
if comparator == "__isnull":
|
649
|
+
if cls == FeatureManager:
|
650
|
+
from .artifact import ArtifactFeatureValue
|
651
|
+
|
652
|
+
return Artifact.objects.exclude(
|
653
|
+
id__in=Subquery(
|
654
|
+
ArtifactFeatureValue.objects.filter(
|
655
|
+
featurevalue__feature=feature
|
656
|
+
).values("artifact_id")
|
657
|
+
)
|
696
658
|
)
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
)
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
# simplified query if we have exactly one label
|
710
|
-
new_expression[
|
711
|
-
f"{accessor_name}__{label_registry.__name__.lower()}"
|
712
|
-
] = label
|
659
|
+
if comparator in {"__startswith", "__contains"}:
|
660
|
+
logger.important(
|
661
|
+
f"currently not supporting `{comparator}`, using `__icontains` instead"
|
662
|
+
)
|
663
|
+
comparator = "__icontains"
|
664
|
+
expression = {feature_param: feature, f"value{comparator}": value}
|
665
|
+
feature_values = value_model.filter(**expression)
|
666
|
+
new_expression[f"_{feature_param}_values__id__in"] = feature_values
|
667
|
+
elif isinstance(value, (str, Record, bool)):
|
668
|
+
if comparator == "__isnull":
|
669
|
+
if cls == FeatureManager:
|
670
|
+
return Artifact.objects.exclude(links_ulabel__feature=feature)
|
713
671
|
else:
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
|
672
|
+
# because SQL is sensitive to whether querying with __in or not
|
673
|
+
# and might return multiple equivalent records for the latter
|
674
|
+
# we distinguish cases in which we have multiple label matches vs. one
|
675
|
+
label = None
|
676
|
+
labels = None
|
677
|
+
if isinstance(value, str):
|
678
|
+
# we need the comparator here because users might query like so
|
679
|
+
# ln.Artifact.filter(experiment__contains="Experi")
|
680
|
+
expression = {f"name{comparator}": value}
|
681
|
+
labels = ULabel.filter(**expression).all()
|
682
|
+
if len(labels) == 0:
|
683
|
+
raise DoesNotExist(
|
684
|
+
f"Did not find a ULabel matching `name{comparator}={value}`"
|
685
|
+
)
|
686
|
+
elif len(labels) == 1:
|
687
|
+
label = labels[0]
|
688
|
+
elif isinstance(value, Record):
|
689
|
+
label = value
|
690
|
+
label_registry = (
|
691
|
+
label.__class__ if label is not None else labels[0].__class__
|
692
|
+
)
|
693
|
+
accessor_name = (
|
694
|
+
label_registry.artifacts.through.artifact.field._related_name
|
695
|
+
)
|
696
|
+
new_expression[f"{accessor_name}__feature"] = feature
|
697
|
+
if label is not None:
|
698
|
+
# simplified query if we have exactly one label
|
699
|
+
new_expression[
|
700
|
+
f"{accessor_name}__{label_registry.__name__.lower()}"
|
701
|
+
] = label
|
702
|
+
else:
|
703
|
+
new_expression[
|
704
|
+
f"{accessor_name}__{label_registry.__name__.lower()}__in"
|
705
|
+
] = labels
|
718
706
|
# if passing a list of records, we want to
|
719
707
|
# find artifacts that are annotated by all of them at the same
|
720
708
|
# time; hence, we don't want the __in construct that we use to match strings
|
721
709
|
# https://laminlabs.slack.com/archives/C04FPE8V01W/p1688328084810609
|
722
|
-
|
710
|
+
if not (new_expression):
|
711
|
+
raise NotImplementedError
|
723
712
|
if cls == FeatureManager or cls == ParamManagerArtifact:
|
724
713
|
return Artifact.objects.filter(**new_expression)
|
725
714
|
elif cls == ParamManagerRun:
|
@@ -821,15 +810,14 @@ def _add_values(
|
|
821
810
|
from .artifact import Artifact
|
822
811
|
|
823
812
|
# rename to distinguish from the values inside the dict
|
824
|
-
|
825
|
-
keys =
|
813
|
+
dictionary = values
|
814
|
+
keys = dictionary.keys()
|
826
815
|
if isinstance(keys, DICT_KEYS_TYPE):
|
827
816
|
keys = list(keys) # type: ignore
|
828
817
|
# deal with other cases later
|
829
818
|
assert all(isinstance(key, str) for key in keys) # noqa: S101
|
830
819
|
registry = feature_param_field.field.model
|
831
820
|
is_param = registry == Param
|
832
|
-
model = Param if is_param else Feature
|
833
821
|
value_model = ParamValue if is_param else FeatureValue
|
834
822
|
model_name = "Param" if is_param else "Feature"
|
835
823
|
if is_param:
|
@@ -842,13 +830,11 @@ def _add_values(
|
|
842
830
|
raise ValidationError(
|
843
831
|
"Can only set features for dataset-like artifacts."
|
844
832
|
)
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
if validated.sum() != len(keys):
|
849
|
-
not_validated_keys = keys_array[~validated]
|
833
|
+
records = registry.from_values(keys, field=feature_param_field, mute=True)
|
834
|
+
if len(records) != len(keys):
|
835
|
+
not_validated_keys = [key for key in keys if key not in records.list("name")]
|
850
836
|
not_validated_keys_dtype_message = [
|
851
|
-
(key, infer_feature_type_convert_json(key,
|
837
|
+
(key, infer_feature_type_convert_json(key, dictionary[key]))
|
852
838
|
for key in not_validated_keys
|
853
839
|
]
|
854
840
|
run = get_current_tracked_run()
|
@@ -866,7 +852,7 @@ def _add_values(
|
|
866
852
|
]
|
867
853
|
hint = "\n".join(elements)
|
868
854
|
msg = (
|
869
|
-
f"These keys could not be validated: {not_validated_keys
|
855
|
+
f"These keys could not be validated: {not_validated_keys}\n"
|
870
856
|
f"Here is how to create a {model_name.lower()}:\n\n{hint}"
|
871
857
|
)
|
872
858
|
raise ValidationError(msg)
|
@@ -875,10 +861,10 @@ def _add_values(
|
|
875
861
|
features_labels = defaultdict(list)
|
876
862
|
_feature_values = []
|
877
863
|
not_validated_values = []
|
878
|
-
for
|
879
|
-
|
864
|
+
for feature in records:
|
865
|
+
value = dictionary[feature.name]
|
880
866
|
inferred_type, converted_value, _ = infer_feature_type_convert_json(
|
881
|
-
|
867
|
+
feature.name,
|
882
868
|
value,
|
883
869
|
mute=True,
|
884
870
|
str_as_ulabel=str_as_ulabel,
|
@@ -886,25 +872,23 @@ def _add_values(
|
|
886
872
|
if feature.dtype == "num":
|
887
873
|
if inferred_type not in {"int", "float"}:
|
888
874
|
raise TypeError(
|
889
|
-
f"Value for feature '{
|
875
|
+
f"Value for feature '{feature.name}' with type {feature.dtype} must be a number"
|
890
876
|
)
|
891
877
|
elif feature.dtype.startswith("cat"):
|
892
878
|
if inferred_type != "?":
|
893
879
|
if not (inferred_type.startswith("cat") or isinstance(value, Record)):
|
894
880
|
raise TypeError(
|
895
|
-
f"Value for feature '{
|
881
|
+
f"Value for feature '{feature.name}' with type '{feature.dtype}' must be a string or record."
|
896
882
|
)
|
897
883
|
elif (feature.dtype == "str" and feature.dtype not in inferred_type) or (
|
898
884
|
feature.dtype != "str" and feature.dtype != inferred_type
|
899
885
|
):
|
900
886
|
raise ValidationError(
|
901
|
-
f"Expected dtype for '{
|
887
|
+
f"Expected dtype for '{feature.name}' is '{feature.dtype}', got '{inferred_type}'"
|
902
888
|
)
|
903
889
|
if not feature.dtype.startswith("cat"):
|
904
890
|
filter_kwargs = {model_name.lower(): feature, "value": converted_value}
|
905
|
-
feature_value = value_model.
|
906
|
-
if feature_value is None:
|
907
|
-
feature_value = value_model(**filter_kwargs)
|
891
|
+
feature_value, _ = value_model.get_or_create(**filter_kwargs)
|
908
892
|
_feature_values.append(feature_value)
|
909
893
|
else:
|
910
894
|
if isinstance(value, Record) or (
|
@@ -942,30 +926,63 @@ def _add_values(
|
|
942
926
|
(feature, label_record) for label_record in label_records
|
943
927
|
]
|
944
928
|
if not_validated_values:
|
945
|
-
|
946
|
-
|
947
|
-
f" ln.save(ulabels)"
|
948
|
-
)
|
929
|
+
not_validated_values.sort()
|
930
|
+
hint = f" ulabels = ln.ULabel.from_values({not_validated_values}, create=True).save()\n"
|
949
931
|
msg = (
|
950
932
|
f"These values could not be validated: {not_validated_values}\n"
|
951
933
|
f"Here is how to create ulabels for them:\n\n{hint}"
|
952
934
|
)
|
953
935
|
raise ValidationError(msg)
|
936
|
+
# TODO: create an explicit version of this
|
937
|
+
# if not is_param:
|
938
|
+
# # check if _expect_many is false for _all_ records
|
939
|
+
# if any(record._expect_many for record in records):
|
940
|
+
# updated_features = []
|
941
|
+
# for record in records:
|
942
|
+
# if record._expect_many:
|
943
|
+
# record._expect_many = False
|
944
|
+
# record.save()
|
945
|
+
# updated_features.append(record.name)
|
946
|
+
# if any(updated_features):
|
947
|
+
# logger.important(
|
948
|
+
# f"changed observational unit to Artifact for features: {', '.join(updated_features)}"
|
949
|
+
# )
|
954
950
|
# bulk add all links
|
955
951
|
if features_labels:
|
956
952
|
add_label_feature_links(self, features_labels)
|
957
953
|
if _feature_values:
|
958
|
-
|
954
|
+
to_insert_feature_values = [
|
955
|
+
record for record in _feature_values if record._state.adding
|
956
|
+
]
|
957
|
+
if to_insert_feature_values:
|
958
|
+
save(to_insert_feature_values)
|
959
|
+
dict_typed_features = [
|
960
|
+
getattr(record, model_name.lower())
|
961
|
+
for record in _feature_values
|
962
|
+
if getattr(record, model_name.lower()).dtype == "dict"
|
963
|
+
]
|
959
964
|
if is_param:
|
960
965
|
LinkORM = self._host._param_values.through
|
961
966
|
valuefield_id = "paramvalue_id"
|
962
967
|
else:
|
963
968
|
LinkORM = self._host._feature_values.through
|
964
969
|
valuefield_id = "featurevalue_id"
|
970
|
+
host_class_lower = self._host.__class__.__get_name_with_module__().lower()
|
971
|
+
if dict_typed_features:
|
972
|
+
# delete all previously existing anotations with dictionaries
|
973
|
+
kwargs = {
|
974
|
+
f"links_{host_class_lower}__{host_class_lower}_id": self._host.id,
|
975
|
+
f"{model_name.lower()}__in": dict_typed_features,
|
976
|
+
}
|
977
|
+
try:
|
978
|
+
value_model.filter(**kwargs).all().delete()
|
979
|
+
except ProtectedError:
|
980
|
+
pass
|
981
|
+
# add new feature links
|
965
982
|
links = [
|
966
983
|
LinkORM(
|
967
984
|
**{
|
968
|
-
f"{
|
985
|
+
f"{host_class_lower}_id": self._host.id,
|
969
986
|
valuefield_id: feature_value.id,
|
970
987
|
}
|
971
988
|
)
|
@@ -1092,52 +1109,6 @@ def _add_schema(self, schema: Schema, slot: str) -> None:
|
|
1092
1109
|
self._slots[slot] = schema # type: ignore
|
1093
1110
|
|
1094
1111
|
|
1095
|
-
def _add_set_from_df(
|
1096
|
-
self,
|
1097
|
-
field: FieldAttr = Feature.name,
|
1098
|
-
organism: str | None = None,
|
1099
|
-
mute: bool = False,
|
1100
|
-
):
|
1101
|
-
"""Add feature set corresponding to column names of DataFrame."""
|
1102
|
-
assert self._host.otype == "DataFrame" # noqa: S101
|
1103
|
-
df = self._host.load(is_run_input=False)
|
1104
|
-
schema = Schema.from_df(
|
1105
|
-
df=df,
|
1106
|
-
field=field,
|
1107
|
-
mute=mute,
|
1108
|
-
organism=organism,
|
1109
|
-
)
|
1110
|
-
self._host._staged_feature_sets = {"columns": schema}
|
1111
|
-
self._host.save()
|
1112
|
-
|
1113
|
-
|
1114
|
-
def _add_set_from_anndata(
|
1115
|
-
self,
|
1116
|
-
var_field: FieldAttr | None = None,
|
1117
|
-
obs_field: FieldAttr | None = Feature.name,
|
1118
|
-
uns_field: FieldAttr | None = None,
|
1119
|
-
mute: bool = False,
|
1120
|
-
organism: str | Record | None = None,
|
1121
|
-
):
|
1122
|
-
"""Add features from AnnData."""
|
1123
|
-
assert self._host.otype == "AnnData" # noqa: S101
|
1124
|
-
|
1125
|
-
# parse and register features
|
1126
|
-
adata = self._host.load(is_run_input=False)
|
1127
|
-
feature_sets = parse_staged_feature_sets_from_anndata(
|
1128
|
-
adata,
|
1129
|
-
var_field=var_field,
|
1130
|
-
obs_field=obs_field,
|
1131
|
-
uns_field=uns_field,
|
1132
|
-
mute=mute,
|
1133
|
-
organism=organism,
|
1134
|
-
)
|
1135
|
-
|
1136
|
-
# link feature sets
|
1137
|
-
self._host._staged_feature_sets = feature_sets
|
1138
|
-
self._host.save()
|
1139
|
-
|
1140
|
-
|
1141
1112
|
def _unify_staged_feature_sets_by_hash(
|
1142
1113
|
feature_sets: MutableMapping[str, Schema],
|
1143
1114
|
):
|
@@ -1153,83 +1124,6 @@ def _unify_staged_feature_sets_by_hash(
|
|
1153
1124
|
return feature_sets
|
1154
1125
|
|
1155
1126
|
|
1156
|
-
def _add_set_from_mudata(
|
1157
|
-
self,
|
1158
|
-
var_fields: dict[str, FieldAttr] | None = None,
|
1159
|
-
obs_fields: dict[str, FieldAttr] | None = None,
|
1160
|
-
mute: bool = False,
|
1161
|
-
organism: str | Record | None = None,
|
1162
|
-
):
|
1163
|
-
"""Add features from MuData."""
|
1164
|
-
if obs_fields is None:
|
1165
|
-
obs_fields = {}
|
1166
|
-
assert self._host.otype == "MuData" # noqa: S101
|
1167
|
-
|
1168
|
-
# parse and register features
|
1169
|
-
mdata = self._host.load(is_run_input=False)
|
1170
|
-
feature_sets = {}
|
1171
|
-
|
1172
|
-
obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
|
1173
|
-
if len(obs_features) > 0:
|
1174
|
-
feature_sets["obs"] = Schema(features=obs_features)
|
1175
|
-
for modality, field in var_fields.items():
|
1176
|
-
modality_fs = parse_staged_feature_sets_from_anndata(
|
1177
|
-
mdata[modality],
|
1178
|
-
var_field=field,
|
1179
|
-
obs_field=obs_fields.get(modality, Feature.name),
|
1180
|
-
mute=mute,
|
1181
|
-
organism=organism,
|
1182
|
-
)
|
1183
|
-
for k, v in modality_fs.items():
|
1184
|
-
feature_sets[f"['{modality}'].{k}"] = v
|
1185
|
-
|
1186
|
-
# link feature sets
|
1187
|
-
self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
|
1188
|
-
self._host.save()
|
1189
|
-
|
1190
|
-
|
1191
|
-
def _add_set_from_spatialdata(
|
1192
|
-
self,
|
1193
|
-
sample_metadata_key: str,
|
1194
|
-
sample_metadata_field: FieldAttr = Feature.name,
|
1195
|
-
var_fields: dict[str, FieldAttr] | None = None,
|
1196
|
-
obs_fields: dict[str, FieldAttr] | None = None,
|
1197
|
-
mute: bool = False,
|
1198
|
-
organism: str | Record | None = None,
|
1199
|
-
):
|
1200
|
-
"""Add features from SpatialData."""
|
1201
|
-
obs_fields, var_fields = obs_fields or {}, var_fields or {}
|
1202
|
-
assert self._host.otype == "SpatialData" # noqa: S101
|
1203
|
-
|
1204
|
-
# parse and register features
|
1205
|
-
sdata = self._host.load(is_run_input=False)
|
1206
|
-
feature_sets = {}
|
1207
|
-
|
1208
|
-
# sample features
|
1209
|
-
sample_features = Feature.from_values(
|
1210
|
-
sdata.get_attrs(key=sample_metadata_key, return_as="df", flatten=True).columns,
|
1211
|
-
field=sample_metadata_field,
|
1212
|
-
) # type: ignore
|
1213
|
-
if len(sample_features) > 0:
|
1214
|
-
feature_sets[sample_metadata_key] = Schema(features=sample_features)
|
1215
|
-
|
1216
|
-
# table features
|
1217
|
-
for table, field in var_fields.items():
|
1218
|
-
table_fs = parse_staged_feature_sets_from_anndata(
|
1219
|
-
sdata[table],
|
1220
|
-
var_field=field,
|
1221
|
-
obs_field=obs_fields.get(table, Feature.name),
|
1222
|
-
mute=mute,
|
1223
|
-
organism=organism,
|
1224
|
-
)
|
1225
|
-
for k, v in table_fs.items():
|
1226
|
-
feature_sets[f"['{table}'].{k}"] = v
|
1227
|
-
|
1228
|
-
# link feature sets
|
1229
|
-
self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
|
1230
|
-
self._host.save()
|
1231
|
-
|
1232
|
-
|
1233
1127
|
def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
|
1234
1128
|
"""Transfer features from a artifact or collection."""
|
1235
1129
|
# This only covers feature sets
|
@@ -1340,25 +1234,200 @@ def _feature_set_by_slot(self):
|
|
1340
1234
|
return self.slots
|
1341
1235
|
|
1342
1236
|
|
1237
|
+
# deprecated: feature set parsing
|
1238
|
+
|
1239
|
+
|
1240
|
+
def parse_staged_feature_sets_from_anndata(
|
1241
|
+
adata: AnnData,
|
1242
|
+
var_field: FieldAttr | None = None,
|
1243
|
+
obs_field: FieldAttr = Feature.name,
|
1244
|
+
uns_field: FieldAttr | None = None,
|
1245
|
+
mute: bool = False,
|
1246
|
+
organism: str | Record | None = None,
|
1247
|
+
) -> dict:
|
1248
|
+
data_parse = adata
|
1249
|
+
if not isinstance(adata, AnnData): # is a path
|
1250
|
+
filepath = create_path(adata) # returns Path for local
|
1251
|
+
if not isinstance(filepath, LocalPathClasses):
|
1252
|
+
from lamindb import settings
|
1253
|
+
from lamindb.core.storage._backed_access import backed_access
|
1254
|
+
|
1255
|
+
using_key = settings._using_key
|
1256
|
+
data_parse = backed_access(filepath, using_key=using_key)
|
1257
|
+
else:
|
1258
|
+
data_parse = ad.read_h5ad(filepath, backed="r")
|
1259
|
+
type = "float"
|
1260
|
+
else:
|
1261
|
+
type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
|
1262
|
+
feature_sets = {}
|
1263
|
+
if var_field is not None:
|
1264
|
+
schema_var = Schema.from_values(
|
1265
|
+
data_parse.var.index,
|
1266
|
+
var_field,
|
1267
|
+
type=type,
|
1268
|
+
mute=mute,
|
1269
|
+
organism=organism,
|
1270
|
+
raise_validation_error=False,
|
1271
|
+
)
|
1272
|
+
if schema_var is not None:
|
1273
|
+
feature_sets["var"] = schema_var
|
1274
|
+
if obs_field is not None and len(data_parse.obs.columns) > 0:
|
1275
|
+
schema_obs = Schema.from_df(
|
1276
|
+
df=data_parse.obs,
|
1277
|
+
field=obs_field,
|
1278
|
+
mute=mute,
|
1279
|
+
organism=organism,
|
1280
|
+
)
|
1281
|
+
if schema_obs is not None:
|
1282
|
+
feature_sets["obs"] = schema_obs
|
1283
|
+
if uns_field is not None and len(data_parse.uns) > 0:
|
1284
|
+
validated_features = Feature.from_values( # type: ignore
|
1285
|
+
data_parse.uns.keys(), field=uns_field, organism=organism
|
1286
|
+
)
|
1287
|
+
if len(validated_features) > 0:
|
1288
|
+
schema_uns = Schema(validated_features, dtype=None, otype="dict")
|
1289
|
+
feature_sets["uns"] = schema_uns
|
1290
|
+
return feature_sets
|
1291
|
+
|
1292
|
+
|
1293
|
+
# no longer called from within curator
|
1294
|
+
# might deprecate in the future?
|
1295
|
+
def _add_set_from_df(
|
1296
|
+
self,
|
1297
|
+
field: FieldAttr = Feature.name,
|
1298
|
+
organism: str | None = None,
|
1299
|
+
mute: bool = False,
|
1300
|
+
):
|
1301
|
+
"""Add feature set corresponding to column names of DataFrame."""
|
1302
|
+
assert self._host.otype == "DataFrame" # noqa: S101
|
1303
|
+
df = self._host.load(is_run_input=False)
|
1304
|
+
schema = Schema.from_df(
|
1305
|
+
df=df,
|
1306
|
+
field=field,
|
1307
|
+
mute=mute,
|
1308
|
+
organism=organism,
|
1309
|
+
)
|
1310
|
+
self._host._staged_feature_sets = {"columns": schema}
|
1311
|
+
self._host.save()
|
1312
|
+
|
1313
|
+
|
1314
|
+
def _add_set_from_anndata(
|
1315
|
+
self,
|
1316
|
+
var_field: FieldAttr | None = None,
|
1317
|
+
obs_field: FieldAttr | None = Feature.name,
|
1318
|
+
uns_field: FieldAttr | None = None,
|
1319
|
+
mute: bool = False,
|
1320
|
+
organism: str | Record | None = None,
|
1321
|
+
):
|
1322
|
+
"""Add features from AnnData."""
|
1323
|
+
assert self._host.otype == "AnnData" # noqa: S101
|
1324
|
+
|
1325
|
+
# parse and register features
|
1326
|
+
adata = self._host.load(is_run_input=False)
|
1327
|
+
feature_sets = parse_staged_feature_sets_from_anndata(
|
1328
|
+
adata,
|
1329
|
+
var_field=var_field,
|
1330
|
+
obs_field=obs_field,
|
1331
|
+
uns_field=uns_field,
|
1332
|
+
mute=mute,
|
1333
|
+
organism=organism,
|
1334
|
+
)
|
1335
|
+
|
1336
|
+
# link feature sets
|
1337
|
+
self._host._staged_feature_sets = feature_sets
|
1338
|
+
self._host.save()
|
1339
|
+
|
1340
|
+
|
1341
|
+
def _add_set_from_mudata(
|
1342
|
+
self,
|
1343
|
+
var_fields: dict[str, FieldAttr] | None = None,
|
1344
|
+
obs_fields: dict[str, FieldAttr] | None = None,
|
1345
|
+
mute: bool = False,
|
1346
|
+
organism: str | Record | None = None,
|
1347
|
+
):
|
1348
|
+
"""Add features from MuData."""
|
1349
|
+
if obs_fields is None:
|
1350
|
+
obs_fields = {}
|
1351
|
+
assert self._host.otype == "MuData" # noqa: S101
|
1352
|
+
|
1353
|
+
# parse and register features
|
1354
|
+
mdata = self._host.load(is_run_input=False)
|
1355
|
+
feature_sets = {}
|
1356
|
+
|
1357
|
+
obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
|
1358
|
+
if len(obs_features) > 0:
|
1359
|
+
feature_sets["obs"] = Schema(features=obs_features)
|
1360
|
+
for modality, field in var_fields.items():
|
1361
|
+
modality_fs = parse_staged_feature_sets_from_anndata(
|
1362
|
+
mdata[modality],
|
1363
|
+
var_field=field,
|
1364
|
+
obs_field=obs_fields.get(modality, Feature.name),
|
1365
|
+
mute=mute,
|
1366
|
+
organism=organism,
|
1367
|
+
)
|
1368
|
+
for k, v in modality_fs.items():
|
1369
|
+
feature_sets[f"['{modality}'].{k}"] = v
|
1370
|
+
|
1371
|
+
# link feature sets
|
1372
|
+
self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
|
1373
|
+
self._host.save()
|
1374
|
+
|
1375
|
+
|
1376
|
+
def _add_set_from_spatialdata(
|
1377
|
+
self,
|
1378
|
+
sample_metadata_key: str,
|
1379
|
+
sample_metadata_field: FieldAttr = Feature.name,
|
1380
|
+
var_fields: dict[str, FieldAttr] | None = None,
|
1381
|
+
obs_fields: dict[str, FieldAttr] | None = None,
|
1382
|
+
mute: bool = False,
|
1383
|
+
organism: str | Record | None = None,
|
1384
|
+
):
|
1385
|
+
"""Add features from SpatialData."""
|
1386
|
+
obs_fields, var_fields = obs_fields or {}, var_fields or {}
|
1387
|
+
assert self._host.otype == "SpatialData" # noqa: S101
|
1388
|
+
|
1389
|
+
# parse and register features
|
1390
|
+
sdata = self._host.load(is_run_input=False)
|
1391
|
+
feature_sets = {}
|
1392
|
+
|
1393
|
+
# sample features
|
1394
|
+
sample_features = Feature.from_values(
|
1395
|
+
sdata.get_attrs(key=sample_metadata_key, return_as="df", flatten=True).columns,
|
1396
|
+
field=sample_metadata_field,
|
1397
|
+
) # type: ignore
|
1398
|
+
if len(sample_features) > 0:
|
1399
|
+
feature_sets[sample_metadata_key] = Schema(features=sample_features)
|
1400
|
+
|
1401
|
+
# table features
|
1402
|
+
for table, field in var_fields.items():
|
1403
|
+
table_fs = parse_staged_feature_sets_from_anndata(
|
1404
|
+
sdata[table],
|
1405
|
+
var_field=field,
|
1406
|
+
obs_field=obs_fields.get(table, Feature.name),
|
1407
|
+
mute=mute,
|
1408
|
+
organism=organism,
|
1409
|
+
)
|
1410
|
+
for k, v in table_fs.items():
|
1411
|
+
feature_sets[f"['{table}'].{k}"] = v
|
1412
|
+
|
1413
|
+
# link feature sets
|
1414
|
+
self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
|
1415
|
+
self._host.save()
|
1416
|
+
|
1417
|
+
|
1343
1418
|
# mypy: ignore-errors
|
1344
1419
|
FeatureManager.__init__ = __init__
|
1345
1420
|
ParamManager.__init__ = __init__
|
1346
1421
|
FeatureManager.__repr__ = __repr__
|
1347
1422
|
ParamManager.__repr__ = __repr__
|
1423
|
+
FeatureManager.describe = describe
|
1424
|
+
ParamManager.describe = describe
|
1348
1425
|
FeatureManager.__getitem__ = __getitem__
|
1349
1426
|
FeatureManager.get_values = get_values
|
1350
1427
|
FeatureManager.slots = slots
|
1351
1428
|
FeatureManager.add_values = add_values_features
|
1352
1429
|
FeatureManager._add_schema = _add_schema
|
1353
|
-
FeatureManager.add_schema = add_schema # deprecated
|
1354
|
-
FeatureManager.add_feature_set = add_feature_set # deprecated
|
1355
|
-
FeatureManager._schema_by_slot = _schema_by_slot # deprecated
|
1356
|
-
FeatureManager._feature_set_by_slot = _feature_set_by_slot # deprecated
|
1357
1430
|
FeatureManager._accessor_by_registry = _accessor_by_registry
|
1358
|
-
FeatureManager._add_set_from_df = _add_set_from_df
|
1359
|
-
FeatureManager._add_set_from_anndata = _add_set_from_anndata
|
1360
|
-
FeatureManager._add_set_from_mudata = _add_set_from_mudata
|
1361
|
-
FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
|
1362
1431
|
FeatureManager._add_from = _add_from
|
1363
1432
|
FeatureManager.filter = filter
|
1364
1433
|
FeatureManager.get = get
|
@@ -1367,3 +1436,13 @@ FeatureManager.remove_values = remove_values
|
|
1367
1436
|
ParamManager.add_values = add_values_params
|
1368
1437
|
ParamManager.get_values = get_values
|
1369
1438
|
ParamManager.filter = filter
|
1439
|
+
|
1440
|
+
# deprecated
|
1441
|
+
FeatureManager._add_set_from_df = _add_set_from_df
|
1442
|
+
FeatureManager._add_set_from_anndata = _add_set_from_anndata
|
1443
|
+
FeatureManager._add_set_from_mudata = _add_set_from_mudata
|
1444
|
+
FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
|
1445
|
+
FeatureManager.add_schema = add_schema
|
1446
|
+
FeatureManager.add_feature_set = add_feature_set
|
1447
|
+
FeatureManager._schema_by_slot = _schema_by_slot
|
1448
|
+
FeatureManager._feature_set_by_slot = _feature_set_by_slot
|