lamindb 1.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +52 -36
- lamindb/_finish.py +17 -10
- lamindb/_tracked.py +1 -1
- lamindb/base/__init__.py +3 -1
- lamindb/base/fields.py +40 -22
- lamindb/base/ids.py +1 -94
- lamindb/base/types.py +2 -0
- lamindb/base/uids.py +117 -0
- lamindb/core/_context.py +216 -133
- lamindb/core/_settings.py +38 -25
- lamindb/core/datasets/__init__.py +11 -4
- lamindb/core/datasets/_core.py +5 -5
- lamindb/core/datasets/_small.py +0 -93
- lamindb/core/datasets/mini_immuno.py +172 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_backed_access.py +100 -6
- lamindb/core/storage/_polars_lazy_df.py +51 -0
- lamindb/core/storage/_pyarrow_dataset.py +15 -30
- lamindb/core/storage/objects.py +6 -0
- lamindb/core/subsettings/__init__.py +2 -0
- lamindb/core/subsettings/_annotation_settings.py +11 -0
- lamindb/curators/__init__.py +7 -3559
- lamindb/curators/_legacy.py +2056 -0
- lamindb/curators/core.py +1546 -0
- lamindb/errors.py +11 -0
- lamindb/examples/__init__.py +27 -0
- lamindb/examples/schemas/__init__.py +12 -0
- lamindb/examples/schemas/_anndata.py +25 -0
- lamindb/examples/schemas/_simple.py +19 -0
- lamindb/integrations/_vitessce.py +8 -5
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
- lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
- lamindb/models/__init__.py +12 -2
- lamindb/models/_describe.py +21 -4
- lamindb/models/_feature_manager.py +384 -301
- lamindb/models/_from_values.py +1 -1
- lamindb/models/_is_versioned.py +5 -15
- lamindb/models/_label_manager.py +8 -2
- lamindb/models/artifact.py +354 -177
- lamindb/models/artifact_set.py +122 -0
- lamindb/models/can_curate.py +4 -1
- lamindb/models/collection.py +79 -56
- lamindb/models/core.py +1 -1
- lamindb/models/feature.py +78 -47
- lamindb/models/has_parents.py +24 -9
- lamindb/models/project.py +3 -3
- lamindb/models/query_manager.py +221 -22
- lamindb/models/query_set.py +251 -206
- lamindb/models/record.py +211 -344
- lamindb/models/run.py +59 -5
- lamindb/models/save.py +9 -5
- lamindb/models/schema.py +673 -196
- lamindb/models/transform.py +5 -14
- lamindb/models/ulabel.py +8 -5
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/METADATA +8 -7
- lamindb-1.5.0.dist-info/RECORD +108 -0
- lamindb-1.3.2.dist-info/RECORD +0 -95
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0
@@ -13,7 +13,7 @@ import pandas as pd
|
|
13
13
|
from anndata import AnnData
|
14
14
|
from django.contrib.postgres.aggregates import ArrayAgg
|
15
15
|
from django.db import connections
|
16
|
-
from django.db.models import Aggregate
|
16
|
+
from django.db.models import Aggregate, ProtectedError, Subquery
|
17
17
|
from lamin_utils import logger
|
18
18
|
from lamindb_setup.core.hashing import hash_set
|
19
19
|
from lamindb_setup.core.upath import create_path
|
@@ -42,7 +42,7 @@ from ._describe import (
|
|
42
42
|
TYPE_WIDTH,
|
43
43
|
VALUES_WIDTH,
|
44
44
|
describe_header,
|
45
|
-
|
45
|
+
format_rich_tree,
|
46
46
|
)
|
47
47
|
from ._django import get_artifact_with_related
|
48
48
|
from ._label_manager import _get_labels, describe_labels
|
@@ -313,19 +313,34 @@ def describe_features(
|
|
313
313
|
fs_data = _get_schemas_postgres(self, related_data=related_data)
|
314
314
|
for fs_id, (slot, data) in fs_data.items():
|
315
315
|
for registry_str, feature_names in data.items():
|
316
|
+
# prevent projects show up as features
|
317
|
+
if registry_str == "Project":
|
318
|
+
continue
|
316
319
|
schema = Schema.objects.using(self._state.db).get(id=fs_id)
|
317
320
|
schema_data[slot] = (schema, feature_names)
|
318
321
|
for feature_name in feature_names:
|
319
322
|
feature_data[feature_name] = (slot, registry_str)
|
323
|
+
schema_data.update(
|
324
|
+
{
|
325
|
+
slot: (schema, schema.n)
|
326
|
+
for slot, schema in get_schema_by_slot_(self).items()
|
327
|
+
if slot not in schema_data
|
328
|
+
}
|
329
|
+
)
|
320
330
|
else:
|
321
331
|
for slot, schema in get_schema_by_slot_(self).items():
|
322
332
|
features = schema.members
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
333
|
+
if features.exists():
|
334
|
+
# features.first() is a lot slower than features[0] here
|
335
|
+
name_field = get_name_field(features[0])
|
336
|
+
feature_names = list(
|
337
|
+
features.values_list(name_field, flat=True)[:20]
|
338
|
+
)
|
339
|
+
schema_data[slot] = (schema, feature_names)
|
340
|
+
for feature_name in feature_names:
|
341
|
+
feature_data[feature_name] = (slot, schema.itype)
|
342
|
+
else:
|
343
|
+
schema_data[slot] = (schema, schema.n)
|
329
344
|
|
330
345
|
internal_feature_names: dict[str, str] = {}
|
331
346
|
if isinstance(self, Artifact):
|
@@ -397,38 +412,44 @@ def describe_features(
|
|
397
412
|
internal_feature_labels_slot.setdefault(slot, []).append(feature_row)
|
398
413
|
|
399
414
|
int_features_tree_children = []
|
400
|
-
for slot, (schema,
|
401
|
-
if
|
402
|
-
|
403
|
-
feature_rows = internal_feature_labels_slot[slot]
|
404
|
-
# add internal Feature features without labels
|
405
|
-
feature_rows += [
|
406
|
-
(
|
407
|
-
feature_name,
|
408
|
-
Text(str(internal_feature_names.get(feature_name)), style="dim"),
|
409
|
-
"",
|
410
|
-
)
|
411
|
-
for feature_name in feature_names
|
412
|
-
if feature_name and feature_name not in internal_feature_labels
|
413
|
-
]
|
415
|
+
for slot, (schema, feature_names_or_n) in schema_data.items():
|
416
|
+
if isinstance(feature_names_or_n, int):
|
417
|
+
feature_rows = []
|
414
418
|
else:
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
419
|
+
feature_names = feature_names_or_n
|
420
|
+
if slot in internal_feature_labels_slot:
|
421
|
+
# add internal Feature features with labels
|
422
|
+
feature_rows = internal_feature_labels_slot[slot]
|
423
|
+
# add internal Feature features without labels
|
424
|
+
feature_rows += [
|
425
|
+
(
|
426
|
+
feature_name,
|
427
|
+
Text(
|
428
|
+
str(internal_feature_names.get(feature_name)), style="dim"
|
424
429
|
),
|
425
|
-
|
426
|
-
)
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
430
|
+
"",
|
431
|
+
)
|
432
|
+
for feature_name in feature_names
|
433
|
+
if feature_name and feature_name not in internal_feature_labels
|
434
|
+
]
|
435
|
+
else:
|
436
|
+
# add internal non-Feature features without labels
|
437
|
+
feature_rows = [
|
438
|
+
(
|
439
|
+
feature_name,
|
440
|
+
Text(
|
441
|
+
str(
|
442
|
+
internal_feature_names.get(feature_name)
|
443
|
+
if feature_name in internal_feature_names
|
444
|
+
else schema.dtype
|
445
|
+
),
|
446
|
+
style="dim",
|
447
|
+
),
|
448
|
+
"",
|
449
|
+
)
|
450
|
+
for feature_name in feature_names
|
451
|
+
if feature_name
|
452
|
+
]
|
432
453
|
int_features_tree_children.append(
|
433
454
|
_create_feature_table(
|
434
455
|
Text.assemble(
|
@@ -446,8 +467,6 @@ def describe_features(
|
|
446
467
|
dataset_tree = tree.add(
|
447
468
|
Text.assemble(
|
448
469
|
("Dataset features", "bold bright_magenta"),
|
449
|
-
("/", "dim"),
|
450
|
-
(".feature_sets", "dim bold"),
|
451
470
|
)
|
452
471
|
)
|
453
472
|
for child in int_features_tree_children:
|
@@ -481,59 +500,6 @@ def describe_features(
|
|
481
500
|
return tree
|
482
501
|
|
483
502
|
|
484
|
-
def parse_staged_feature_sets_from_anndata(
|
485
|
-
adata: AnnData,
|
486
|
-
var_field: FieldAttr | None = None,
|
487
|
-
obs_field: FieldAttr = Feature.name,
|
488
|
-
uns_field: FieldAttr | None = None,
|
489
|
-
mute: bool = False,
|
490
|
-
organism: str | Record | None = None,
|
491
|
-
) -> dict:
|
492
|
-
data_parse = adata
|
493
|
-
if not isinstance(adata, AnnData): # is a path
|
494
|
-
filepath = create_path(adata) # returns Path for local
|
495
|
-
if not isinstance(filepath, LocalPathClasses):
|
496
|
-
from lamindb import settings
|
497
|
-
from lamindb.core.storage._backed_access import backed_access
|
498
|
-
|
499
|
-
using_key = settings._using_key
|
500
|
-
data_parse = backed_access(filepath, using_key=using_key)
|
501
|
-
else:
|
502
|
-
data_parse = ad.read_h5ad(filepath, backed="r")
|
503
|
-
type = "float"
|
504
|
-
else:
|
505
|
-
type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
|
506
|
-
feature_sets = {}
|
507
|
-
if var_field is not None:
|
508
|
-
schema_var = Schema.from_values(
|
509
|
-
data_parse.var.index,
|
510
|
-
var_field,
|
511
|
-
type=type,
|
512
|
-
mute=mute,
|
513
|
-
organism=organism,
|
514
|
-
raise_validation_error=False,
|
515
|
-
)
|
516
|
-
if schema_var is not None:
|
517
|
-
feature_sets["var"] = schema_var
|
518
|
-
if obs_field is not None and len(data_parse.obs.columns) > 0:
|
519
|
-
schema_obs = Schema.from_df(
|
520
|
-
df=data_parse.obs,
|
521
|
-
field=obs_field,
|
522
|
-
mute=mute,
|
523
|
-
organism=organism,
|
524
|
-
)
|
525
|
-
if schema_obs is not None:
|
526
|
-
feature_sets["obs"] = schema_obs
|
527
|
-
if uns_field is not None and len(data_parse.uns) > 0:
|
528
|
-
validated_features = Feature.from_values( # type: ignore
|
529
|
-
data_parse.uns.keys(), field=uns_field, organism=organism
|
530
|
-
)
|
531
|
-
if len(validated_features) > 0:
|
532
|
-
schema_uns = Schema(validated_features, dtype=None, otype="dict")
|
533
|
-
feature_sets["uns"] = schema_uns
|
534
|
-
return feature_sets
|
535
|
-
|
536
|
-
|
537
503
|
def is_valid_datetime_str(date_string: str) -> bool | str:
|
538
504
|
try:
|
539
505
|
dt = datetime.fromisoformat(date_string)
|
@@ -624,8 +590,12 @@ def __init__(self, host: Artifact | Collection | Run):
|
|
624
590
|
|
625
591
|
|
626
592
|
def __repr__(self) -> str:
|
593
|
+
return describe(self, return_str=True) # type: ignore
|
594
|
+
|
595
|
+
|
596
|
+
def describe(self, return_str: bool = False) -> str | None:
|
627
597
|
tree = describe_features(self._host, print_params=(self.__class__ == ParamManager)) # type: ignore
|
628
|
-
return
|
598
|
+
return format_rich_tree(tree, fallback="no linked features", return_str=return_str)
|
629
599
|
|
630
600
|
|
631
601
|
def get_values(self) -> dict[str, Any]:
|
@@ -648,7 +618,7 @@ def __getitem__(self, slot) -> QuerySet:
|
|
648
618
|
return getattr(schema, self._accessor_by_registry[orm_name]).all()
|
649
619
|
|
650
620
|
|
651
|
-
def filter_base(cls, **expression):
|
621
|
+
def filter_base(cls, _skip_validation: bool = True, **expression) -> QuerySet:
|
652
622
|
from .artifact import Artifact
|
653
623
|
|
654
624
|
if cls is FeatureManager:
|
@@ -658,11 +628,12 @@ def filter_base(cls, **expression):
|
|
658
628
|
model = Param
|
659
629
|
value_model = ParamValue
|
660
630
|
keys_normalized = [key.split("__")[0] for key in expression]
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
631
|
+
if not _skip_validation:
|
632
|
+
validated = model.validate(keys_normalized, field="name", mute=True)
|
633
|
+
if sum(validated) != len(keys_normalized):
|
634
|
+
raise ValidationError(
|
635
|
+
f"Some keys in the filter expression are not registered as features: {np.array(keys_normalized)[~validated]}"
|
636
|
+
)
|
666
637
|
new_expression = {}
|
667
638
|
features = model.filter(name__in=keys_normalized).all().distinct()
|
668
639
|
feature_param = "param" if model is Param else "feature"
|
@@ -674,76 +645,98 @@ def filter_base(cls, **expression):
|
|
674
645
|
comparator = f"__{split_key[1]}"
|
675
646
|
feature = features.get(name=normalized_key)
|
676
647
|
if not feature.dtype.startswith("cat"):
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
# we need the comparator here because users might query like so
|
688
|
-
# ln.Artifact.features.filter(experiment__contains="Experi")
|
689
|
-
expression = {f"name{comparator}": value}
|
690
|
-
labels = ULabel.filter(**expression).all()
|
691
|
-
if len(labels) == 0:
|
692
|
-
raise DoesNotExist(
|
693
|
-
f"Did not find a ULabel matching `name{comparator}={value}`"
|
648
|
+
if comparator == "__isnull":
|
649
|
+
if cls == FeatureManager:
|
650
|
+
from .artifact import ArtifactFeatureValue
|
651
|
+
|
652
|
+
return Artifact.objects.exclude(
|
653
|
+
id__in=Subquery(
|
654
|
+
ArtifactFeatureValue.objects.filter(
|
655
|
+
featurevalue__feature=feature
|
656
|
+
).values("artifact_id")
|
657
|
+
)
|
694
658
|
)
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
)
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
# simplified query if we have exactly one label
|
708
|
-
new_expression[
|
709
|
-
f"{accessor_name}__{label_registry.__name__.lower()}"
|
710
|
-
] = label
|
659
|
+
if comparator in {"__startswith", "__contains"}:
|
660
|
+
logger.important(
|
661
|
+
f"currently not supporting `{comparator}`, using `__icontains` instead"
|
662
|
+
)
|
663
|
+
comparator = "__icontains"
|
664
|
+
expression = {feature_param: feature, f"value{comparator}": value}
|
665
|
+
feature_values = value_model.filter(**expression)
|
666
|
+
new_expression[f"_{feature_param}_values__id__in"] = feature_values
|
667
|
+
elif isinstance(value, (str, Record, bool)):
|
668
|
+
if comparator == "__isnull":
|
669
|
+
if cls == FeatureManager:
|
670
|
+
return Artifact.objects.exclude(links_ulabel__feature=feature)
|
711
671
|
else:
|
712
|
-
|
713
|
-
|
714
|
-
|
715
|
-
|
672
|
+
# because SQL is sensitive to whether querying with __in or not
|
673
|
+
# and might return multiple equivalent records for the latter
|
674
|
+
# we distinguish cases in which we have multiple label matches vs. one
|
675
|
+
label = None
|
676
|
+
labels = None
|
677
|
+
if isinstance(value, str):
|
678
|
+
# we need the comparator here because users might query like so
|
679
|
+
# ln.Artifact.filter(experiment__contains="Experi")
|
680
|
+
expression = {f"name{comparator}": value}
|
681
|
+
labels = ULabel.filter(**expression).all()
|
682
|
+
if len(labels) == 0:
|
683
|
+
raise DoesNotExist(
|
684
|
+
f"Did not find a ULabel matching `name{comparator}={value}`"
|
685
|
+
)
|
686
|
+
elif len(labels) == 1:
|
687
|
+
label = labels[0]
|
688
|
+
elif isinstance(value, Record):
|
689
|
+
label = value
|
690
|
+
label_registry = (
|
691
|
+
label.__class__ if label is not None else labels[0].__class__
|
692
|
+
)
|
693
|
+
accessor_name = (
|
694
|
+
label_registry.artifacts.through.artifact.field._related_name
|
695
|
+
)
|
696
|
+
new_expression[f"{accessor_name}__feature"] = feature
|
697
|
+
if label is not None:
|
698
|
+
# simplified query if we have exactly one label
|
699
|
+
new_expression[
|
700
|
+
f"{accessor_name}__{label_registry.__name__.lower()}"
|
701
|
+
] = label
|
702
|
+
else:
|
703
|
+
new_expression[
|
704
|
+
f"{accessor_name}__{label_registry.__name__.lower()}__in"
|
705
|
+
] = labels
|
716
706
|
# if passing a list of records, we want to
|
717
707
|
# find artifacts that are annotated by all of them at the same
|
718
708
|
# time; hence, we don't want the __in construct that we use to match strings
|
719
709
|
# https://laminlabs.slack.com/archives/C04FPE8V01W/p1688328084810609
|
720
|
-
|
710
|
+
if not (new_expression):
|
711
|
+
raise NotImplementedError
|
721
712
|
if cls == FeatureManager or cls == ParamManagerArtifact:
|
722
|
-
return Artifact.filter(**new_expression)
|
713
|
+
return Artifact.objects.filter(**new_expression)
|
723
714
|
elif cls == ParamManagerRun:
|
724
|
-
return Run.filter(**new_expression)
|
715
|
+
return Run.objects.filter(**new_expression)
|
725
716
|
|
726
717
|
|
727
718
|
@classmethod # type: ignore
|
719
|
+
@deprecated("the filter() registry classmethod")
|
728
720
|
def filter(cls, **expression) -> QuerySet:
|
729
721
|
"""Query artifacts by features."""
|
730
|
-
return filter_base(cls, **expression)
|
722
|
+
return filter_base(cls, _skip_validation=False, **expression)
|
731
723
|
|
732
724
|
|
733
725
|
@classmethod # type: ignore
|
726
|
+
@deprecated("the filter() registry classmethod")
|
734
727
|
def get(cls, **expression) -> Record:
|
735
728
|
"""Query a single artifact by feature."""
|
736
|
-
return filter_base(cls, **expression).one()
|
729
|
+
return filter_base(cls, _skip_validation=False, **expression).one()
|
737
730
|
|
738
731
|
|
739
732
|
@property # type: ignore
|
740
733
|
def slots(self) -> dict[str, Schema]:
|
741
734
|
"""Schema by slot.
|
742
735
|
|
743
|
-
Example
|
736
|
+
Example::
|
744
737
|
|
745
|
-
|
746
|
-
{'var': <Schema: var>, 'obs': <Schema: obs>}
|
738
|
+
artifact.features.slots
|
739
|
+
#> {'var': <Schema: var>, 'obs': <Schema: obs>}
|
747
740
|
"""
|
748
741
|
if self._slots is None:
|
749
742
|
self._slots = get_schema_by_slot_(self._host)
|
@@ -817,15 +810,14 @@ def _add_values(
|
|
817
810
|
from .artifact import Artifact
|
818
811
|
|
819
812
|
# rename to distinguish from the values inside the dict
|
820
|
-
|
821
|
-
keys =
|
813
|
+
dictionary = values
|
814
|
+
keys = dictionary.keys()
|
822
815
|
if isinstance(keys, DICT_KEYS_TYPE):
|
823
816
|
keys = list(keys) # type: ignore
|
824
817
|
# deal with other cases later
|
825
818
|
assert all(isinstance(key, str) for key in keys) # noqa: S101
|
826
819
|
registry = feature_param_field.field.model
|
827
820
|
is_param = registry == Param
|
828
|
-
model = Param if is_param else Feature
|
829
821
|
value_model = ParamValue if is_param else FeatureValue
|
830
822
|
model_name = "Param" if is_param else "Feature"
|
831
823
|
if is_param:
|
@@ -838,13 +830,11 @@ def _add_values(
|
|
838
830
|
raise ValidationError(
|
839
831
|
"Can only set features for dataset-like artifacts."
|
840
832
|
)
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
if validated.sum() != len(keys):
|
845
|
-
not_validated_keys = keys_array[~validated]
|
833
|
+
records = registry.from_values(keys, field=feature_param_field, mute=True)
|
834
|
+
if len(records) != len(keys):
|
835
|
+
not_validated_keys = [key for key in keys if key not in records.list("name")]
|
846
836
|
not_validated_keys_dtype_message = [
|
847
|
-
(key, infer_feature_type_convert_json(key,
|
837
|
+
(key, infer_feature_type_convert_json(key, dictionary[key]))
|
848
838
|
for key in not_validated_keys
|
849
839
|
]
|
850
840
|
run = get_current_tracked_run()
|
@@ -862,7 +852,7 @@ def _add_values(
|
|
862
852
|
]
|
863
853
|
hint = "\n".join(elements)
|
864
854
|
msg = (
|
865
|
-
f"These keys could not be validated: {not_validated_keys
|
855
|
+
f"These keys could not be validated: {not_validated_keys}\n"
|
866
856
|
f"Here is how to create a {model_name.lower()}:\n\n{hint}"
|
867
857
|
)
|
868
858
|
raise ValidationError(msg)
|
@@ -871,10 +861,10 @@ def _add_values(
|
|
871
861
|
features_labels = defaultdict(list)
|
872
862
|
_feature_values = []
|
873
863
|
not_validated_values = []
|
874
|
-
for
|
875
|
-
|
864
|
+
for feature in records:
|
865
|
+
value = dictionary[feature.name]
|
876
866
|
inferred_type, converted_value, _ = infer_feature_type_convert_json(
|
877
|
-
|
867
|
+
feature.name,
|
878
868
|
value,
|
879
869
|
mute=True,
|
880
870
|
str_as_ulabel=str_as_ulabel,
|
@@ -882,25 +872,23 @@ def _add_values(
|
|
882
872
|
if feature.dtype == "num":
|
883
873
|
if inferred_type not in {"int", "float"}:
|
884
874
|
raise TypeError(
|
885
|
-
f"Value for feature '{
|
875
|
+
f"Value for feature '{feature.name}' with type {feature.dtype} must be a number"
|
886
876
|
)
|
887
877
|
elif feature.dtype.startswith("cat"):
|
888
878
|
if inferred_type != "?":
|
889
879
|
if not (inferred_type.startswith("cat") or isinstance(value, Record)):
|
890
880
|
raise TypeError(
|
891
|
-
f"Value for feature '{
|
881
|
+
f"Value for feature '{feature.name}' with type '{feature.dtype}' must be a string or record."
|
892
882
|
)
|
893
883
|
elif (feature.dtype == "str" and feature.dtype not in inferred_type) or (
|
894
884
|
feature.dtype != "str" and feature.dtype != inferred_type
|
895
885
|
):
|
896
886
|
raise ValidationError(
|
897
|
-
f"Expected dtype for '{
|
887
|
+
f"Expected dtype for '{feature.name}' is '{feature.dtype}', got '{inferred_type}'"
|
898
888
|
)
|
899
889
|
if not feature.dtype.startswith("cat"):
|
900
890
|
filter_kwargs = {model_name.lower(): feature, "value": converted_value}
|
901
|
-
feature_value = value_model.
|
902
|
-
if feature_value is None:
|
903
|
-
feature_value = value_model(**filter_kwargs)
|
891
|
+
feature_value, _ = value_model.get_or_create(**filter_kwargs)
|
904
892
|
_feature_values.append(feature_value)
|
905
893
|
else:
|
906
894
|
if isinstance(value, Record) or (
|
@@ -938,30 +926,63 @@ def _add_values(
|
|
938
926
|
(feature, label_record) for label_record in label_records
|
939
927
|
]
|
940
928
|
if not_validated_values:
|
941
|
-
|
942
|
-
|
943
|
-
f" ln.save(ulabels)"
|
944
|
-
)
|
929
|
+
not_validated_values.sort()
|
930
|
+
hint = f" ulabels = ln.ULabel.from_values({not_validated_values}, create=True).save()\n"
|
945
931
|
msg = (
|
946
932
|
f"These values could not be validated: {not_validated_values}\n"
|
947
933
|
f"Here is how to create ulabels for them:\n\n{hint}"
|
948
934
|
)
|
949
935
|
raise ValidationError(msg)
|
936
|
+
# TODO: create an explicit version of this
|
937
|
+
# if not is_param:
|
938
|
+
# # check if _expect_many is false for _all_ records
|
939
|
+
# if any(record._expect_many for record in records):
|
940
|
+
# updated_features = []
|
941
|
+
# for record in records:
|
942
|
+
# if record._expect_many:
|
943
|
+
# record._expect_many = False
|
944
|
+
# record.save()
|
945
|
+
# updated_features.append(record.name)
|
946
|
+
# if any(updated_features):
|
947
|
+
# logger.important(
|
948
|
+
# f"changed observational unit to Artifact for features: {', '.join(updated_features)}"
|
949
|
+
# )
|
950
950
|
# bulk add all links
|
951
951
|
if features_labels:
|
952
952
|
add_label_feature_links(self, features_labels)
|
953
953
|
if _feature_values:
|
954
|
-
|
954
|
+
to_insert_feature_values = [
|
955
|
+
record for record in _feature_values if record._state.adding
|
956
|
+
]
|
957
|
+
if to_insert_feature_values:
|
958
|
+
save(to_insert_feature_values)
|
959
|
+
dict_typed_features = [
|
960
|
+
getattr(record, model_name.lower())
|
961
|
+
for record in _feature_values
|
962
|
+
if getattr(record, model_name.lower()).dtype == "dict"
|
963
|
+
]
|
955
964
|
if is_param:
|
956
965
|
LinkORM = self._host._param_values.through
|
957
966
|
valuefield_id = "paramvalue_id"
|
958
967
|
else:
|
959
968
|
LinkORM = self._host._feature_values.through
|
960
969
|
valuefield_id = "featurevalue_id"
|
970
|
+
host_class_lower = self._host.__class__.__get_name_with_module__().lower()
|
971
|
+
if dict_typed_features:
|
972
|
+
# delete all previously existing anotations with dictionaries
|
973
|
+
kwargs = {
|
974
|
+
f"links_{host_class_lower}__{host_class_lower}_id": self._host.id,
|
975
|
+
f"{model_name.lower()}__in": dict_typed_features,
|
976
|
+
}
|
977
|
+
try:
|
978
|
+
value_model.filter(**kwargs).all().delete()
|
979
|
+
except ProtectedError:
|
980
|
+
pass
|
981
|
+
# add new feature links
|
961
982
|
links = [
|
962
983
|
LinkORM(
|
963
984
|
**{
|
964
|
-
f"{
|
985
|
+
f"{host_class_lower}_id": self._host.id,
|
965
986
|
valuefield_id: feature_value.id,
|
966
987
|
}
|
967
988
|
)
|
@@ -1088,52 +1109,6 @@ def _add_schema(self, schema: Schema, slot: str) -> None:
|
|
1088
1109
|
self._slots[slot] = schema # type: ignore
|
1089
1110
|
|
1090
1111
|
|
1091
|
-
def _add_set_from_df(
|
1092
|
-
self,
|
1093
|
-
field: FieldAttr = Feature.name,
|
1094
|
-
organism: str | None = None,
|
1095
|
-
mute: bool = False,
|
1096
|
-
):
|
1097
|
-
"""Add feature set corresponding to column names of DataFrame."""
|
1098
|
-
assert self._host.otype == "DataFrame" # noqa: S101
|
1099
|
-
df = self._host.load()
|
1100
|
-
schema = Schema.from_df(
|
1101
|
-
df=df,
|
1102
|
-
field=field,
|
1103
|
-
mute=mute,
|
1104
|
-
organism=organism,
|
1105
|
-
)
|
1106
|
-
self._host._staged_feature_sets = {"columns": schema}
|
1107
|
-
self._host.save()
|
1108
|
-
|
1109
|
-
|
1110
|
-
def _add_set_from_anndata(
|
1111
|
-
self,
|
1112
|
-
var_field: FieldAttr | None = None,
|
1113
|
-
obs_field: FieldAttr | None = Feature.name,
|
1114
|
-
uns_field: FieldAttr | None = None,
|
1115
|
-
mute: bool = False,
|
1116
|
-
organism: str | Record | None = None,
|
1117
|
-
):
|
1118
|
-
"""Add features from AnnData."""
|
1119
|
-
assert self._host.otype == "AnnData" # noqa: S101
|
1120
|
-
|
1121
|
-
# parse and register features
|
1122
|
-
adata = self._host.load()
|
1123
|
-
feature_sets = parse_staged_feature_sets_from_anndata(
|
1124
|
-
adata,
|
1125
|
-
var_field=var_field,
|
1126
|
-
obs_field=obs_field,
|
1127
|
-
uns_field=uns_field,
|
1128
|
-
mute=mute,
|
1129
|
-
organism=organism,
|
1130
|
-
)
|
1131
|
-
|
1132
|
-
# link feature sets
|
1133
|
-
self._host._staged_feature_sets = feature_sets
|
1134
|
-
self._host.save()
|
1135
|
-
|
1136
|
-
|
1137
1112
|
def _unify_staged_feature_sets_by_hash(
|
1138
1113
|
feature_sets: MutableMapping[str, Schema],
|
1139
1114
|
):
|
@@ -1149,83 +1124,6 @@ def _unify_staged_feature_sets_by_hash(
|
|
1149
1124
|
return feature_sets
|
1150
1125
|
|
1151
1126
|
|
1152
|
-
def _add_set_from_mudata(
|
1153
|
-
self,
|
1154
|
-
var_fields: dict[str, FieldAttr] | None = None,
|
1155
|
-
obs_fields: dict[str, FieldAttr] | None = None,
|
1156
|
-
mute: bool = False,
|
1157
|
-
organism: str | Record | None = None,
|
1158
|
-
):
|
1159
|
-
"""Add features from MuData."""
|
1160
|
-
if obs_fields is None:
|
1161
|
-
obs_fields = {}
|
1162
|
-
assert self._host.otype == "MuData" # noqa: S101
|
1163
|
-
|
1164
|
-
# parse and register features
|
1165
|
-
mdata = self._host.load()
|
1166
|
-
feature_sets = {}
|
1167
|
-
|
1168
|
-
obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
|
1169
|
-
if len(obs_features) > 0:
|
1170
|
-
feature_sets["obs"] = Schema(features=obs_features)
|
1171
|
-
for modality, field in var_fields.items():
|
1172
|
-
modality_fs = parse_staged_feature_sets_from_anndata(
|
1173
|
-
mdata[modality],
|
1174
|
-
var_field=field,
|
1175
|
-
obs_field=obs_fields.get(modality, Feature.name),
|
1176
|
-
mute=mute,
|
1177
|
-
organism=organism,
|
1178
|
-
)
|
1179
|
-
for k, v in modality_fs.items():
|
1180
|
-
feature_sets[f"['{modality}'].{k}"] = v
|
1181
|
-
|
1182
|
-
# link feature sets
|
1183
|
-
self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
|
1184
|
-
self._host.save()
|
1185
|
-
|
1186
|
-
|
1187
|
-
def _add_set_from_spatialdata(
|
1188
|
-
self,
|
1189
|
-
sample_metadata_key: str,
|
1190
|
-
sample_metadata_field: FieldAttr = Feature.name,
|
1191
|
-
var_fields: dict[str, FieldAttr] | None = None,
|
1192
|
-
obs_fields: dict[str, FieldAttr] | None = None,
|
1193
|
-
mute: bool = False,
|
1194
|
-
organism: str | Record | None = None,
|
1195
|
-
):
|
1196
|
-
"""Add features from SpatialData."""
|
1197
|
-
obs_fields, var_fields = obs_fields or {}, var_fields or {}
|
1198
|
-
assert self._host.otype == "SpatialData" # noqa: S101
|
1199
|
-
|
1200
|
-
# parse and register features
|
1201
|
-
sdata = self._host.load()
|
1202
|
-
feature_sets = {}
|
1203
|
-
|
1204
|
-
# sample features
|
1205
|
-
sample_features = Feature.from_values(
|
1206
|
-
sdata.get_attrs(key=sample_metadata_key, return_as="df", flatten=True).columns,
|
1207
|
-
field=sample_metadata_field,
|
1208
|
-
) # type: ignore
|
1209
|
-
if len(sample_features) > 0:
|
1210
|
-
feature_sets[sample_metadata_key] = Schema(features=sample_features)
|
1211
|
-
|
1212
|
-
# table features
|
1213
|
-
for table, field in var_fields.items():
|
1214
|
-
table_fs = parse_staged_feature_sets_from_anndata(
|
1215
|
-
sdata[table],
|
1216
|
-
var_field=field,
|
1217
|
-
obs_field=obs_fields.get(table, Feature.name),
|
1218
|
-
mute=mute,
|
1219
|
-
organism=organism,
|
1220
|
-
)
|
1221
|
-
for k, v in table_fs.items():
|
1222
|
-
feature_sets[f"['{table}'].{k}"] = v
|
1223
|
-
|
1224
|
-
# link feature sets
|
1225
|
-
self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
|
1226
|
-
self._host.save()
|
1227
|
-
|
1228
|
-
|
1229
1127
|
def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
|
1230
1128
|
"""Transfer features from a artifact or collection."""
|
1231
1129
|
# This only covers feature sets
|
@@ -1336,25 +1234,200 @@ def _feature_set_by_slot(self):
|
|
1336
1234
|
return self.slots
|
1337
1235
|
|
1338
1236
|
|
1237
|
+
# deprecated: feature set parsing
|
1238
|
+
|
1239
|
+
|
1240
|
+
def parse_staged_feature_sets_from_anndata(
|
1241
|
+
adata: AnnData,
|
1242
|
+
var_field: FieldAttr | None = None,
|
1243
|
+
obs_field: FieldAttr = Feature.name,
|
1244
|
+
uns_field: FieldAttr | None = None,
|
1245
|
+
mute: bool = False,
|
1246
|
+
organism: str | Record | None = None,
|
1247
|
+
) -> dict:
|
1248
|
+
data_parse = adata
|
1249
|
+
if not isinstance(adata, AnnData): # is a path
|
1250
|
+
filepath = create_path(adata) # returns Path for local
|
1251
|
+
if not isinstance(filepath, LocalPathClasses):
|
1252
|
+
from lamindb import settings
|
1253
|
+
from lamindb.core.storage._backed_access import backed_access
|
1254
|
+
|
1255
|
+
using_key = settings._using_key
|
1256
|
+
data_parse = backed_access(filepath, using_key=using_key)
|
1257
|
+
else:
|
1258
|
+
data_parse = ad.read_h5ad(filepath, backed="r")
|
1259
|
+
type = "float"
|
1260
|
+
else:
|
1261
|
+
type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
|
1262
|
+
feature_sets = {}
|
1263
|
+
if var_field is not None:
|
1264
|
+
schema_var = Schema.from_values(
|
1265
|
+
data_parse.var.index,
|
1266
|
+
var_field,
|
1267
|
+
type=type,
|
1268
|
+
mute=mute,
|
1269
|
+
organism=organism,
|
1270
|
+
raise_validation_error=False,
|
1271
|
+
)
|
1272
|
+
if schema_var is not None:
|
1273
|
+
feature_sets["var"] = schema_var
|
1274
|
+
if obs_field is not None and len(data_parse.obs.columns) > 0:
|
1275
|
+
schema_obs = Schema.from_df(
|
1276
|
+
df=data_parse.obs,
|
1277
|
+
field=obs_field,
|
1278
|
+
mute=mute,
|
1279
|
+
organism=organism,
|
1280
|
+
)
|
1281
|
+
if schema_obs is not None:
|
1282
|
+
feature_sets["obs"] = schema_obs
|
1283
|
+
if uns_field is not None and len(data_parse.uns) > 0:
|
1284
|
+
validated_features = Feature.from_values( # type: ignore
|
1285
|
+
data_parse.uns.keys(), field=uns_field, organism=organism
|
1286
|
+
)
|
1287
|
+
if len(validated_features) > 0:
|
1288
|
+
schema_uns = Schema(validated_features, dtype=None, otype="dict")
|
1289
|
+
feature_sets["uns"] = schema_uns
|
1290
|
+
return feature_sets
|
1291
|
+
|
1292
|
+
|
1293
|
+
# no longer called from within curator
|
1294
|
+
# might deprecate in the future?
|
1295
|
+
def _add_set_from_df(
|
1296
|
+
self,
|
1297
|
+
field: FieldAttr = Feature.name,
|
1298
|
+
organism: str | None = None,
|
1299
|
+
mute: bool = False,
|
1300
|
+
):
|
1301
|
+
"""Add feature set corresponding to column names of DataFrame."""
|
1302
|
+
assert self._host.otype == "DataFrame" # noqa: S101
|
1303
|
+
df = self._host.load(is_run_input=False)
|
1304
|
+
schema = Schema.from_df(
|
1305
|
+
df=df,
|
1306
|
+
field=field,
|
1307
|
+
mute=mute,
|
1308
|
+
organism=organism,
|
1309
|
+
)
|
1310
|
+
self._host._staged_feature_sets = {"columns": schema}
|
1311
|
+
self._host.save()
|
1312
|
+
|
1313
|
+
|
1314
|
+
def _add_set_from_anndata(
    self,
    var_field: FieldAttr | None = None,
    obs_field: FieldAttr | None = Feature.name,
    uns_field: FieldAttr | None = None,
    mute: bool = False,
    organism: str | Record | None = None,
):
    """Add features from AnnData."""
    host = self._host
    assert host.otype == "AnnData"  # noqa: S101

    # parse and register features: load the object without registering a
    # run input and derive one staged schema per slot (var/obs/uns)
    adata = host.load(is_run_input=False)
    staged = parse_staged_feature_sets_from_anndata(
        adata,
        var_field=var_field,
        obs_field=obs_field,
        uns_field=uns_field,
        mute=mute,
        organism=organism,
    )

    # link feature sets
    host._staged_feature_sets = staged
    host.save()
|
1339
|
+
|
1340
|
+
|
1341
|
+
def _add_set_from_mudata(
    self,
    var_fields: dict[str, FieldAttr] | None = None,
    obs_fields: dict[str, FieldAttr] | None = None,
    mute: bool = False,
    organism: str | Record | None = None,
):
    """Add features from MuData.

    Args:
        var_fields: Maps each modality name to the registry field used to
            validate that modality's ``var`` index.
        obs_fields: Maps each modality name to the registry field used to
            validate that modality's ``obs`` columns; falls back to
            ``Feature.name`` for modalities not listed.
        mute: Whether to mute validation logging.
        organism: Organism passed through to feature validation.
    """
    # guard BOTH optional mappings — previously only obs_fields was
    # guarded, so calling with the default var_fields=None raised
    # AttributeError on var_fields.items() below
    if obs_fields is None:
        obs_fields = {}
    if var_fields is None:
        var_fields = {}
    assert self._host.otype == "MuData"  # noqa: S101

    # parse and register features
    mdata = self._host.load(is_run_input=False)
    feature_sets = {}

    # top-level obs columns become one schema under the "obs" slot
    obs_features = Feature.from_values(mdata.obs.columns)  # type: ignore
    if len(obs_features) > 0:
        feature_sets["obs"] = Schema(features=obs_features)
    # per-modality schemas, namespaced as "['<modality>'].<slot>"
    for modality, field in var_fields.items():
        modality_fs = parse_staged_feature_sets_from_anndata(
            mdata[modality],
            var_field=field,
            obs_field=obs_fields.get(modality, Feature.name),
            mute=mute,
            organism=organism,
        )
        for k, v in modality_fs.items():
            feature_sets[f"['{modality}'].{k}"] = v

    # link feature sets, de-duplicating schemas that hash identically
    self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
    self._host.save()
|
1374
|
+
|
1375
|
+
|
1376
|
+
def _add_set_from_spatialdata(
    self,
    sample_metadata_key: str,
    sample_metadata_field: FieldAttr = Feature.name,
    var_fields: dict[str, FieldAttr] | None = None,
    obs_fields: dict[str, FieldAttr] | None = None,
    mute: bool = False,
    organism: str | Record | None = None,
):
    """Add features from SpatialData."""
    obs_fields = {} if obs_fields is None else obs_fields
    var_fields = {} if var_fields is None else var_fields
    assert self._host.otype == "SpatialData"  # noqa: S101

    # parse and register features
    sdata = self._host.load(is_run_input=False)
    feature_sets = {}

    # sample features: columns of the flattened sample-metadata table
    sample_df = sdata.get_attrs(key=sample_metadata_key, return_as="df", flatten=True)
    sample_features = Feature.from_values(
        sample_df.columns,
        field=sample_metadata_field,
    )  # type: ignore
    if len(sample_features) > 0:
        feature_sets[sample_metadata_key] = Schema(features=sample_features)

    # table features: one parse per table, namespaced as "['<table>'].<slot>"
    for table, field in var_fields.items():
        parsed = parse_staged_feature_sets_from_anndata(
            sdata[table],
            var_field=field,
            obs_field=obs_fields.get(table, Feature.name),
            mute=mute,
            organism=organism,
        )
        for slot, schema in parsed.items():
            feature_sets[f"['{table}'].{slot}"] = schema

    # link feature sets
    self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
    self._host.save()
|
1416
|
+
|
1417
|
+
|
1339
1418
|
# mypy: ignore-errors
# Bind the module-level implementations onto the accessor classes.
# FeatureManager and ParamManager share the generic lifecycle methods.
FeatureManager.__init__ = __init__
ParamManager.__init__ = __init__
FeatureManager.__repr__ = __repr__
ParamManager.__repr__ = __repr__
FeatureManager.describe = describe
ParamManager.describe = describe
# FeatureManager-only API
FeatureManager.__getitem__ = __getitem__
FeatureManager.get_values = get_values
FeatureManager.slots = slots
FeatureManager.add_values = add_values_features
FeatureManager._add_schema = _add_schema
FeatureManager._accessor_by_registry = _accessor_by_registry
FeatureManager._add_from = _add_from
FeatureManager.filter = filter
FeatureManager.get = get
|
@@ -1363,3 +1436,13 @@ FeatureManager.remove_values = remove_values
|
|
1363
1436
|
ParamManager.add_values = add_values_params
ParamManager.get_values = get_values
ParamManager.filter = filter

# deprecated: kept only as backward-compatible aliases
FeatureManager._add_set_from_df = _add_set_from_df
FeatureManager._add_set_from_anndata = _add_set_from_anndata
FeatureManager._add_set_from_mudata = _add_set_from_mudata
FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
FeatureManager.add_schema = add_schema
FeatureManager.add_feature_set = add_feature_set
FeatureManager._schema_by_slot = _schema_by_slot
FeatureManager._feature_set_by_slot = _feature_set_by_slot
|