lamindb-1.4.0-py3-none-any.whl → lamindb-1.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. lamindb/__init__.py +52 -36
  2. lamindb/_finish.py +17 -10
  3. lamindb/_tracked.py +1 -1
  4. lamindb/base/__init__.py +3 -1
  5. lamindb/base/fields.py +40 -22
  6. lamindb/base/ids.py +1 -94
  7. lamindb/base/types.py +2 -0
  8. lamindb/base/uids.py +117 -0
  9. lamindb/core/_context.py +177 -89
  10. lamindb/core/_settings.py +38 -25
  11. lamindb/core/datasets/__init__.py +11 -4
  12. lamindb/core/datasets/_core.py +5 -5
  13. lamindb/core/datasets/_small.py +0 -93
  14. lamindb/core/datasets/mini_immuno.py +172 -0
  15. lamindb/core/loaders.py +1 -1
  16. lamindb/core/storage/_backed_access.py +100 -6
  17. lamindb/core/storage/_polars_lazy_df.py +51 -0
  18. lamindb/core/storage/_pyarrow_dataset.py +15 -30
  19. lamindb/core/storage/objects.py +6 -0
  20. lamindb/core/subsettings/__init__.py +2 -0
  21. lamindb/core/subsettings/_annotation_settings.py +11 -0
  22. lamindb/curators/__init__.py +7 -3349
  23. lamindb/curators/_legacy.py +2056 -0
  24. lamindb/curators/core.py +1546 -0
  25. lamindb/errors.py +11 -0
  26. lamindb/examples/__init__.py +27 -0
  27. lamindb/examples/schemas/__init__.py +12 -0
  28. lamindb/examples/schemas/_anndata.py +25 -0
  29. lamindb/examples/schemas/_simple.py +19 -0
  30. lamindb/integrations/_vitessce.py +8 -5
  31. lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
  32. lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
  33. lamindb/models/__init__.py +4 -1
  34. lamindb/models/_describe.py +21 -4
  35. lamindb/models/_feature_manager.py +365 -286
  36. lamindb/models/_label_manager.py +8 -2
  37. lamindb/models/artifact.py +173 -95
  38. lamindb/models/artifact_set.py +122 -0
  39. lamindb/models/collection.py +73 -52
  40. lamindb/models/core.py +1 -1
  41. lamindb/models/feature.py +51 -17
  42. lamindb/models/has_parents.py +2 -2
  43. lamindb/models/project.py +1 -1
  44. lamindb/models/query_manager.py +221 -22
  45. lamindb/models/query_set.py +245 -171
  46. lamindb/models/record.py +62 -243
  47. lamindb/models/run.py +4 -4
  48. lamindb/models/save.py +8 -2
  49. lamindb/models/schema.py +458 -181
  50. lamindb/models/transform.py +2 -2
  51. lamindb/models/ulabel.py +8 -5
  52. {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/METADATA +6 -6
  53. {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/RECORD +55 -42
  54. {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
  55. {lamindb-1.4.0.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0
@@ -13,7 +13,7 @@ import pandas as pd
13
13
  from anndata import AnnData
14
14
  from django.contrib.postgres.aggregates import ArrayAgg
15
15
  from django.db import connections
16
- from django.db.models import Aggregate
16
+ from django.db.models import Aggregate, ProtectedError, Subquery
17
17
  from lamin_utils import logger
18
18
  from lamindb_setup.core.hashing import hash_set
19
19
  from lamindb_setup.core.upath import create_path
@@ -42,7 +42,7 @@ from ._describe import (
42
42
  TYPE_WIDTH,
43
43
  VALUES_WIDTH,
44
44
  describe_header,
45
- print_rich_tree,
45
+ format_rich_tree,
46
46
  )
47
47
  from ._django import get_artifact_with_related
48
48
  from ._label_manager import _get_labels, describe_labels
@@ -320,15 +320,27 @@ def describe_features(
320
320
  schema_data[slot] = (schema, feature_names)
321
321
  for feature_name in feature_names:
322
322
  feature_data[feature_name] = (slot, registry_str)
323
+ schema_data.update(
324
+ {
325
+ slot: (schema, schema.n)
326
+ for slot, schema in get_schema_by_slot_(self).items()
327
+ if slot not in schema_data
328
+ }
329
+ )
323
330
  else:
324
331
  for slot, schema in get_schema_by_slot_(self).items():
325
332
  features = schema.members
326
- # features.first() is a lot slower than features[0] here
327
- name_field = get_name_field(features[0])
328
- feature_names = list(features.values_list(name_field, flat=True)[:20])
329
- schema_data[slot] = (schema, feature_names)
330
- for feature_name in feature_names:
331
- feature_data[feature_name] = (slot, schema.itype)
333
+ if features.exists():
334
+ # features.first() is a lot slower than features[0] here
335
+ name_field = get_name_field(features[0])
336
+ feature_names = list(
337
+ features.values_list(name_field, flat=True)[:20]
338
+ )
339
+ schema_data[slot] = (schema, feature_names)
340
+ for feature_name in feature_names:
341
+ feature_data[feature_name] = (slot, schema.itype)
342
+ else:
343
+ schema_data[slot] = (schema, schema.n)
332
344
 
333
345
  internal_feature_names: dict[str, str] = {}
334
346
  if isinstance(self, Artifact):
@@ -400,38 +412,44 @@ def describe_features(
400
412
  internal_feature_labels_slot.setdefault(slot, []).append(feature_row)
401
413
 
402
414
  int_features_tree_children = []
403
- for slot, (schema, feature_names) in schema_data.items():
404
- if slot in internal_feature_labels_slot:
405
- # add internal Feature features with labels
406
- feature_rows = internal_feature_labels_slot[slot]
407
- # add internal Feature features without labels
408
- feature_rows += [
409
- (
410
- feature_name,
411
- Text(str(internal_feature_names.get(feature_name)), style="dim"),
412
- "",
413
- )
414
- for feature_name in feature_names
415
- if feature_name and feature_name not in internal_feature_labels
416
- ]
415
+ for slot, (schema, feature_names_or_n) in schema_data.items():
416
+ if isinstance(feature_names_or_n, int):
417
+ feature_rows = []
417
418
  else:
418
- # add internal non-Feature features without labels
419
- feature_rows = [
420
- (
421
- feature_name,
422
- Text(
423
- str(
424
- internal_feature_names.get(feature_name)
425
- if feature_name in internal_feature_names
426
- else schema.dtype
419
+ feature_names = feature_names_or_n
420
+ if slot in internal_feature_labels_slot:
421
+ # add internal Feature features with labels
422
+ feature_rows = internal_feature_labels_slot[slot]
423
+ # add internal Feature features without labels
424
+ feature_rows += [
425
+ (
426
+ feature_name,
427
+ Text(
428
+ str(internal_feature_names.get(feature_name)), style="dim"
427
429
  ),
428
- style="dim",
429
- ),
430
- "",
431
- )
432
- for feature_name in feature_names
433
- if feature_name
434
- ]
430
+ "",
431
+ )
432
+ for feature_name in feature_names
433
+ if feature_name and feature_name not in internal_feature_labels
434
+ ]
435
+ else:
436
+ # add internal non-Feature features without labels
437
+ feature_rows = [
438
+ (
439
+ feature_name,
440
+ Text(
441
+ str(
442
+ internal_feature_names.get(feature_name)
443
+ if feature_name in internal_feature_names
444
+ else schema.dtype
445
+ ),
446
+ style="dim",
447
+ ),
448
+ "",
449
+ )
450
+ for feature_name in feature_names
451
+ if feature_name
452
+ ]
435
453
  int_features_tree_children.append(
436
454
  _create_feature_table(
437
455
  Text.assemble(
@@ -482,59 +500,6 @@ def describe_features(
482
500
  return tree
483
501
 
484
502
 
485
- def parse_staged_feature_sets_from_anndata(
486
- adata: AnnData,
487
- var_field: FieldAttr | None = None,
488
- obs_field: FieldAttr = Feature.name,
489
- uns_field: FieldAttr | None = None,
490
- mute: bool = False,
491
- organism: str | Record | None = None,
492
- ) -> dict:
493
- data_parse = adata
494
- if not isinstance(adata, AnnData): # is a path
495
- filepath = create_path(adata) # returns Path for local
496
- if not isinstance(filepath, LocalPathClasses):
497
- from lamindb import settings
498
- from lamindb.core.storage._backed_access import backed_access
499
-
500
- using_key = settings._using_key
501
- data_parse = backed_access(filepath, using_key=using_key)
502
- else:
503
- data_parse = ad.read_h5ad(filepath, backed="r")
504
- type = "float"
505
- else:
506
- type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
507
- feature_sets = {}
508
- if var_field is not None:
509
- schema_var = Schema.from_values(
510
- data_parse.var.index,
511
- var_field,
512
- type=type,
513
- mute=mute,
514
- organism=organism,
515
- raise_validation_error=False,
516
- )
517
- if schema_var is not None:
518
- feature_sets["var"] = schema_var
519
- if obs_field is not None and len(data_parse.obs.columns) > 0:
520
- schema_obs = Schema.from_df(
521
- df=data_parse.obs,
522
- field=obs_field,
523
- mute=mute,
524
- organism=organism,
525
- )
526
- if schema_obs is not None:
527
- feature_sets["obs"] = schema_obs
528
- if uns_field is not None and len(data_parse.uns) > 0:
529
- validated_features = Feature.from_values( # type: ignore
530
- data_parse.uns.keys(), field=uns_field, organism=organism
531
- )
532
- if len(validated_features) > 0:
533
- schema_uns = Schema(validated_features, dtype=None, otype="dict")
534
- feature_sets["uns"] = schema_uns
535
- return feature_sets
536
-
537
-
538
503
  def is_valid_datetime_str(date_string: str) -> bool | str:
539
504
  try:
540
505
  dt = datetime.fromisoformat(date_string)
@@ -625,8 +590,12 @@ def __init__(self, host: Artifact | Collection | Run):
625
590
 
626
591
 
627
592
  def __repr__(self) -> str:
593
+ return describe(self, return_str=True) # type: ignore
594
+
595
+
596
+ def describe(self, return_str: bool = False) -> str | None:
628
597
  tree = describe_features(self._host, print_params=(self.__class__ == ParamManager)) # type: ignore
629
- return print_rich_tree(tree, fallback="no linked features")
598
+ return format_rich_tree(tree, fallback="no linked features", return_str=return_str)
630
599
 
631
600
 
632
601
  def get_values(self) -> dict[str, Any]:
@@ -676,50 +645,70 @@ def filter_base(cls, _skip_validation: bool = True, **expression) -> QuerySet:
676
645
  comparator = f"__{split_key[1]}"
677
646
  feature = features.get(name=normalized_key)
678
647
  if not feature.dtype.startswith("cat"):
679
- expression = {feature_param: feature, f"value{comparator}": value}
680
- feature_value = value_model.filter(**expression)
681
- new_expression[f"_{feature_param}_values__in"] = feature_value
682
- elif isinstance(value, (str, Record)):
683
- # because SQL is sensitive to whether querying with __in or not
684
- # and might return multiple equivalent records for the latter
685
- # we distinguish cases in which we have multiple label matches vs. one
686
- label = None
687
- labels = None
688
- if isinstance(value, str):
689
- # we need the comparator here because users might query like so
690
- # ln.Artifact.features.filter(experiment__contains="Experi")
691
- expression = {f"name{comparator}": value}
692
- labels = ULabel.filter(**expression).all()
693
- if len(labels) == 0:
694
- raise DoesNotExist(
695
- f"Did not find a ULabel matching `name{comparator}={value}`"
648
+ if comparator == "__isnull":
649
+ if cls == FeatureManager:
650
+ from .artifact import ArtifactFeatureValue
651
+
652
+ return Artifact.objects.exclude(
653
+ id__in=Subquery(
654
+ ArtifactFeatureValue.objects.filter(
655
+ featurevalue__feature=feature
656
+ ).values("artifact_id")
657
+ )
696
658
  )
697
- elif len(labels) == 1:
698
- label = labels[0]
699
- elif isinstance(value, Record):
700
- label = value
701
- label_registry = (
702
- label.__class__ if label is not None else labels[0].__class__
703
- )
704
- accessor_name = (
705
- label_registry.artifacts.through.artifact.field._related_name
706
- )
707
- new_expression[f"{accessor_name}__feature"] = feature
708
- if label is not None:
709
- # simplified query if we have exactly one label
710
- new_expression[
711
- f"{accessor_name}__{label_registry.__name__.lower()}"
712
- ] = label
659
+ if comparator in {"__startswith", "__contains"}:
660
+ logger.important(
661
+ f"currently not supporting `{comparator}`, using `__icontains` instead"
662
+ )
663
+ comparator = "__icontains"
664
+ expression = {feature_param: feature, f"value{comparator}": value}
665
+ feature_values = value_model.filter(**expression)
666
+ new_expression[f"_{feature_param}_values__id__in"] = feature_values
667
+ elif isinstance(value, (str, Record, bool)):
668
+ if comparator == "__isnull":
669
+ if cls == FeatureManager:
670
+ return Artifact.objects.exclude(links_ulabel__feature=feature)
713
671
  else:
714
- new_expression[
715
- f"{accessor_name}__{label_registry.__name__.lower()}__in"
716
- ] = labels
717
- else:
672
+ # because SQL is sensitive to whether querying with __in or not
673
+ # and might return multiple equivalent records for the latter
674
+ # we distinguish cases in which we have multiple label matches vs. one
675
+ label = None
676
+ labels = None
677
+ if isinstance(value, str):
678
+ # we need the comparator here because users might query like so
679
+ # ln.Artifact.filter(experiment__contains="Experi")
680
+ expression = {f"name{comparator}": value}
681
+ labels = ULabel.filter(**expression).all()
682
+ if len(labels) == 0:
683
+ raise DoesNotExist(
684
+ f"Did not find a ULabel matching `name{comparator}={value}`"
685
+ )
686
+ elif len(labels) == 1:
687
+ label = labels[0]
688
+ elif isinstance(value, Record):
689
+ label = value
690
+ label_registry = (
691
+ label.__class__ if label is not None else labels[0].__class__
692
+ )
693
+ accessor_name = (
694
+ label_registry.artifacts.through.artifact.field._related_name
695
+ )
696
+ new_expression[f"{accessor_name}__feature"] = feature
697
+ if label is not None:
698
+ # simplified query if we have exactly one label
699
+ new_expression[
700
+ f"{accessor_name}__{label_registry.__name__.lower()}"
701
+ ] = label
702
+ else:
703
+ new_expression[
704
+ f"{accessor_name}__{label_registry.__name__.lower()}__in"
705
+ ] = labels
718
706
  # if passing a list of records, we want to
719
707
  # find artifacts that are annotated by all of them at the same
720
708
  # time; hence, we don't want the __in construct that we use to match strings
721
709
  # https://laminlabs.slack.com/archives/C04FPE8V01W/p1688328084810609
722
- raise NotImplementedError
710
+ if not (new_expression):
711
+ raise NotImplementedError
723
712
  if cls == FeatureManager or cls == ParamManagerArtifact:
724
713
  return Artifact.objects.filter(**new_expression)
725
714
  elif cls == ParamManagerRun:
@@ -821,15 +810,14 @@ def _add_values(
821
810
  from .artifact import Artifact
822
811
 
823
812
  # rename to distinguish from the values inside the dict
824
- features_values = values
825
- keys = features_values.keys()
813
+ dictionary = values
814
+ keys = dictionary.keys()
826
815
  if isinstance(keys, DICT_KEYS_TYPE):
827
816
  keys = list(keys) # type: ignore
828
817
  # deal with other cases later
829
818
  assert all(isinstance(key, str) for key in keys) # noqa: S101
830
819
  registry = feature_param_field.field.model
831
820
  is_param = registry == Param
832
- model = Param if is_param else Feature
833
821
  value_model = ParamValue if is_param else FeatureValue
834
822
  model_name = "Param" if is_param else "Feature"
835
823
  if is_param:
@@ -842,13 +830,11 @@ def _add_values(
842
830
  raise ValidationError(
843
831
  "Can only set features for dataset-like artifacts."
844
832
  )
845
- validated = registry.validate(keys, field=feature_param_field, mute=True)
846
- keys_array = np.array(keys)
847
- keys_array[validated]
848
- if validated.sum() != len(keys):
849
- not_validated_keys = keys_array[~validated]
833
+ records = registry.from_values(keys, field=feature_param_field, mute=True)
834
+ if len(records) != len(keys):
835
+ not_validated_keys = [key for key in keys if key not in records.list("name")]
850
836
  not_validated_keys_dtype_message = [
851
- (key, infer_feature_type_convert_json(key, features_values[key]))
837
+ (key, infer_feature_type_convert_json(key, dictionary[key]))
852
838
  for key in not_validated_keys
853
839
  ]
854
840
  run = get_current_tracked_run()
@@ -866,7 +852,7 @@ def _add_values(
866
852
  ]
867
853
  hint = "\n".join(elements)
868
854
  msg = (
869
- f"These keys could not be validated: {not_validated_keys.tolist()}\n"
855
+ f"These keys could not be validated: {not_validated_keys}\n"
870
856
  f"Here is how to create a {model_name.lower()}:\n\n{hint}"
871
857
  )
872
858
  raise ValidationError(msg)
@@ -875,10 +861,10 @@ def _add_values(
875
861
  features_labels = defaultdict(list)
876
862
  _feature_values = []
877
863
  not_validated_values = []
878
- for key, value in features_values.items():
879
- feature = model.get(name=key)
864
+ for feature in records:
865
+ value = dictionary[feature.name]
880
866
  inferred_type, converted_value, _ = infer_feature_type_convert_json(
881
- key,
867
+ feature.name,
882
868
  value,
883
869
  mute=True,
884
870
  str_as_ulabel=str_as_ulabel,
@@ -886,25 +872,23 @@ def _add_values(
886
872
  if feature.dtype == "num":
887
873
  if inferred_type not in {"int", "float"}:
888
874
  raise TypeError(
889
- f"Value for feature '{key}' with type {feature.dtype} must be a number"
875
+ f"Value for feature '{feature.name}' with type {feature.dtype} must be a number"
890
876
  )
891
877
  elif feature.dtype.startswith("cat"):
892
878
  if inferred_type != "?":
893
879
  if not (inferred_type.startswith("cat") or isinstance(value, Record)):
894
880
  raise TypeError(
895
- f"Value for feature '{key}' with type '{feature.dtype}' must be a string or record."
881
+ f"Value for feature '{feature.name}' with type '{feature.dtype}' must be a string or record."
896
882
  )
897
883
  elif (feature.dtype == "str" and feature.dtype not in inferred_type) or (
898
884
  feature.dtype != "str" and feature.dtype != inferred_type
899
885
  ):
900
886
  raise ValidationError(
901
- f"Expected dtype for '{key}' is '{feature.dtype}', got '{inferred_type}'"
887
+ f"Expected dtype for '{feature.name}' is '{feature.dtype}', got '{inferred_type}'"
902
888
  )
903
889
  if not feature.dtype.startswith("cat"):
904
890
  filter_kwargs = {model_name.lower(): feature, "value": converted_value}
905
- feature_value = value_model.filter(**filter_kwargs).one_or_none()
906
- if feature_value is None:
907
- feature_value = value_model(**filter_kwargs)
891
+ feature_value, _ = value_model.get_or_create(**filter_kwargs)
908
892
  _feature_values.append(feature_value)
909
893
  else:
910
894
  if isinstance(value, Record) or (
@@ -942,30 +926,63 @@ def _add_values(
942
926
  (feature, label_record) for label_record in label_records
943
927
  ]
944
928
  if not_validated_values:
945
- hint = (
946
- f" ulabels = ln.ULabel.from_values({not_validated_values}, create=True)\n"
947
- f" ln.save(ulabels)"
948
- )
929
+ not_validated_values.sort()
930
+ hint = f" ulabels = ln.ULabel.from_values({not_validated_values}, create=True).save()\n"
949
931
  msg = (
950
932
  f"These values could not be validated: {not_validated_values}\n"
951
933
  f"Here is how to create ulabels for them:\n\n{hint}"
952
934
  )
953
935
  raise ValidationError(msg)
936
+ # TODO: create an explicit version of this
937
+ # if not is_param:
938
+ # # check if _expect_many is false for _all_ records
939
+ # if any(record._expect_many for record in records):
940
+ # updated_features = []
941
+ # for record in records:
942
+ # if record._expect_many:
943
+ # record._expect_many = False
944
+ # record.save()
945
+ # updated_features.append(record.name)
946
+ # if any(updated_features):
947
+ # logger.important(
948
+ # f"changed observational unit to Artifact for features: {', '.join(updated_features)}"
949
+ # )
954
950
  # bulk add all links
955
951
  if features_labels:
956
952
  add_label_feature_links(self, features_labels)
957
953
  if _feature_values:
958
- save(_feature_values)
954
+ to_insert_feature_values = [
955
+ record for record in _feature_values if record._state.adding
956
+ ]
957
+ if to_insert_feature_values:
958
+ save(to_insert_feature_values)
959
+ dict_typed_features = [
960
+ getattr(record, model_name.lower())
961
+ for record in _feature_values
962
+ if getattr(record, model_name.lower()).dtype == "dict"
963
+ ]
959
964
  if is_param:
960
965
  LinkORM = self._host._param_values.through
961
966
  valuefield_id = "paramvalue_id"
962
967
  else:
963
968
  LinkORM = self._host._feature_values.through
964
969
  valuefield_id = "featurevalue_id"
970
+ host_class_lower = self._host.__class__.__get_name_with_module__().lower()
971
+ if dict_typed_features:
972
+ # delete all previously existing anotations with dictionaries
973
+ kwargs = {
974
+ f"links_{host_class_lower}__{host_class_lower}_id": self._host.id,
975
+ f"{model_name.lower()}__in": dict_typed_features,
976
+ }
977
+ try:
978
+ value_model.filter(**kwargs).all().delete()
979
+ except ProtectedError:
980
+ pass
981
+ # add new feature links
965
982
  links = [
966
983
  LinkORM(
967
984
  **{
968
- f"{self._host.__class__.__get_name_with_module__().lower()}_id": self._host.id,
985
+ f"{host_class_lower}_id": self._host.id,
969
986
  valuefield_id: feature_value.id,
970
987
  }
971
988
  )
@@ -1092,52 +1109,6 @@ def _add_schema(self, schema: Schema, slot: str) -> None:
1092
1109
  self._slots[slot] = schema # type: ignore
1093
1110
 
1094
1111
 
1095
- def _add_set_from_df(
1096
- self,
1097
- field: FieldAttr = Feature.name,
1098
- organism: str | None = None,
1099
- mute: bool = False,
1100
- ):
1101
- """Add feature set corresponding to column names of DataFrame."""
1102
- assert self._host.otype == "DataFrame" # noqa: S101
1103
- df = self._host.load(is_run_input=False)
1104
- schema = Schema.from_df(
1105
- df=df,
1106
- field=field,
1107
- mute=mute,
1108
- organism=organism,
1109
- )
1110
- self._host._staged_feature_sets = {"columns": schema}
1111
- self._host.save()
1112
-
1113
-
1114
- def _add_set_from_anndata(
1115
- self,
1116
- var_field: FieldAttr | None = None,
1117
- obs_field: FieldAttr | None = Feature.name,
1118
- uns_field: FieldAttr | None = None,
1119
- mute: bool = False,
1120
- organism: str | Record | None = None,
1121
- ):
1122
- """Add features from AnnData."""
1123
- assert self._host.otype == "AnnData" # noqa: S101
1124
-
1125
- # parse and register features
1126
- adata = self._host.load(is_run_input=False)
1127
- feature_sets = parse_staged_feature_sets_from_anndata(
1128
- adata,
1129
- var_field=var_field,
1130
- obs_field=obs_field,
1131
- uns_field=uns_field,
1132
- mute=mute,
1133
- organism=organism,
1134
- )
1135
-
1136
- # link feature sets
1137
- self._host._staged_feature_sets = feature_sets
1138
- self._host.save()
1139
-
1140
-
1141
1112
  def _unify_staged_feature_sets_by_hash(
1142
1113
  feature_sets: MutableMapping[str, Schema],
1143
1114
  ):
@@ -1153,83 +1124,6 @@ def _unify_staged_feature_sets_by_hash(
1153
1124
  return feature_sets
1154
1125
 
1155
1126
 
1156
- def _add_set_from_mudata(
1157
- self,
1158
- var_fields: dict[str, FieldAttr] | None = None,
1159
- obs_fields: dict[str, FieldAttr] | None = None,
1160
- mute: bool = False,
1161
- organism: str | Record | None = None,
1162
- ):
1163
- """Add features from MuData."""
1164
- if obs_fields is None:
1165
- obs_fields = {}
1166
- assert self._host.otype == "MuData" # noqa: S101
1167
-
1168
- # parse and register features
1169
- mdata = self._host.load(is_run_input=False)
1170
- feature_sets = {}
1171
-
1172
- obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
1173
- if len(obs_features) > 0:
1174
- feature_sets["obs"] = Schema(features=obs_features)
1175
- for modality, field in var_fields.items():
1176
- modality_fs = parse_staged_feature_sets_from_anndata(
1177
- mdata[modality],
1178
- var_field=field,
1179
- obs_field=obs_fields.get(modality, Feature.name),
1180
- mute=mute,
1181
- organism=organism,
1182
- )
1183
- for k, v in modality_fs.items():
1184
- feature_sets[f"['{modality}'].{k}"] = v
1185
-
1186
- # link feature sets
1187
- self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1188
- self._host.save()
1189
-
1190
-
1191
- def _add_set_from_spatialdata(
1192
- self,
1193
- sample_metadata_key: str,
1194
- sample_metadata_field: FieldAttr = Feature.name,
1195
- var_fields: dict[str, FieldAttr] | None = None,
1196
- obs_fields: dict[str, FieldAttr] | None = None,
1197
- mute: bool = False,
1198
- organism: str | Record | None = None,
1199
- ):
1200
- """Add features from SpatialData."""
1201
- obs_fields, var_fields = obs_fields or {}, var_fields or {}
1202
- assert self._host.otype == "SpatialData" # noqa: S101
1203
-
1204
- # parse and register features
1205
- sdata = self._host.load(is_run_input=False)
1206
- feature_sets = {}
1207
-
1208
- # sample features
1209
- sample_features = Feature.from_values(
1210
- sdata.get_attrs(key=sample_metadata_key, return_as="df", flatten=True).columns,
1211
- field=sample_metadata_field,
1212
- ) # type: ignore
1213
- if len(sample_features) > 0:
1214
- feature_sets[sample_metadata_key] = Schema(features=sample_features)
1215
-
1216
- # table features
1217
- for table, field in var_fields.items():
1218
- table_fs = parse_staged_feature_sets_from_anndata(
1219
- sdata[table],
1220
- var_field=field,
1221
- obs_field=obs_fields.get(table, Feature.name),
1222
- mute=mute,
1223
- organism=organism,
1224
- )
1225
- for k, v in table_fs.items():
1226
- feature_sets[f"['{table}'].{k}"] = v
1227
-
1228
- # link feature sets
1229
- self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1230
- self._host.save()
1231
-
1232
-
1233
1127
  def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
1234
1128
  """Transfer features from a artifact or collection."""
1235
1129
  # This only covers feature sets
@@ -1340,25 +1234,200 @@ def _feature_set_by_slot(self):
1340
1234
  return self.slots
1341
1235
 
1342
1236
 
1237
+ # deprecated: feature set parsing
1238
+
1239
+
1240
+ def parse_staged_feature_sets_from_anndata(
1241
+ adata: AnnData,
1242
+ var_field: FieldAttr | None = None,
1243
+ obs_field: FieldAttr = Feature.name,
1244
+ uns_field: FieldAttr | None = None,
1245
+ mute: bool = False,
1246
+ organism: str | Record | None = None,
1247
+ ) -> dict:
1248
+ data_parse = adata
1249
+ if not isinstance(adata, AnnData): # is a path
1250
+ filepath = create_path(adata) # returns Path for local
1251
+ if not isinstance(filepath, LocalPathClasses):
1252
+ from lamindb import settings
1253
+ from lamindb.core.storage._backed_access import backed_access
1254
+
1255
+ using_key = settings._using_key
1256
+ data_parse = backed_access(filepath, using_key=using_key)
1257
+ else:
1258
+ data_parse = ad.read_h5ad(filepath, backed="r")
1259
+ type = "float"
1260
+ else:
1261
+ type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
1262
+ feature_sets = {}
1263
+ if var_field is not None:
1264
+ schema_var = Schema.from_values(
1265
+ data_parse.var.index,
1266
+ var_field,
1267
+ type=type,
1268
+ mute=mute,
1269
+ organism=organism,
1270
+ raise_validation_error=False,
1271
+ )
1272
+ if schema_var is not None:
1273
+ feature_sets["var"] = schema_var
1274
+ if obs_field is not None and len(data_parse.obs.columns) > 0:
1275
+ schema_obs = Schema.from_df(
1276
+ df=data_parse.obs,
1277
+ field=obs_field,
1278
+ mute=mute,
1279
+ organism=organism,
1280
+ )
1281
+ if schema_obs is not None:
1282
+ feature_sets["obs"] = schema_obs
1283
+ if uns_field is not None and len(data_parse.uns) > 0:
1284
+ validated_features = Feature.from_values( # type: ignore
1285
+ data_parse.uns.keys(), field=uns_field, organism=organism
1286
+ )
1287
+ if len(validated_features) > 0:
1288
+ schema_uns = Schema(validated_features, dtype=None, otype="dict")
1289
+ feature_sets["uns"] = schema_uns
1290
+ return feature_sets
1291
+
1292
+
1293
+ # no longer called from within curator
1294
+ # might deprecate in the future?
1295
+ def _add_set_from_df(
1296
+ self,
1297
+ field: FieldAttr = Feature.name,
1298
+ organism: str | None = None,
1299
+ mute: bool = False,
1300
+ ):
1301
+ """Add feature set corresponding to column names of DataFrame."""
1302
+ assert self._host.otype == "DataFrame" # noqa: S101
1303
+ df = self._host.load(is_run_input=False)
1304
+ schema = Schema.from_df(
1305
+ df=df,
1306
+ field=field,
1307
+ mute=mute,
1308
+ organism=organism,
1309
+ )
1310
+ self._host._staged_feature_sets = {"columns": schema}
1311
+ self._host.save()
1312
+
1313
+
1314
+ def _add_set_from_anndata(
1315
+ self,
1316
+ var_field: FieldAttr | None = None,
1317
+ obs_field: FieldAttr | None = Feature.name,
1318
+ uns_field: FieldAttr | None = None,
1319
+ mute: bool = False,
1320
+ organism: str | Record | None = None,
1321
+ ):
1322
+ """Add features from AnnData."""
1323
+ assert self._host.otype == "AnnData" # noqa: S101
1324
+
1325
+ # parse and register features
1326
+ adata = self._host.load(is_run_input=False)
1327
+ feature_sets = parse_staged_feature_sets_from_anndata(
1328
+ adata,
1329
+ var_field=var_field,
1330
+ obs_field=obs_field,
1331
+ uns_field=uns_field,
1332
+ mute=mute,
1333
+ organism=organism,
1334
+ )
1335
+
1336
+ # link feature sets
1337
+ self._host._staged_feature_sets = feature_sets
1338
+ self._host.save()
1339
+
1340
+
1341
+ def _add_set_from_mudata(
1342
+ self,
1343
+ var_fields: dict[str, FieldAttr] | None = None,
1344
+ obs_fields: dict[str, FieldAttr] | None = None,
1345
+ mute: bool = False,
1346
+ organism: str | Record | None = None,
1347
+ ):
1348
+ """Add features from MuData."""
1349
+ if obs_fields is None:
1350
+ obs_fields = {}
1351
+ assert self._host.otype == "MuData" # noqa: S101
1352
+
1353
+ # parse and register features
1354
+ mdata = self._host.load(is_run_input=False)
1355
+ feature_sets = {}
1356
+
1357
+ obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
1358
+ if len(obs_features) > 0:
1359
+ feature_sets["obs"] = Schema(features=obs_features)
1360
+ for modality, field in var_fields.items():
1361
+ modality_fs = parse_staged_feature_sets_from_anndata(
1362
+ mdata[modality],
1363
+ var_field=field,
1364
+ obs_field=obs_fields.get(modality, Feature.name),
1365
+ mute=mute,
1366
+ organism=organism,
1367
+ )
1368
+ for k, v in modality_fs.items():
1369
+ feature_sets[f"['{modality}'].{k}"] = v
1370
+
1371
+ # link feature sets
1372
+ self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1373
+ self._host.save()
1374
+
1375
+
1376
+ def _add_set_from_spatialdata(
1377
+ self,
1378
+ sample_metadata_key: str,
1379
+ sample_metadata_field: FieldAttr = Feature.name,
1380
+ var_fields: dict[str, FieldAttr] | None = None,
1381
+ obs_fields: dict[str, FieldAttr] | None = None,
1382
+ mute: bool = False,
1383
+ organism: str | Record | None = None,
1384
+ ):
1385
+ """Add features from SpatialData."""
1386
+ obs_fields, var_fields = obs_fields or {}, var_fields or {}
1387
+ assert self._host.otype == "SpatialData" # noqa: S101
1388
+
1389
+ # parse and register features
1390
+ sdata = self._host.load(is_run_input=False)
1391
+ feature_sets = {}
1392
+
1393
+ # sample features
1394
+ sample_features = Feature.from_values(
1395
+ sdata.get_attrs(key=sample_metadata_key, return_as="df", flatten=True).columns,
1396
+ field=sample_metadata_field,
1397
+ ) # type: ignore
1398
+ if len(sample_features) > 0:
1399
+ feature_sets[sample_metadata_key] = Schema(features=sample_features)
1400
+
1401
+ # table features
1402
+ for table, field in var_fields.items():
1403
+ table_fs = parse_staged_feature_sets_from_anndata(
1404
+ sdata[table],
1405
+ var_field=field,
1406
+ obs_field=obs_fields.get(table, Feature.name),
1407
+ mute=mute,
1408
+ organism=organism,
1409
+ )
1410
+ for k, v in table_fs.items():
1411
+ feature_sets[f"['{table}'].{k}"] = v
1412
+
1413
+ # link feature sets
1414
+ self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1415
+ self._host.save()
1416
+
1417
+
1343
1418
  # mypy: ignore-errors
1344
1419
  FeatureManager.__init__ = __init__
1345
1420
  ParamManager.__init__ = __init__
1346
1421
  FeatureManager.__repr__ = __repr__
1347
1422
  ParamManager.__repr__ = __repr__
1423
+ FeatureManager.describe = describe
1424
+ ParamManager.describe = describe
1348
1425
  FeatureManager.__getitem__ = __getitem__
1349
1426
  FeatureManager.get_values = get_values
1350
1427
  FeatureManager.slots = slots
1351
1428
  FeatureManager.add_values = add_values_features
1352
1429
  FeatureManager._add_schema = _add_schema
1353
- FeatureManager.add_schema = add_schema # deprecated
1354
- FeatureManager.add_feature_set = add_feature_set # deprecated
1355
- FeatureManager._schema_by_slot = _schema_by_slot # deprecated
1356
- FeatureManager._feature_set_by_slot = _feature_set_by_slot # deprecated
1357
1430
  FeatureManager._accessor_by_registry = _accessor_by_registry
1358
- FeatureManager._add_set_from_df = _add_set_from_df
1359
- FeatureManager._add_set_from_anndata = _add_set_from_anndata
1360
- FeatureManager._add_set_from_mudata = _add_set_from_mudata
1361
- FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
1362
1431
  FeatureManager._add_from = _add_from
1363
1432
  FeatureManager.filter = filter
1364
1433
  FeatureManager.get = get
@@ -1367,3 +1436,13 @@ FeatureManager.remove_values = remove_values
1367
1436
  ParamManager.add_values = add_values_params
1368
1437
  ParamManager.get_values = get_values
1369
1438
  ParamManager.filter = filter
1439
+
1440
+ # deprecated
1441
+ FeatureManager._add_set_from_df = _add_set_from_df
1442
+ FeatureManager._add_set_from_anndata = _add_set_from_anndata
1443
+ FeatureManager._add_set_from_mudata = _add_set_from_mudata
1444
+ FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
1445
+ FeatureManager.add_schema = add_schema
1446
+ FeatureManager.add_feature_set = add_feature_set
1447
+ FeatureManager._schema_by_slot = _schema_by_slot
1448
+ FeatureManager._feature_set_by_slot = _feature_set_by_slot