lamindb 1.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. lamindb/__init__.py +52 -36
  2. lamindb/_finish.py +17 -10
  3. lamindb/_tracked.py +1 -1
  4. lamindb/base/__init__.py +3 -1
  5. lamindb/base/fields.py +40 -22
  6. lamindb/base/ids.py +1 -94
  7. lamindb/base/types.py +2 -0
  8. lamindb/base/uids.py +117 -0
  9. lamindb/core/_context.py +216 -133
  10. lamindb/core/_settings.py +38 -25
  11. lamindb/core/datasets/__init__.py +11 -4
  12. lamindb/core/datasets/_core.py +5 -5
  13. lamindb/core/datasets/_small.py +0 -93
  14. lamindb/core/datasets/mini_immuno.py +172 -0
  15. lamindb/core/loaders.py +1 -1
  16. lamindb/core/storage/_backed_access.py +100 -6
  17. lamindb/core/storage/_polars_lazy_df.py +51 -0
  18. lamindb/core/storage/_pyarrow_dataset.py +15 -30
  19. lamindb/core/storage/objects.py +6 -0
  20. lamindb/core/subsettings/__init__.py +2 -0
  21. lamindb/core/subsettings/_annotation_settings.py +11 -0
  22. lamindb/curators/__init__.py +7 -3559
  23. lamindb/curators/_legacy.py +2056 -0
  24. lamindb/curators/core.py +1546 -0
  25. lamindb/errors.py +11 -0
  26. lamindb/examples/__init__.py +27 -0
  27. lamindb/examples/schemas/__init__.py +12 -0
  28. lamindb/examples/schemas/_anndata.py +25 -0
  29. lamindb/examples/schemas/_simple.py +19 -0
  30. lamindb/integrations/_vitessce.py +8 -5
  31. lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
  32. lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
  33. lamindb/models/__init__.py +12 -2
  34. lamindb/models/_describe.py +21 -4
  35. lamindb/models/_feature_manager.py +384 -301
  36. lamindb/models/_from_values.py +1 -1
  37. lamindb/models/_is_versioned.py +5 -15
  38. lamindb/models/_label_manager.py +8 -2
  39. lamindb/models/artifact.py +354 -177
  40. lamindb/models/artifact_set.py +122 -0
  41. lamindb/models/can_curate.py +4 -1
  42. lamindb/models/collection.py +79 -56
  43. lamindb/models/core.py +1 -1
  44. lamindb/models/feature.py +78 -47
  45. lamindb/models/has_parents.py +24 -9
  46. lamindb/models/project.py +3 -3
  47. lamindb/models/query_manager.py +221 -22
  48. lamindb/models/query_set.py +251 -206
  49. lamindb/models/record.py +211 -344
  50. lamindb/models/run.py +59 -5
  51. lamindb/models/save.py +9 -5
  52. lamindb/models/schema.py +673 -196
  53. lamindb/models/transform.py +5 -14
  54. lamindb/models/ulabel.py +8 -5
  55. {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/METADATA +8 -7
  56. lamindb-1.5.0.dist-info/RECORD +108 -0
  57. lamindb-1.3.2.dist-info/RECORD +0 -95
  58. {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
  59. {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0
@@ -13,7 +13,7 @@ import pandas as pd
13
13
  from anndata import AnnData
14
14
  from django.contrib.postgres.aggregates import ArrayAgg
15
15
  from django.db import connections
16
- from django.db.models import Aggregate
16
+ from django.db.models import Aggregate, ProtectedError, Subquery
17
17
  from lamin_utils import logger
18
18
  from lamindb_setup.core.hashing import hash_set
19
19
  from lamindb_setup.core.upath import create_path
@@ -42,7 +42,7 @@ from ._describe import (
42
42
  TYPE_WIDTH,
43
43
  VALUES_WIDTH,
44
44
  describe_header,
45
- print_rich_tree,
45
+ format_rich_tree,
46
46
  )
47
47
  from ._django import get_artifact_with_related
48
48
  from ._label_manager import _get_labels, describe_labels
@@ -313,19 +313,34 @@ def describe_features(
313
313
  fs_data = _get_schemas_postgres(self, related_data=related_data)
314
314
  for fs_id, (slot, data) in fs_data.items():
315
315
  for registry_str, feature_names in data.items():
316
+ # prevent projects from showing up as features
317
+ if registry_str == "Project":
318
+ continue
316
319
  schema = Schema.objects.using(self._state.db).get(id=fs_id)
317
320
  schema_data[slot] = (schema, feature_names)
318
321
  for feature_name in feature_names:
319
322
  feature_data[feature_name] = (slot, registry_str)
323
+ schema_data.update(
324
+ {
325
+ slot: (schema, schema.n)
326
+ for slot, schema in get_schema_by_slot_(self).items()
327
+ if slot not in schema_data
328
+ }
329
+ )
320
330
  else:
321
331
  for slot, schema in get_schema_by_slot_(self).items():
322
332
  features = schema.members
323
- # features.first() is a lot slower than features[0] here
324
- name_field = get_name_field(features[0])
325
- feature_names = list(features.values_list(name_field, flat=True)[:20])
326
- schema_data[slot] = (schema, feature_names)
327
- for feature_name in feature_names:
328
- feature_data[feature_name] = (slot, schema.itype)
333
+ if features.exists():
334
+ # features.first() is a lot slower than features[0] here
335
+ name_field = get_name_field(features[0])
336
+ feature_names = list(
337
+ features.values_list(name_field, flat=True)[:20]
338
+ )
339
+ schema_data[slot] = (schema, feature_names)
340
+ for feature_name in feature_names:
341
+ feature_data[feature_name] = (slot, schema.itype)
342
+ else:
343
+ schema_data[slot] = (schema, schema.n)
329
344
 
330
345
  internal_feature_names: dict[str, str] = {}
331
346
  if isinstance(self, Artifact):
@@ -397,38 +412,44 @@ def describe_features(
397
412
  internal_feature_labels_slot.setdefault(slot, []).append(feature_row)
398
413
 
399
414
  int_features_tree_children = []
400
- for slot, (schema, feature_names) in schema_data.items():
401
- if slot in internal_feature_labels_slot:
402
- # add internal Feature features with labels
403
- feature_rows = internal_feature_labels_slot[slot]
404
- # add internal Feature features without labels
405
- feature_rows += [
406
- (
407
- feature_name,
408
- Text(str(internal_feature_names.get(feature_name)), style="dim"),
409
- "",
410
- )
411
- for feature_name in feature_names
412
- if feature_name and feature_name not in internal_feature_labels
413
- ]
415
+ for slot, (schema, feature_names_or_n) in schema_data.items():
416
+ if isinstance(feature_names_or_n, int):
417
+ feature_rows = []
414
418
  else:
415
- # add internal non-Feature features without labels
416
- feature_rows = [
417
- (
418
- feature_name,
419
- Text(
420
- str(
421
- internal_feature_names.get(feature_name)
422
- if feature_name in internal_feature_names
423
- else schema.dtype
419
+ feature_names = feature_names_or_n
420
+ if slot in internal_feature_labels_slot:
421
+ # add internal Feature features with labels
422
+ feature_rows = internal_feature_labels_slot[slot]
423
+ # add internal Feature features without labels
424
+ feature_rows += [
425
+ (
426
+ feature_name,
427
+ Text(
428
+ str(internal_feature_names.get(feature_name)), style="dim"
424
429
  ),
425
- style="dim",
426
- ),
427
- "",
428
- )
429
- for feature_name in feature_names
430
- if feature_name
431
- ]
430
+ "",
431
+ )
432
+ for feature_name in feature_names
433
+ if feature_name and feature_name not in internal_feature_labels
434
+ ]
435
+ else:
436
+ # add internal non-Feature features without labels
437
+ feature_rows = [
438
+ (
439
+ feature_name,
440
+ Text(
441
+ str(
442
+ internal_feature_names.get(feature_name)
443
+ if feature_name in internal_feature_names
444
+ else schema.dtype
445
+ ),
446
+ style="dim",
447
+ ),
448
+ "",
449
+ )
450
+ for feature_name in feature_names
451
+ if feature_name
452
+ ]
432
453
  int_features_tree_children.append(
433
454
  _create_feature_table(
434
455
  Text.assemble(
@@ -446,8 +467,6 @@ def describe_features(
446
467
  dataset_tree = tree.add(
447
468
  Text.assemble(
448
469
  ("Dataset features", "bold bright_magenta"),
449
- ("/", "dim"),
450
- (".feature_sets", "dim bold"),
451
470
  )
452
471
  )
453
472
  for child in int_features_tree_children:
@@ -481,59 +500,6 @@ def describe_features(
481
500
  return tree
482
501
 
483
502
 
484
- def parse_staged_feature_sets_from_anndata(
485
- adata: AnnData,
486
- var_field: FieldAttr | None = None,
487
- obs_field: FieldAttr = Feature.name,
488
- uns_field: FieldAttr | None = None,
489
- mute: bool = False,
490
- organism: str | Record | None = None,
491
- ) -> dict:
492
- data_parse = adata
493
- if not isinstance(adata, AnnData): # is a path
494
- filepath = create_path(adata) # returns Path for local
495
- if not isinstance(filepath, LocalPathClasses):
496
- from lamindb import settings
497
- from lamindb.core.storage._backed_access import backed_access
498
-
499
- using_key = settings._using_key
500
- data_parse = backed_access(filepath, using_key=using_key)
501
- else:
502
- data_parse = ad.read_h5ad(filepath, backed="r")
503
- type = "float"
504
- else:
505
- type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
506
- feature_sets = {}
507
- if var_field is not None:
508
- schema_var = Schema.from_values(
509
- data_parse.var.index,
510
- var_field,
511
- type=type,
512
- mute=mute,
513
- organism=organism,
514
- raise_validation_error=False,
515
- )
516
- if schema_var is not None:
517
- feature_sets["var"] = schema_var
518
- if obs_field is not None and len(data_parse.obs.columns) > 0:
519
- schema_obs = Schema.from_df(
520
- df=data_parse.obs,
521
- field=obs_field,
522
- mute=mute,
523
- organism=organism,
524
- )
525
- if schema_obs is not None:
526
- feature_sets["obs"] = schema_obs
527
- if uns_field is not None and len(data_parse.uns) > 0:
528
- validated_features = Feature.from_values( # type: ignore
529
- data_parse.uns.keys(), field=uns_field, organism=organism
530
- )
531
- if len(validated_features) > 0:
532
- schema_uns = Schema(validated_features, dtype=None, otype="dict")
533
- feature_sets["uns"] = schema_uns
534
- return feature_sets
535
-
536
-
537
503
  def is_valid_datetime_str(date_string: str) -> bool | str:
538
504
  try:
539
505
  dt = datetime.fromisoformat(date_string)
@@ -624,8 +590,12 @@ def __init__(self, host: Artifact | Collection | Run):
624
590
 
625
591
 
626
592
  def __repr__(self) -> str:
593
+ return describe(self, return_str=True) # type: ignore
594
+
595
+
596
+ def describe(self, return_str: bool = False) -> str | None:
627
597
  tree = describe_features(self._host, print_params=(self.__class__ == ParamManager)) # type: ignore
628
- return print_rich_tree(tree, fallback="no linked features")
598
+ return format_rich_tree(tree, fallback="no linked features", return_str=return_str)
629
599
 
630
600
 
631
601
  def get_values(self) -> dict[str, Any]:
@@ -648,7 +618,7 @@ def __getitem__(self, slot) -> QuerySet:
648
618
  return getattr(schema, self._accessor_by_registry[orm_name]).all()
649
619
 
650
620
 
651
- def filter_base(cls, **expression):
621
+ def filter_base(cls, _skip_validation: bool = True, **expression) -> QuerySet:
652
622
  from .artifact import Artifact
653
623
 
654
624
  if cls is FeatureManager:
@@ -658,11 +628,12 @@ def filter_base(cls, **expression):
658
628
  model = Param
659
629
  value_model = ParamValue
660
630
  keys_normalized = [key.split("__")[0] for key in expression]
661
- validated = model.validate(keys_normalized, field="name", mute=True)
662
- if sum(validated) != len(keys_normalized):
663
- raise ValidationError(
664
- f"Some keys in the filter expression are not registered as features: {np.array(keys_normalized)[~validated]}"
665
- )
631
+ if not _skip_validation:
632
+ validated = model.validate(keys_normalized, field="name", mute=True)
633
+ if sum(validated) != len(keys_normalized):
634
+ raise ValidationError(
635
+ f"Some keys in the filter expression are not registered as features: {np.array(keys_normalized)[~validated]}"
636
+ )
666
637
  new_expression = {}
667
638
  features = model.filter(name__in=keys_normalized).all().distinct()
668
639
  feature_param = "param" if model is Param else "feature"
@@ -674,76 +645,98 @@ def filter_base(cls, **expression):
674
645
  comparator = f"__{split_key[1]}"
675
646
  feature = features.get(name=normalized_key)
676
647
  if not feature.dtype.startswith("cat"):
677
- expression = {feature_param: feature, f"value{comparator}": value}
678
- feature_value = value_model.filter(**expression)
679
- new_expression[f"_{feature_param}_values__in"] = feature_value
680
- elif isinstance(value, (str, Record)):
681
- # because SQL is sensitive to whether querying with __in or not
682
- # and might return multiple equivalent records for the latter
683
- # we distinguish cases in which we have multiple label matches vs. one
684
- label = None
685
- labels = None
686
- if isinstance(value, str):
687
- # we need the comparator here because users might query like so
688
- # ln.Artifact.features.filter(experiment__contains="Experi")
689
- expression = {f"name{comparator}": value}
690
- labels = ULabel.filter(**expression).all()
691
- if len(labels) == 0:
692
- raise DoesNotExist(
693
- f"Did not find a ULabel matching `name{comparator}={value}`"
648
+ if comparator == "__isnull":
649
+ if cls == FeatureManager:
650
+ from .artifact import ArtifactFeatureValue
651
+
652
+ return Artifact.objects.exclude(
653
+ id__in=Subquery(
654
+ ArtifactFeatureValue.objects.filter(
655
+ featurevalue__feature=feature
656
+ ).values("artifact_id")
657
+ )
694
658
  )
695
- elif len(labels) == 1:
696
- label = labels[0]
697
- elif isinstance(value, Record):
698
- label = value
699
- label_registry = (
700
- label.__class__ if label is not None else labels[0].__class__
701
- )
702
- accessor_name = (
703
- label_registry.artifacts.through.artifact.field._related_name
704
- )
705
- new_expression[f"{accessor_name}__feature"] = feature
706
- if label is not None:
707
- # simplified query if we have exactly one label
708
- new_expression[
709
- f"{accessor_name}__{label_registry.__name__.lower()}"
710
- ] = label
659
+ if comparator in {"__startswith", "__contains"}:
660
+ logger.important(
661
+ f"currently not supporting `{comparator}`, using `__icontains` instead"
662
+ )
663
+ comparator = "__icontains"
664
+ expression = {feature_param: feature, f"value{comparator}": value}
665
+ feature_values = value_model.filter(**expression)
666
+ new_expression[f"_{feature_param}_values__id__in"] = feature_values
667
+ elif isinstance(value, (str, Record, bool)):
668
+ if comparator == "__isnull":
669
+ if cls == FeatureManager:
670
+ return Artifact.objects.exclude(links_ulabel__feature=feature)
711
671
  else:
712
- new_expression[
713
- f"{accessor_name}__{label_registry.__name__.lower()}__in"
714
- ] = labels
715
- else:
672
+ # because SQL is sensitive to whether querying with __in or not
673
+ # and might return multiple equivalent records for the latter
674
+ # we distinguish cases in which we have multiple label matches vs. one
675
+ label = None
676
+ labels = None
677
+ if isinstance(value, str):
678
+ # we need the comparator here because users might query like so
679
+ # ln.Artifact.filter(experiment__contains="Experi")
680
+ expression = {f"name{comparator}": value}
681
+ labels = ULabel.filter(**expression).all()
682
+ if len(labels) == 0:
683
+ raise DoesNotExist(
684
+ f"Did not find a ULabel matching `name{comparator}={value}`"
685
+ )
686
+ elif len(labels) == 1:
687
+ label = labels[0]
688
+ elif isinstance(value, Record):
689
+ label = value
690
+ label_registry = (
691
+ label.__class__ if label is not None else labels[0].__class__
692
+ )
693
+ accessor_name = (
694
+ label_registry.artifacts.through.artifact.field._related_name
695
+ )
696
+ new_expression[f"{accessor_name}__feature"] = feature
697
+ if label is not None:
698
+ # simplified query if we have exactly one label
699
+ new_expression[
700
+ f"{accessor_name}__{label_registry.__name__.lower()}"
701
+ ] = label
702
+ else:
703
+ new_expression[
704
+ f"{accessor_name}__{label_registry.__name__.lower()}__in"
705
+ ] = labels
716
706
  # if passing a list of records, we want to
717
707
  # find artifacts that are annotated by all of them at the same
718
708
  # time; hence, we don't want the __in construct that we use to match strings
719
709
  # https://laminlabs.slack.com/archives/C04FPE8V01W/p1688328084810609
720
- raise NotImplementedError
710
+ if not (new_expression):
711
+ raise NotImplementedError
721
712
  if cls == FeatureManager or cls == ParamManagerArtifact:
722
- return Artifact.filter(**new_expression)
713
+ return Artifact.objects.filter(**new_expression)
723
714
  elif cls == ParamManagerRun:
724
- return Run.filter(**new_expression)
715
+ return Run.objects.filter(**new_expression)
725
716
 
726
717
 
727
718
  @classmethod # type: ignore
719
+ @deprecated("the filter() registry classmethod")
728
720
  def filter(cls, **expression) -> QuerySet:
729
721
  """Query artifacts by features."""
730
- return filter_base(cls, **expression)
722
+ return filter_base(cls, _skip_validation=False, **expression)
731
723
 
732
724
 
733
725
  @classmethod # type: ignore
726
+ @deprecated("the filter() registry classmethod")
734
727
  def get(cls, **expression) -> Record:
735
728
  """Query a single artifact by feature."""
736
- return filter_base(cls, **expression).one()
729
+ return filter_base(cls, _skip_validation=False, **expression).one()
737
730
 
738
731
 
739
732
  @property # type: ignore
740
733
  def slots(self) -> dict[str, Schema]:
741
734
  """Schema by slot.
742
735
 
743
- Example:
736
+ Example::
744
737
 
745
- >>> artifact.features.slots
746
- {'var': <Schema: var>, 'obs': <Schema: obs>}
738
+ artifact.features.slots
739
+ #> {'var': <Schema: var>, 'obs': <Schema: obs>}
747
740
  """
748
741
  if self._slots is None:
749
742
  self._slots = get_schema_by_slot_(self._host)
@@ -817,15 +810,14 @@ def _add_values(
817
810
  from .artifact import Artifact
818
811
 
819
812
  # rename to distinguish from the values inside the dict
820
- features_values = values
821
- keys = features_values.keys()
813
+ dictionary = values
814
+ keys = dictionary.keys()
822
815
  if isinstance(keys, DICT_KEYS_TYPE):
823
816
  keys = list(keys) # type: ignore
824
817
  # deal with other cases later
825
818
  assert all(isinstance(key, str) for key in keys) # noqa: S101
826
819
  registry = feature_param_field.field.model
827
820
  is_param = registry == Param
828
- model = Param if is_param else Feature
829
821
  value_model = ParamValue if is_param else FeatureValue
830
822
  model_name = "Param" if is_param else "Feature"
831
823
  if is_param:
@@ -838,13 +830,11 @@ def _add_values(
838
830
  raise ValidationError(
839
831
  "Can only set features for dataset-like artifacts."
840
832
  )
841
- validated = registry.validate(keys, field=feature_param_field, mute=True)
842
- keys_array = np.array(keys)
843
- keys_array[validated]
844
- if validated.sum() != len(keys):
845
- not_validated_keys = keys_array[~validated]
833
+ records = registry.from_values(keys, field=feature_param_field, mute=True)
834
+ if len(records) != len(keys):
835
+ not_validated_keys = [key for key in keys if key not in records.list("name")]
846
836
  not_validated_keys_dtype_message = [
847
- (key, infer_feature_type_convert_json(key, features_values[key]))
837
+ (key, infer_feature_type_convert_json(key, dictionary[key]))
848
838
  for key in not_validated_keys
849
839
  ]
850
840
  run = get_current_tracked_run()
@@ -862,7 +852,7 @@ def _add_values(
862
852
  ]
863
853
  hint = "\n".join(elements)
864
854
  msg = (
865
- f"These keys could not be validated: {not_validated_keys.tolist()}\n"
855
+ f"These keys could not be validated: {not_validated_keys}\n"
866
856
  f"Here is how to create a {model_name.lower()}:\n\n{hint}"
867
857
  )
868
858
  raise ValidationError(msg)
@@ -871,10 +861,10 @@ def _add_values(
871
861
  features_labels = defaultdict(list)
872
862
  _feature_values = []
873
863
  not_validated_values = []
874
- for key, value in features_values.items():
875
- feature = model.get(name=key)
864
+ for feature in records:
865
+ value = dictionary[feature.name]
876
866
  inferred_type, converted_value, _ = infer_feature_type_convert_json(
877
- key,
867
+ feature.name,
878
868
  value,
879
869
  mute=True,
880
870
  str_as_ulabel=str_as_ulabel,
@@ -882,25 +872,23 @@ def _add_values(
882
872
  if feature.dtype == "num":
883
873
  if inferred_type not in {"int", "float"}:
884
874
  raise TypeError(
885
- f"Value for feature '{key}' with type {feature.dtype} must be a number"
875
+ f"Value for feature '{feature.name}' with type {feature.dtype} must be a number"
886
876
  )
887
877
  elif feature.dtype.startswith("cat"):
888
878
  if inferred_type != "?":
889
879
  if not (inferred_type.startswith("cat") or isinstance(value, Record)):
890
880
  raise TypeError(
891
- f"Value for feature '{key}' with type '{feature.dtype}' must be a string or record."
881
+ f"Value for feature '{feature.name}' with type '{feature.dtype}' must be a string or record."
892
882
  )
893
883
  elif (feature.dtype == "str" and feature.dtype not in inferred_type) or (
894
884
  feature.dtype != "str" and feature.dtype != inferred_type
895
885
  ):
896
886
  raise ValidationError(
897
- f"Expected dtype for '{key}' is '{feature.dtype}', got '{inferred_type}'"
887
+ f"Expected dtype for '{feature.name}' is '{feature.dtype}', got '{inferred_type}'"
898
888
  )
899
889
  if not feature.dtype.startswith("cat"):
900
890
  filter_kwargs = {model_name.lower(): feature, "value": converted_value}
901
- feature_value = value_model.filter(**filter_kwargs).one_or_none()
902
- if feature_value is None:
903
- feature_value = value_model(**filter_kwargs)
891
+ feature_value, _ = value_model.get_or_create(**filter_kwargs)
904
892
  _feature_values.append(feature_value)
905
893
  else:
906
894
  if isinstance(value, Record) or (
@@ -938,30 +926,63 @@ def _add_values(
938
926
  (feature, label_record) for label_record in label_records
939
927
  ]
940
928
  if not_validated_values:
941
- hint = (
942
- f" ulabels = ln.ULabel.from_values({not_validated_values}, create=True)\n"
943
- f" ln.save(ulabels)"
944
- )
929
+ not_validated_values.sort()
930
+ hint = f" ulabels = ln.ULabel.from_values({not_validated_values}, create=True).save()\n"
945
931
  msg = (
946
932
  f"These values could not be validated: {not_validated_values}\n"
947
933
  f"Here is how to create ulabels for them:\n\n{hint}"
948
934
  )
949
935
  raise ValidationError(msg)
936
+ # TODO: create an explicit version of this
937
+ # if not is_param:
938
+ # # check if _expect_many is false for _all_ records
939
+ # if any(record._expect_many for record in records):
940
+ # updated_features = []
941
+ # for record in records:
942
+ # if record._expect_many:
943
+ # record._expect_many = False
944
+ # record.save()
945
+ # updated_features.append(record.name)
946
+ # if any(updated_features):
947
+ # logger.important(
948
+ # f"changed observational unit to Artifact for features: {', '.join(updated_features)}"
949
+ # )
950
950
  # bulk add all links
951
951
  if features_labels:
952
952
  add_label_feature_links(self, features_labels)
953
953
  if _feature_values:
954
- save(_feature_values)
954
+ to_insert_feature_values = [
955
+ record for record in _feature_values if record._state.adding
956
+ ]
957
+ if to_insert_feature_values:
958
+ save(to_insert_feature_values)
959
+ dict_typed_features = [
960
+ getattr(record, model_name.lower())
961
+ for record in _feature_values
962
+ if getattr(record, model_name.lower()).dtype == "dict"
963
+ ]
955
964
  if is_param:
956
965
  LinkORM = self._host._param_values.through
957
966
  valuefield_id = "paramvalue_id"
958
967
  else:
959
968
  LinkORM = self._host._feature_values.through
960
969
  valuefield_id = "featurevalue_id"
970
+ host_class_lower = self._host.__class__.__get_name_with_module__().lower()
971
+ if dict_typed_features:
972
+ # delete all previously existing annotations with dictionaries
973
+ kwargs = {
974
+ f"links_{host_class_lower}__{host_class_lower}_id": self._host.id,
975
+ f"{model_name.lower()}__in": dict_typed_features,
976
+ }
977
+ try:
978
+ value_model.filter(**kwargs).all().delete()
979
+ except ProtectedError:
980
+ pass
981
+ # add new feature links
961
982
  links = [
962
983
  LinkORM(
963
984
  **{
964
- f"{self._host.__class__.__get_name_with_module__().lower()}_id": self._host.id,
985
+ f"{host_class_lower}_id": self._host.id,
965
986
  valuefield_id: feature_value.id,
966
987
  }
967
988
  )
@@ -1088,52 +1109,6 @@ def _add_schema(self, schema: Schema, slot: str) -> None:
1088
1109
  self._slots[slot] = schema # type: ignore
1089
1110
 
1090
1111
 
1091
- def _add_set_from_df(
1092
- self,
1093
- field: FieldAttr = Feature.name,
1094
- organism: str | None = None,
1095
- mute: bool = False,
1096
- ):
1097
- """Add feature set corresponding to column names of DataFrame."""
1098
- assert self._host.otype == "DataFrame" # noqa: S101
1099
- df = self._host.load()
1100
- schema = Schema.from_df(
1101
- df=df,
1102
- field=field,
1103
- mute=mute,
1104
- organism=organism,
1105
- )
1106
- self._host._staged_feature_sets = {"columns": schema}
1107
- self._host.save()
1108
-
1109
-
1110
- def _add_set_from_anndata(
1111
- self,
1112
- var_field: FieldAttr | None = None,
1113
- obs_field: FieldAttr | None = Feature.name,
1114
- uns_field: FieldAttr | None = None,
1115
- mute: bool = False,
1116
- organism: str | Record | None = None,
1117
- ):
1118
- """Add features from AnnData."""
1119
- assert self._host.otype == "AnnData" # noqa: S101
1120
-
1121
- # parse and register features
1122
- adata = self._host.load()
1123
- feature_sets = parse_staged_feature_sets_from_anndata(
1124
- adata,
1125
- var_field=var_field,
1126
- obs_field=obs_field,
1127
- uns_field=uns_field,
1128
- mute=mute,
1129
- organism=organism,
1130
- )
1131
-
1132
- # link feature sets
1133
- self._host._staged_feature_sets = feature_sets
1134
- self._host.save()
1135
-
1136
-
1137
1112
  def _unify_staged_feature_sets_by_hash(
1138
1113
  feature_sets: MutableMapping[str, Schema],
1139
1114
  ):
@@ -1149,83 +1124,6 @@ def _unify_staged_feature_sets_by_hash(
1149
1124
  return feature_sets
1150
1125
 
1151
1126
 
1152
- def _add_set_from_mudata(
1153
- self,
1154
- var_fields: dict[str, FieldAttr] | None = None,
1155
- obs_fields: dict[str, FieldAttr] | None = None,
1156
- mute: bool = False,
1157
- organism: str | Record | None = None,
1158
- ):
1159
- """Add features from MuData."""
1160
- if obs_fields is None:
1161
- obs_fields = {}
1162
- assert self._host.otype == "MuData" # noqa: S101
1163
-
1164
- # parse and register features
1165
- mdata = self._host.load()
1166
- feature_sets = {}
1167
-
1168
- obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
1169
- if len(obs_features) > 0:
1170
- feature_sets["obs"] = Schema(features=obs_features)
1171
- for modality, field in var_fields.items():
1172
- modality_fs = parse_staged_feature_sets_from_anndata(
1173
- mdata[modality],
1174
- var_field=field,
1175
- obs_field=obs_fields.get(modality, Feature.name),
1176
- mute=mute,
1177
- organism=organism,
1178
- )
1179
- for k, v in modality_fs.items():
1180
- feature_sets[f"['{modality}'].{k}"] = v
1181
-
1182
- # link feature sets
1183
- self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1184
- self._host.save()
1185
-
1186
-
1187
- def _add_set_from_spatialdata(
1188
- self,
1189
- sample_metadata_key: str,
1190
- sample_metadata_field: FieldAttr = Feature.name,
1191
- var_fields: dict[str, FieldAttr] | None = None,
1192
- obs_fields: dict[str, FieldAttr] | None = None,
1193
- mute: bool = False,
1194
- organism: str | Record | None = None,
1195
- ):
1196
- """Add features from SpatialData."""
1197
- obs_fields, var_fields = obs_fields or {}, var_fields or {}
1198
- assert self._host.otype == "SpatialData" # noqa: S101
1199
-
1200
- # parse and register features
1201
- sdata = self._host.load()
1202
- feature_sets = {}
1203
-
1204
- # sample features
1205
- sample_features = Feature.from_values(
1206
- sdata.get_attrs(key=sample_metadata_key, return_as="df", flatten=True).columns,
1207
- field=sample_metadata_field,
1208
- ) # type: ignore
1209
- if len(sample_features) > 0:
1210
- feature_sets[sample_metadata_key] = Schema(features=sample_features)
1211
-
1212
- # table features
1213
- for table, field in var_fields.items():
1214
- table_fs = parse_staged_feature_sets_from_anndata(
1215
- sdata[table],
1216
- var_field=field,
1217
- obs_field=obs_fields.get(table, Feature.name),
1218
- mute=mute,
1219
- organism=organism,
1220
- )
1221
- for k, v in table_fs.items():
1222
- feature_sets[f"['{table}'].{k}"] = v
1223
-
1224
- # link feature sets
1225
- self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1226
- self._host.save()
1227
-
1228
-
1229
1127
  def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
1230
1128
  """Transfer features from a artifact or collection."""
1231
1129
  # This only covers feature sets
@@ -1336,25 +1234,200 @@ def _feature_set_by_slot(self):
1336
1234
  return self.slots
1337
1235
 
1338
1236
 
1237
+ # deprecated: feature set parsing
1238
+
1239
+
1240
+ def parse_staged_feature_sets_from_anndata(
1241
+ adata: AnnData,
1242
+ var_field: FieldAttr | None = None,
1243
+ obs_field: FieldAttr = Feature.name,
1244
+ uns_field: FieldAttr | None = None,
1245
+ mute: bool = False,
1246
+ organism: str | Record | None = None,
1247
+ ) -> dict:
1248
+ data_parse = adata
1249
+ if not isinstance(adata, AnnData): # is a path
1250
+ filepath = create_path(adata) # returns Path for local
1251
+ if not isinstance(filepath, LocalPathClasses):
1252
+ from lamindb import settings
1253
+ from lamindb.core.storage._backed_access import backed_access
1254
+
1255
+ using_key = settings._using_key
1256
+ data_parse = backed_access(filepath, using_key=using_key)
1257
+ else:
1258
+ data_parse = ad.read_h5ad(filepath, backed="r")
1259
+ type = "float"
1260
+ else:
1261
+ type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
1262
+ feature_sets = {}
1263
+ if var_field is not None:
1264
+ schema_var = Schema.from_values(
1265
+ data_parse.var.index,
1266
+ var_field,
1267
+ type=type,
1268
+ mute=mute,
1269
+ organism=organism,
1270
+ raise_validation_error=False,
1271
+ )
1272
+ if schema_var is not None:
1273
+ feature_sets["var"] = schema_var
1274
+ if obs_field is not None and len(data_parse.obs.columns) > 0:
1275
+ schema_obs = Schema.from_df(
1276
+ df=data_parse.obs,
1277
+ field=obs_field,
1278
+ mute=mute,
1279
+ organism=organism,
1280
+ )
1281
+ if schema_obs is not None:
1282
+ feature_sets["obs"] = schema_obs
1283
+ if uns_field is not None and len(data_parse.uns) > 0:
1284
+ validated_features = Feature.from_values( # type: ignore
1285
+ data_parse.uns.keys(), field=uns_field, organism=organism
1286
+ )
1287
+ if len(validated_features) > 0:
1288
+ schema_uns = Schema(validated_features, dtype=None, otype="dict")
1289
+ feature_sets["uns"] = schema_uns
1290
+ return feature_sets
1291
+
1292
+
1293
+ # no longer called from within curator
1294
+ # might deprecate in the future?
1295
+ def _add_set_from_df(
1296
+ self,
1297
+ field: FieldAttr = Feature.name,
1298
+ organism: str | None = None,
1299
+ mute: bool = False,
1300
+ ):
1301
+ """Add feature set corresponding to column names of DataFrame."""
1302
+ assert self._host.otype == "DataFrame" # noqa: S101
1303
+ df = self._host.load(is_run_input=False)
1304
+ schema = Schema.from_df(
1305
+ df=df,
1306
+ field=field,
1307
+ mute=mute,
1308
+ organism=organism,
1309
+ )
1310
+ self._host._staged_feature_sets = {"columns": schema}
1311
+ self._host.save()
1312
+
1313
+
1314
+ def _add_set_from_anndata(
1315
+ self,
1316
+ var_field: FieldAttr | None = None,
1317
+ obs_field: FieldAttr | None = Feature.name,
1318
+ uns_field: FieldAttr | None = None,
1319
+ mute: bool = False,
1320
+ organism: str | Record | None = None,
1321
+ ):
1322
+ """Add features from AnnData."""
1323
+ assert self._host.otype == "AnnData" # noqa: S101
1324
+
1325
+ # parse and register features
1326
+ adata = self._host.load(is_run_input=False)
1327
+ feature_sets = parse_staged_feature_sets_from_anndata(
1328
+ adata,
1329
+ var_field=var_field,
1330
+ obs_field=obs_field,
1331
+ uns_field=uns_field,
1332
+ mute=mute,
1333
+ organism=organism,
1334
+ )
1335
+
1336
+ # link feature sets
1337
+ self._host._staged_feature_sets = feature_sets
1338
+ self._host.save()
1339
+
1340
+
1341
+ def _add_set_from_mudata(
1342
+ self,
1343
+ var_fields: dict[str, FieldAttr] | None = None,
1344
+ obs_fields: dict[str, FieldAttr] | None = None,
1345
+ mute: bool = False,
1346
+ organism: str | Record | None = None,
1347
+ ):
1348
+ """Add features from MuData."""
1349
+ if obs_fields is None:
1350
+ obs_fields = {}
1351
+ assert self._host.otype == "MuData" # noqa: S101
1352
+
1353
+ # parse and register features
1354
+ mdata = self._host.load(is_run_input=False)
1355
+ feature_sets = {}
1356
+
1357
+ obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
1358
+ if len(obs_features) > 0:
1359
+ feature_sets["obs"] = Schema(features=obs_features)
1360
+ for modality, field in var_fields.items():
1361
+ modality_fs = parse_staged_feature_sets_from_anndata(
1362
+ mdata[modality],
1363
+ var_field=field,
1364
+ obs_field=obs_fields.get(modality, Feature.name),
1365
+ mute=mute,
1366
+ organism=organism,
1367
+ )
1368
+ for k, v in modality_fs.items():
1369
+ feature_sets[f"['{modality}'].{k}"] = v
1370
+
1371
+ # link feature sets
1372
+ self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1373
+ self._host.save()
1374
+
1375
+
1376
+ def _add_set_from_spatialdata(
1377
+ self,
1378
+ sample_metadata_key: str,
1379
+ sample_metadata_field: FieldAttr = Feature.name,
1380
+ var_fields: dict[str, FieldAttr] | None = None,
1381
+ obs_fields: dict[str, FieldAttr] | None = None,
1382
+ mute: bool = False,
1383
+ organism: str | Record | None = None,
1384
+ ):
1385
+ """Add features from SpatialData."""
1386
+ obs_fields, var_fields = obs_fields or {}, var_fields or {}
1387
+ assert self._host.otype == "SpatialData" # noqa: S101
1388
+
1389
+ # parse and register features
1390
+ sdata = self._host.load(is_run_input=False)
1391
+ feature_sets = {}
1392
+
1393
+ # sample features
1394
+ sample_features = Feature.from_values(
1395
+ sdata.get_attrs(key=sample_metadata_key, return_as="df", flatten=True).columns,
1396
+ field=sample_metadata_field,
1397
+ ) # type: ignore
1398
+ if len(sample_features) > 0:
1399
+ feature_sets[sample_metadata_key] = Schema(features=sample_features)
1400
+
1401
+ # table features
1402
+ for table, field in var_fields.items():
1403
+ table_fs = parse_staged_feature_sets_from_anndata(
1404
+ sdata[table],
1405
+ var_field=field,
1406
+ obs_field=obs_fields.get(table, Feature.name),
1407
+ mute=mute,
1408
+ organism=organism,
1409
+ )
1410
+ for k, v in table_fs.items():
1411
+ feature_sets[f"['{table}'].{k}"] = v
1412
+
1413
+ # link feature sets
1414
+ self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1415
+ self._host.save()
1416
+
1417
+
1339
1418
  # mypy: ignore-errors
1340
1419
  FeatureManager.__init__ = __init__
1341
1420
  ParamManager.__init__ = __init__
1342
1421
  FeatureManager.__repr__ = __repr__
1343
1422
  ParamManager.__repr__ = __repr__
1423
+ FeatureManager.describe = describe
1424
+ ParamManager.describe = describe
1344
1425
  FeatureManager.__getitem__ = __getitem__
1345
1426
  FeatureManager.get_values = get_values
1346
1427
  FeatureManager.slots = slots
1347
1428
  FeatureManager.add_values = add_values_features
1348
1429
  FeatureManager._add_schema = _add_schema
1349
- FeatureManager.add_schema = add_schema # deprecated
1350
- FeatureManager.add_feature_set = add_feature_set # deprecated
1351
- FeatureManager._schema_by_slot = _schema_by_slot # deprecated
1352
- FeatureManager._feature_set_by_slot = _feature_set_by_slot # deprecated
1353
1430
  FeatureManager._accessor_by_registry = _accessor_by_registry
1354
- FeatureManager._add_set_from_df = _add_set_from_df
1355
- FeatureManager._add_set_from_anndata = _add_set_from_anndata
1356
- FeatureManager._add_set_from_mudata = _add_set_from_mudata
1357
- FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
1358
1431
  FeatureManager._add_from = _add_from
1359
1432
  FeatureManager.filter = filter
1360
1433
  FeatureManager.get = get
@@ -1363,3 +1436,13 @@ FeatureManager.remove_values = remove_values
1363
1436
  ParamManager.add_values = add_values_params
1364
1437
  ParamManager.get_values = get_values
1365
1438
  ParamManager.filter = filter
1439
+
1440
+ # deprecated
1441
+ FeatureManager._add_set_from_df = _add_set_from_df
1442
+ FeatureManager._add_set_from_anndata = _add_set_from_anndata
1443
+ FeatureManager._add_set_from_mudata = _add_set_from_mudata
1444
+ FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
1445
+ FeatureManager.add_schema = add_schema
1446
+ FeatureManager.add_feature_set = add_feature_set
1447
+ FeatureManager._schema_by_slot = _schema_by_slot
1448
+ FeatureManager._feature_set_by_slot = _feature_set_by_slot