lamindb 1.4.0__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. lamindb/__init__.py +52 -36
  2. lamindb/_finish.py +17 -10
  3. lamindb/_tracked.py +1 -1
  4. lamindb/base/__init__.py +3 -1
  5. lamindb/base/fields.py +40 -22
  6. lamindb/base/ids.py +1 -94
  7. lamindb/base/types.py +2 -0
  8. lamindb/base/uids.py +117 -0
  9. lamindb/core/_context.py +203 -102
  10. lamindb/core/_settings.py +38 -25
  11. lamindb/core/datasets/__init__.py +11 -4
  12. lamindb/core/datasets/_core.py +5 -5
  13. lamindb/core/datasets/_small.py +0 -93
  14. lamindb/core/datasets/mini_immuno.py +172 -0
  15. lamindb/core/loaders.py +1 -1
  16. lamindb/core/storage/_backed_access.py +100 -6
  17. lamindb/core/storage/_polars_lazy_df.py +51 -0
  18. lamindb/core/storage/_pyarrow_dataset.py +15 -30
  19. lamindb/core/storage/_tiledbsoma.py +29 -13
  20. lamindb/core/storage/objects.py +6 -0
  21. lamindb/core/subsettings/__init__.py +2 -0
  22. lamindb/core/subsettings/_annotation_settings.py +11 -0
  23. lamindb/curators/__init__.py +7 -3349
  24. lamindb/curators/_legacy.py +2056 -0
  25. lamindb/curators/core.py +1534 -0
  26. lamindb/errors.py +11 -0
  27. lamindb/examples/__init__.py +27 -0
  28. lamindb/examples/schemas/__init__.py +12 -0
  29. lamindb/examples/schemas/_anndata.py +25 -0
  30. lamindb/examples/schemas/_simple.py +19 -0
  31. lamindb/integrations/_vitessce.py +8 -5
  32. lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
  33. lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
  34. lamindb/migrations/0093_alter_schemacomponent_unique_together.py +16 -0
  35. lamindb/models/__init__.py +4 -1
  36. lamindb/models/_describe.py +21 -4
  37. lamindb/models/_feature_manager.py +382 -287
  38. lamindb/models/_label_manager.py +8 -2
  39. lamindb/models/artifact.py +177 -106
  40. lamindb/models/artifact_set.py +122 -0
  41. lamindb/models/collection.py +73 -52
  42. lamindb/models/core.py +1 -1
  43. lamindb/models/feature.py +51 -17
  44. lamindb/models/has_parents.py +69 -14
  45. lamindb/models/project.py +1 -1
  46. lamindb/models/query_manager.py +221 -22
  47. lamindb/models/query_set.py +247 -172
  48. lamindb/models/record.py +65 -247
  49. lamindb/models/run.py +4 -4
  50. lamindb/models/save.py +8 -2
  51. lamindb/models/schema.py +456 -184
  52. lamindb/models/transform.py +2 -2
  53. lamindb/models/ulabel.py +8 -5
  54. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/METADATA +6 -6
  55. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/RECORD +57 -43
  56. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/LICENSE +0 -0
  57. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/WHEEL +0 -0
@@ -13,7 +13,7 @@ import pandas as pd
13
13
  from anndata import AnnData
14
14
  from django.contrib.postgres.aggregates import ArrayAgg
15
15
  from django.db import connections
16
- from django.db.models import Aggregate
16
+ from django.db.models import Aggregate, ProtectedError, Subquery
17
17
  from lamin_utils import logger
18
18
  from lamindb_setup.core.hashing import hash_set
19
19
  from lamindb_setup.core.upath import create_path
@@ -42,14 +42,14 @@ from ._describe import (
42
42
  TYPE_WIDTH,
43
43
  VALUES_WIDTH,
44
44
  describe_header,
45
- print_rich_tree,
45
+ format_rich_tree,
46
46
  )
47
47
  from ._django import get_artifact_with_related
48
48
  from ._label_manager import _get_labels, describe_labels
49
49
  from ._relations import (
50
50
  dict_related_model_to_related_name,
51
51
  )
52
- from .feature import Feature, FeatureValue
52
+ from .feature import Feature, FeatureValue, parse_dtype
53
53
  from .record import Record
54
54
  from .run import Param, ParamManager, ParamManagerRun, ParamValue, Run
55
55
  from .ulabel import ULabel
@@ -320,15 +320,27 @@ def describe_features(
320
320
  schema_data[slot] = (schema, feature_names)
321
321
  for feature_name in feature_names:
322
322
  feature_data[feature_name] = (slot, registry_str)
323
+ schema_data.update(
324
+ {
325
+ slot: (schema, schema.n)
326
+ for slot, schema in get_schema_by_slot_(self).items()
327
+ if slot not in schema_data
328
+ }
329
+ )
323
330
  else:
324
331
  for slot, schema in get_schema_by_slot_(self).items():
325
332
  features = schema.members
326
- # features.first() is a lot slower than features[0] here
327
- name_field = get_name_field(features[0])
328
- feature_names = list(features.values_list(name_field, flat=True)[:20])
329
- schema_data[slot] = (schema, feature_names)
330
- for feature_name in feature_names:
331
- feature_data[feature_name] = (slot, schema.itype)
333
+ if features.exists():
334
+ # features.first() is a lot slower than features[0] here
335
+ name_field = get_name_field(features[0])
336
+ feature_names = list(
337
+ features.values_list(name_field, flat=True)[:20]
338
+ )
339
+ schema_data[slot] = (schema, feature_names)
340
+ for feature_name in feature_names:
341
+ feature_data[feature_name] = (slot, schema.itype)
342
+ else:
343
+ schema_data[slot] = (schema, schema.n)
332
344
 
333
345
  internal_feature_names: dict[str, str] = {}
334
346
  if isinstance(self, Artifact):
@@ -400,38 +412,44 @@ def describe_features(
400
412
  internal_feature_labels_slot.setdefault(slot, []).append(feature_row)
401
413
 
402
414
  int_features_tree_children = []
403
- for slot, (schema, feature_names) in schema_data.items():
404
- if slot in internal_feature_labels_slot:
405
- # add internal Feature features with labels
406
- feature_rows = internal_feature_labels_slot[slot]
407
- # add internal Feature features without labels
408
- feature_rows += [
409
- (
410
- feature_name,
411
- Text(str(internal_feature_names.get(feature_name)), style="dim"),
412
- "",
413
- )
414
- for feature_name in feature_names
415
- if feature_name and feature_name not in internal_feature_labels
416
- ]
415
+ for slot, (schema, feature_names_or_n) in schema_data.items():
416
+ if isinstance(feature_names_or_n, int):
417
+ feature_rows = []
417
418
  else:
418
- # add internal non-Feature features without labels
419
- feature_rows = [
420
- (
421
- feature_name,
422
- Text(
423
- str(
424
- internal_feature_names.get(feature_name)
425
- if feature_name in internal_feature_names
426
- else schema.dtype
419
+ feature_names = feature_names_or_n
420
+ if slot in internal_feature_labels_slot:
421
+ # add internal Feature features with labels
422
+ feature_rows = internal_feature_labels_slot[slot]
423
+ # add internal Feature features without labels
424
+ feature_rows += [
425
+ (
426
+ feature_name,
427
+ Text(
428
+ str(internal_feature_names.get(feature_name)), style="dim"
427
429
  ),
428
- style="dim",
429
- ),
430
- "",
431
- )
432
- for feature_name in feature_names
433
- if feature_name
434
- ]
430
+ "",
431
+ )
432
+ for feature_name in feature_names
433
+ if feature_name and feature_name not in internal_feature_labels
434
+ ]
435
+ else:
436
+ # add internal non-Feature features without labels
437
+ feature_rows = [
438
+ (
439
+ feature_name,
440
+ Text(
441
+ str(
442
+ internal_feature_names.get(feature_name)
443
+ if feature_name in internal_feature_names
444
+ else schema.dtype
445
+ ),
446
+ style="dim",
447
+ ),
448
+ "",
449
+ )
450
+ for feature_name in feature_names
451
+ if feature_name
452
+ ]
435
453
  int_features_tree_children.append(
436
454
  _create_feature_table(
437
455
  Text.assemble(
@@ -482,59 +500,6 @@ def describe_features(
482
500
  return tree
483
501
 
484
502
 
485
- def parse_staged_feature_sets_from_anndata(
486
- adata: AnnData,
487
- var_field: FieldAttr | None = None,
488
- obs_field: FieldAttr = Feature.name,
489
- uns_field: FieldAttr | None = None,
490
- mute: bool = False,
491
- organism: str | Record | None = None,
492
- ) -> dict:
493
- data_parse = adata
494
- if not isinstance(adata, AnnData): # is a path
495
- filepath = create_path(adata) # returns Path for local
496
- if not isinstance(filepath, LocalPathClasses):
497
- from lamindb import settings
498
- from lamindb.core.storage._backed_access import backed_access
499
-
500
- using_key = settings._using_key
501
- data_parse = backed_access(filepath, using_key=using_key)
502
- else:
503
- data_parse = ad.read_h5ad(filepath, backed="r")
504
- type = "float"
505
- else:
506
- type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
507
- feature_sets = {}
508
- if var_field is not None:
509
- schema_var = Schema.from_values(
510
- data_parse.var.index,
511
- var_field,
512
- type=type,
513
- mute=mute,
514
- organism=organism,
515
- raise_validation_error=False,
516
- )
517
- if schema_var is not None:
518
- feature_sets["var"] = schema_var
519
- if obs_field is not None and len(data_parse.obs.columns) > 0:
520
- schema_obs = Schema.from_df(
521
- df=data_parse.obs,
522
- field=obs_field,
523
- mute=mute,
524
- organism=organism,
525
- )
526
- if schema_obs is not None:
527
- feature_sets["obs"] = schema_obs
528
- if uns_field is not None and len(data_parse.uns) > 0:
529
- validated_features = Feature.from_values( # type: ignore
530
- data_parse.uns.keys(), field=uns_field, organism=organism
531
- )
532
- if len(validated_features) > 0:
533
- schema_uns = Schema(validated_features, dtype=None, otype="dict")
534
- feature_sets["uns"] = schema_uns
535
- return feature_sets
536
-
537
-
538
503
  def is_valid_datetime_str(date_string: str) -> bool | str:
539
504
  try:
540
505
  dt = datetime.fromisoformat(date_string)
@@ -625,8 +590,12 @@ def __init__(self, host: Artifact | Collection | Run):
625
590
 
626
591
 
627
592
  def __repr__(self) -> str:
593
+ return describe(self, return_str=True) # type: ignore
594
+
595
+
596
+ def describe(self, return_str: bool = False) -> str | None:
628
597
  tree = describe_features(self._host, print_params=(self.__class__ == ParamManager)) # type: ignore
629
- return print_rich_tree(tree, fallback="no linked features")
598
+ return format_rich_tree(tree, fallback="no linked features", return_str=return_str)
630
599
 
631
600
 
632
601
  def get_values(self) -> dict[str, Any]:
@@ -676,50 +645,86 @@ def filter_base(cls, _skip_validation: bool = True, **expression) -> QuerySet:
676
645
  comparator = f"__{split_key[1]}"
677
646
  feature = features.get(name=normalized_key)
678
647
  if not feature.dtype.startswith("cat"):
648
+ if comparator == "__isnull":
649
+ if cls == FeatureManager:
650
+ from .artifact import ArtifactFeatureValue
651
+
652
+ if value: # True
653
+ return Artifact.objects.exclude(
654
+ id__in=Subquery(
655
+ ArtifactFeatureValue.objects.filter(
656
+ featurevalue__feature=feature
657
+ ).values("artifact_id")
658
+ )
659
+ )
660
+ else:
661
+ return Artifact.objects.exclude(
662
+ id__in=Subquery(
663
+ ArtifactFeatureValue.objects.filter(
664
+ featurevalue__feature=feature
665
+ ).values("artifact_id")
666
+ )
667
+ )
668
+ if comparator in {"__startswith", "__contains"}:
669
+ logger.important(
670
+ f"currently not supporting `{comparator}`, using `__icontains` instead"
671
+ )
672
+ comparator = "__icontains"
679
673
  expression = {feature_param: feature, f"value{comparator}": value}
680
- feature_value = value_model.filter(**expression)
681
- new_expression[f"_{feature_param}_values__in"] = feature_value
682
- elif isinstance(value, (str, Record)):
683
- # because SQL is sensitive to whether querying with __in or not
684
- # and might return multiple equivalent records for the latter
685
- # we distinguish cases in which we have multiple label matches vs. one
686
- label = None
687
- labels = None
688
- if isinstance(value, str):
689
- # we need the comparator here because users might query like so
690
- # ln.Artifact.features.filter(experiment__contains="Experi")
691
- expression = {f"name{comparator}": value}
692
- labels = ULabel.filter(**expression).all()
693
- if len(labels) == 0:
694
- raise DoesNotExist(
695
- f"Did not find a ULabel matching `name{comparator}={value}`"
696
- )
697
- elif len(labels) == 1:
698
- label = labels[0]
699
- elif isinstance(value, Record):
700
- label = value
701
- label_registry = (
702
- label.__class__ if label is not None else labels[0].__class__
703
- )
704
- accessor_name = (
705
- label_registry.artifacts.through.artifact.field._related_name
706
- )
707
- new_expression[f"{accessor_name}__feature"] = feature
708
- if label is not None:
709
- # simplified query if we have exactly one label
710
- new_expression[
711
- f"{accessor_name}__{label_registry.__name__.lower()}"
712
- ] = label
674
+ feature_values = value_model.filter(**expression)
675
+ new_expression[f"_{feature_param}_values__id__in"] = feature_values
676
+ elif isinstance(value, (str, Record, bool)):
677
+ if comparator == "__isnull":
678
+ if cls == FeatureManager:
679
+ result = parse_dtype(feature.dtype)[0]
680
+ kwargs = {
681
+ f"links_{result['registry'].__name__.lower()}__feature": feature
682
+ }
683
+ if value: # True
684
+ return Artifact.objects.exclude(**kwargs)
685
+ else:
686
+ return Artifact.objects.filter(**kwargs)
713
687
  else:
714
- new_expression[
715
- f"{accessor_name}__{label_registry.__name__.lower()}__in"
716
- ] = labels
717
- else:
688
+ # because SQL is sensitive to whether querying with __in or not
689
+ # and might return multiple equivalent records for the latter
690
+ # we distinguish cases in which we have multiple label matches vs. one
691
+ label = None
692
+ labels = None
693
+ if isinstance(value, str):
694
+ # we need the comparator here because users might query like so
695
+ # ln.Artifact.filter(experiment__contains="Experi")
696
+ expression = {f"name{comparator}": value}
697
+ labels = ULabel.filter(**expression).all()
698
+ if len(labels) == 0:
699
+ raise DoesNotExist(
700
+ f"Did not find a ULabel matching `name{comparator}={value}`"
701
+ )
702
+ elif len(labels) == 1:
703
+ label = labels[0]
704
+ elif isinstance(value, Record):
705
+ label = value
706
+ label_registry = (
707
+ label.__class__ if label is not None else labels[0].__class__
708
+ )
709
+ accessor_name = (
710
+ label_registry.artifacts.through.artifact.field._related_name
711
+ )
712
+ new_expression[f"{accessor_name}__feature"] = feature
713
+ if label is not None:
714
+ # simplified query if we have exactly one label
715
+ new_expression[
716
+ f"{accessor_name}__{label_registry.__name__.lower()}"
717
+ ] = label
718
+ else:
719
+ new_expression[
720
+ f"{accessor_name}__{label_registry.__name__.lower()}__in"
721
+ ] = labels
718
722
  # if passing a list of records, we want to
719
723
  # find artifacts that are annotated by all of them at the same
720
724
  # time; hence, we don't want the __in construct that we use to match strings
721
725
  # https://laminlabs.slack.com/archives/C04FPE8V01W/p1688328084810609
722
- raise NotImplementedError
726
+ if not (new_expression):
727
+ raise NotImplementedError
723
728
  if cls == FeatureManager or cls == ParamManagerArtifact:
724
729
  return Artifact.objects.filter(**new_expression)
725
730
  elif cls == ParamManagerRun:
@@ -821,15 +826,14 @@ def _add_values(
821
826
  from .artifact import Artifact
822
827
 
823
828
  # rename to distinguish from the values inside the dict
824
- features_values = values
825
- keys = features_values.keys()
829
+ dictionary = values
830
+ keys = dictionary.keys()
826
831
  if isinstance(keys, DICT_KEYS_TYPE):
827
832
  keys = list(keys) # type: ignore
828
833
  # deal with other cases later
829
834
  assert all(isinstance(key, str) for key in keys) # noqa: S101
830
835
  registry = feature_param_field.field.model
831
836
  is_param = registry == Param
832
- model = Param if is_param else Feature
833
837
  value_model = ParamValue if is_param else FeatureValue
834
838
  model_name = "Param" if is_param else "Feature"
835
839
  if is_param:
@@ -842,13 +846,11 @@ def _add_values(
842
846
  raise ValidationError(
843
847
  "Can only set features for dataset-like artifacts."
844
848
  )
845
- validated = registry.validate(keys, field=feature_param_field, mute=True)
846
- keys_array = np.array(keys)
847
- keys_array[validated]
848
- if validated.sum() != len(keys):
849
- not_validated_keys = keys_array[~validated]
849
+ records = registry.from_values(keys, field=feature_param_field, mute=True)
850
+ if len(records) != len(keys):
851
+ not_validated_keys = [key for key in keys if key not in records.list("name")]
850
852
  not_validated_keys_dtype_message = [
851
- (key, infer_feature_type_convert_json(key, features_values[key]))
853
+ (key, infer_feature_type_convert_json(key, dictionary[key]))
852
854
  for key in not_validated_keys
853
855
  ]
854
856
  run = get_current_tracked_run()
@@ -866,7 +868,7 @@ def _add_values(
866
868
  ]
867
869
  hint = "\n".join(elements)
868
870
  msg = (
869
- f"These keys could not be validated: {not_validated_keys.tolist()}\n"
871
+ f"These keys could not be validated: {not_validated_keys}\n"
870
872
  f"Here is how to create a {model_name.lower()}:\n\n{hint}"
871
873
  )
872
874
  raise ValidationError(msg)
@@ -875,10 +877,10 @@ def _add_values(
875
877
  features_labels = defaultdict(list)
876
878
  _feature_values = []
877
879
  not_validated_values = []
878
- for key, value in features_values.items():
879
- feature = model.get(name=key)
880
+ for feature in records:
881
+ value = dictionary[feature.name]
880
882
  inferred_type, converted_value, _ = infer_feature_type_convert_json(
881
- key,
883
+ feature.name,
882
884
  value,
883
885
  mute=True,
884
886
  str_as_ulabel=str_as_ulabel,
@@ -886,25 +888,23 @@ def _add_values(
886
888
  if feature.dtype == "num":
887
889
  if inferred_type not in {"int", "float"}:
888
890
  raise TypeError(
889
- f"Value for feature '{key}' with type {feature.dtype} must be a number"
891
+ f"Value for feature '{feature.name}' with type {feature.dtype} must be a number"
890
892
  )
891
893
  elif feature.dtype.startswith("cat"):
892
894
  if inferred_type != "?":
893
895
  if not (inferred_type.startswith("cat") or isinstance(value, Record)):
894
896
  raise TypeError(
895
- f"Value for feature '{key}' with type '{feature.dtype}' must be a string or record."
897
+ f"Value for feature '{feature.name}' with type '{feature.dtype}' must be a string or record."
896
898
  )
897
899
  elif (feature.dtype == "str" and feature.dtype not in inferred_type) or (
898
900
  feature.dtype != "str" and feature.dtype != inferred_type
899
901
  ):
900
902
  raise ValidationError(
901
- f"Expected dtype for '{key}' is '{feature.dtype}', got '{inferred_type}'"
903
+ f"Expected dtype for '{feature.name}' is '{feature.dtype}', got '{inferred_type}'"
902
904
  )
903
905
  if not feature.dtype.startswith("cat"):
904
906
  filter_kwargs = {model_name.lower(): feature, "value": converted_value}
905
- feature_value = value_model.filter(**filter_kwargs).one_or_none()
906
- if feature_value is None:
907
- feature_value = value_model(**filter_kwargs)
907
+ feature_value, _ = value_model.get_or_create(**filter_kwargs)
908
908
  _feature_values.append(feature_value)
909
909
  else:
910
910
  if isinstance(value, Record) or (
@@ -942,30 +942,63 @@ def _add_values(
942
942
  (feature, label_record) for label_record in label_records
943
943
  ]
944
944
  if not_validated_values:
945
- hint = (
946
- f" ulabels = ln.ULabel.from_values({not_validated_values}, create=True)\n"
947
- f" ln.save(ulabels)"
948
- )
945
+ not_validated_values.sort()
946
+ hint = f" ulabels = ln.ULabel.from_values({not_validated_values}, create=True).save()\n"
949
947
  msg = (
950
948
  f"These values could not be validated: {not_validated_values}\n"
951
949
  f"Here is how to create ulabels for them:\n\n{hint}"
952
950
  )
953
951
  raise ValidationError(msg)
952
+ # TODO: create an explicit version of this
953
+ # if not is_param:
954
+ # # check if _expect_many is false for _all_ records
955
+ # if any(record._expect_many for record in records):
956
+ # updated_features = []
957
+ # for record in records:
958
+ # if record._expect_many:
959
+ # record._expect_many = False
960
+ # record.save()
961
+ # updated_features.append(record.name)
962
+ # if any(updated_features):
963
+ # logger.important(
964
+ # f"changed observational unit to Artifact for features: {', '.join(updated_features)}"
965
+ # )
954
966
  # bulk add all links
955
967
  if features_labels:
956
968
  add_label_feature_links(self, features_labels)
957
969
  if _feature_values:
958
- save(_feature_values)
970
+ to_insert_feature_values = [
971
+ record for record in _feature_values if record._state.adding
972
+ ]
973
+ if to_insert_feature_values:
974
+ save(to_insert_feature_values)
975
+ dict_typed_features = [
976
+ getattr(record, model_name.lower())
977
+ for record in _feature_values
978
+ if getattr(record, model_name.lower()).dtype == "dict"
979
+ ]
959
980
  if is_param:
960
981
  LinkORM = self._host._param_values.through
961
982
  valuefield_id = "paramvalue_id"
962
983
  else:
963
984
  LinkORM = self._host._feature_values.through
964
985
  valuefield_id = "featurevalue_id"
986
+ host_class_lower = self._host.__class__.__get_name_with_module__().lower()
987
+ if dict_typed_features:
988
+ # delete all previously existing annotations with dictionaries
989
+ kwargs = {
990
+ f"links_{host_class_lower}__{host_class_lower}_id": self._host.id,
991
+ f"{model_name.lower()}__in": dict_typed_features,
992
+ }
993
+ try:
994
+ value_model.filter(**kwargs).all().delete()
995
+ except ProtectedError:
996
+ pass
997
+ # add new feature links
965
998
  links = [
966
999
  LinkORM(
967
1000
  **{
968
- f"{self._host.__class__.__get_name_with_module__().lower()}_id": self._host.id,
1001
+ f"{host_class_lower}_id": self._host.id,
969
1002
  valuefield_id: feature_value.id,
970
1003
  }
971
1004
  )
@@ -1092,52 +1125,6 @@ def _add_schema(self, schema: Schema, slot: str) -> None:
1092
1125
  self._slots[slot] = schema # type: ignore
1093
1126
 
1094
1127
 
1095
- def _add_set_from_df(
1096
- self,
1097
- field: FieldAttr = Feature.name,
1098
- organism: str | None = None,
1099
- mute: bool = False,
1100
- ):
1101
- """Add feature set corresponding to column names of DataFrame."""
1102
- assert self._host.otype == "DataFrame" # noqa: S101
1103
- df = self._host.load(is_run_input=False)
1104
- schema = Schema.from_df(
1105
- df=df,
1106
- field=field,
1107
- mute=mute,
1108
- organism=organism,
1109
- )
1110
- self._host._staged_feature_sets = {"columns": schema}
1111
- self._host.save()
1112
-
1113
-
1114
- def _add_set_from_anndata(
1115
- self,
1116
- var_field: FieldAttr | None = None,
1117
- obs_field: FieldAttr | None = Feature.name,
1118
- uns_field: FieldAttr | None = None,
1119
- mute: bool = False,
1120
- organism: str | Record | None = None,
1121
- ):
1122
- """Add features from AnnData."""
1123
- assert self._host.otype == "AnnData" # noqa: S101
1124
-
1125
- # parse and register features
1126
- adata = self._host.load(is_run_input=False)
1127
- feature_sets = parse_staged_feature_sets_from_anndata(
1128
- adata,
1129
- var_field=var_field,
1130
- obs_field=obs_field,
1131
- uns_field=uns_field,
1132
- mute=mute,
1133
- organism=organism,
1134
- )
1135
-
1136
- # link feature sets
1137
- self._host._staged_feature_sets = feature_sets
1138
- self._host.save()
1139
-
1140
-
1141
1128
  def _unify_staged_feature_sets_by_hash(
1142
1129
  feature_sets: MutableMapping[str, Schema],
1143
1130
  ):
@@ -1153,83 +1140,6 @@ def _unify_staged_feature_sets_by_hash(
1153
1140
  return feature_sets
1154
1141
 
1155
1142
 
1156
- def _add_set_from_mudata(
1157
- self,
1158
- var_fields: dict[str, FieldAttr] | None = None,
1159
- obs_fields: dict[str, FieldAttr] | None = None,
1160
- mute: bool = False,
1161
- organism: str | Record | None = None,
1162
- ):
1163
- """Add features from MuData."""
1164
- if obs_fields is None:
1165
- obs_fields = {}
1166
- assert self._host.otype == "MuData" # noqa: S101
1167
-
1168
- # parse and register features
1169
- mdata = self._host.load(is_run_input=False)
1170
- feature_sets = {}
1171
-
1172
- obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
1173
- if len(obs_features) > 0:
1174
- feature_sets["obs"] = Schema(features=obs_features)
1175
- for modality, field in var_fields.items():
1176
- modality_fs = parse_staged_feature_sets_from_anndata(
1177
- mdata[modality],
1178
- var_field=field,
1179
- obs_field=obs_fields.get(modality, Feature.name),
1180
- mute=mute,
1181
- organism=organism,
1182
- )
1183
- for k, v in modality_fs.items():
1184
- feature_sets[f"['{modality}'].{k}"] = v
1185
-
1186
- # link feature sets
1187
- self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1188
- self._host.save()
1189
-
1190
-
1191
- def _add_set_from_spatialdata(
1192
- self,
1193
- sample_metadata_key: str,
1194
- sample_metadata_field: FieldAttr = Feature.name,
1195
- var_fields: dict[str, FieldAttr] | None = None,
1196
- obs_fields: dict[str, FieldAttr] | None = None,
1197
- mute: bool = False,
1198
- organism: str | Record | None = None,
1199
- ):
1200
- """Add features from SpatialData."""
1201
- obs_fields, var_fields = obs_fields or {}, var_fields or {}
1202
- assert self._host.otype == "SpatialData" # noqa: S101
1203
-
1204
- # parse and register features
1205
- sdata = self._host.load(is_run_input=False)
1206
- feature_sets = {}
1207
-
1208
- # sample features
1209
- sample_features = Feature.from_values(
1210
- sdata.get_attrs(key=sample_metadata_key, return_as="df", flatten=True).columns,
1211
- field=sample_metadata_field,
1212
- ) # type: ignore
1213
- if len(sample_features) > 0:
1214
- feature_sets[sample_metadata_key] = Schema(features=sample_features)
1215
-
1216
- # table features
1217
- for table, field in var_fields.items():
1218
- table_fs = parse_staged_feature_sets_from_anndata(
1219
- sdata[table],
1220
- var_field=field,
1221
- obs_field=obs_fields.get(table, Feature.name),
1222
- mute=mute,
1223
- organism=organism,
1224
- )
1225
- for k, v in table_fs.items():
1226
- feature_sets[f"['{table}'].{k}"] = v
1227
-
1228
- # link feature sets
1229
- self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1230
- self._host.save()
1231
-
1232
-
1233
1143
  def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
1234
1144
  """Transfer features from a artifact or collection."""
1235
1145
  # This only covers feature sets
@@ -1340,25 +1250,200 @@ def _feature_set_by_slot(self):
1340
1250
  return self.slots
1341
1251
 
1342
1252
 
1253
+ # deprecated: feature set parsing
1254
+
1255
+
1256
+ def parse_staged_feature_sets_from_anndata(
1257
+ adata: AnnData,
1258
+ var_field: FieldAttr | None = None,
1259
+ obs_field: FieldAttr = Feature.name,
1260
+ uns_field: FieldAttr | None = None,
1261
+ mute: bool = False,
1262
+ organism: str | Record | None = None,
1263
+ ) -> dict:
1264
+ data_parse = adata
1265
+ if not isinstance(adata, AnnData): # is a path
1266
+ filepath = create_path(adata) # returns Path for local
1267
+ if not isinstance(filepath, LocalPathClasses):
1268
+ from lamindb import settings
1269
+ from lamindb.core.storage._backed_access import backed_access
1270
+
1271
+ using_key = settings._using_key
1272
+ data_parse = backed_access(filepath, using_key=using_key)
1273
+ else:
1274
+ data_parse = ad.read_h5ad(filepath, backed="r")
1275
+ type = "float"
1276
+ else:
1277
+ type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
1278
+ feature_sets = {}
1279
+ if var_field is not None:
1280
+ schema_var = Schema.from_values(
1281
+ data_parse.var.index,
1282
+ var_field,
1283
+ type=type,
1284
+ mute=mute,
1285
+ organism=organism,
1286
+ raise_validation_error=False,
1287
+ )
1288
+ if schema_var is not None:
1289
+ feature_sets["var"] = schema_var
1290
+ if obs_field is not None and len(data_parse.obs.columns) > 0:
1291
+ schema_obs = Schema.from_df(
1292
+ df=data_parse.obs,
1293
+ field=obs_field,
1294
+ mute=mute,
1295
+ organism=organism,
1296
+ )
1297
+ if schema_obs is not None:
1298
+ feature_sets["obs"] = schema_obs
1299
+ if uns_field is not None and len(data_parse.uns) > 0:
1300
+ validated_features = Feature.from_values( # type: ignore
1301
+ data_parse.uns.keys(), field=uns_field, organism=organism
1302
+ )
1303
+ if len(validated_features) > 0:
1304
+ schema_uns = Schema(validated_features, dtype=None, otype="dict")
1305
+ feature_sets["uns"] = schema_uns
1306
+ return feature_sets
1307
+
1308
+
1309
+ # no longer called from within curator
1310
+ # might deprecate in the future?
1311
+ def _add_set_from_df(
1312
+ self,
1313
+ field: FieldAttr = Feature.name,
1314
+ organism: str | None = None,
1315
+ mute: bool = False,
1316
+ ):
1317
+ """Add feature set corresponding to column names of DataFrame."""
1318
+ assert self._host.otype == "DataFrame" # noqa: S101
1319
+ df = self._host.load(is_run_input=False)
1320
+ schema = Schema.from_df(
1321
+ df=df,
1322
+ field=field,
1323
+ mute=mute,
1324
+ organism=organism,
1325
+ )
1326
+ self._host._staged_feature_sets = {"columns": schema}
1327
+ self._host.save()
1328
+
1329
+
1330
+ def _add_set_from_anndata(
1331
+ self,
1332
+ var_field: FieldAttr | None = None,
1333
+ obs_field: FieldAttr | None = Feature.name,
1334
+ uns_field: FieldAttr | None = None,
1335
+ mute: bool = False,
1336
+ organism: str | Record | None = None,
1337
+ ):
1338
+ """Add features from AnnData."""
1339
+ assert self._host.otype == "AnnData" # noqa: S101
1340
+
1341
+ # parse and register features
1342
+ adata = self._host.load(is_run_input=False)
1343
+ feature_sets = parse_staged_feature_sets_from_anndata(
1344
+ adata,
1345
+ var_field=var_field,
1346
+ obs_field=obs_field,
1347
+ uns_field=uns_field,
1348
+ mute=mute,
1349
+ organism=organism,
1350
+ )
1351
+
1352
+ # link feature sets
1353
+ self._host._staged_feature_sets = feature_sets
1354
+ self._host.save()
1355
+
1356
+
1357
+ def _add_set_from_mudata(
1358
+ self,
1359
+ var_fields: dict[str, FieldAttr] | None = None,
1360
+ obs_fields: dict[str, FieldAttr] | None = None,
1361
+ mute: bool = False,
1362
+ organism: str | Record | None = None,
1363
+ ):
1364
+ """Add features from MuData."""
1365
+ if obs_fields is None:
1366
+ obs_fields = {}
1367
+ assert self._host.otype == "MuData" # noqa: S101
1368
+
1369
+ # parse and register features
1370
+ mdata = self._host.load(is_run_input=False)
1371
+ feature_sets = {}
1372
+
1373
+ obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
1374
+ if len(obs_features) > 0:
1375
+ feature_sets["obs"] = Schema(features=obs_features)
1376
+ for modality, field in var_fields.items():
1377
+ modality_fs = parse_staged_feature_sets_from_anndata(
1378
+ mdata[modality],
1379
+ var_field=field,
1380
+ obs_field=obs_fields.get(modality, Feature.name),
1381
+ mute=mute,
1382
+ organism=organism,
1383
+ )
1384
+ for k, v in modality_fs.items():
1385
+ feature_sets[f"['{modality}'].{k}"] = v
1386
+
1387
+ # link feature sets
1388
+ self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1389
+ self._host.save()
1390
+
1391
+
1392
+ def _add_set_from_spatialdata(
1393
+ self,
1394
+ sample_metadata_key: str,
1395
+ sample_metadata_field: FieldAttr = Feature.name,
1396
+ var_fields: dict[str, FieldAttr] | None = None,
1397
+ obs_fields: dict[str, FieldAttr] | None = None,
1398
+ mute: bool = False,
1399
+ organism: str | Record | None = None,
1400
+ ):
1401
+ """Add features from SpatialData."""
1402
+ obs_fields, var_fields = obs_fields or {}, var_fields or {}
1403
+ assert self._host.otype == "SpatialData" # noqa: S101
1404
+
1405
+ # parse and register features
1406
+ sdata = self._host.load(is_run_input=False)
1407
+ feature_sets = {}
1408
+
1409
+ # sample features
1410
+ sample_features = Feature.from_values(
1411
+ sdata.get_attrs(key=sample_metadata_key, return_as="df", flatten=True).columns,
1412
+ field=sample_metadata_field,
1413
+ ) # type: ignore
1414
+ if len(sample_features) > 0:
1415
+ feature_sets[sample_metadata_key] = Schema(features=sample_features)
1416
+
1417
+ # table features
1418
+ for table, field in var_fields.items():
1419
+ table_fs = parse_staged_feature_sets_from_anndata(
1420
+ sdata[table],
1421
+ var_field=field,
1422
+ obs_field=obs_fields.get(table, Feature.name),
1423
+ mute=mute,
1424
+ organism=organism,
1425
+ )
1426
+ for k, v in table_fs.items():
1427
+ feature_sets[f"['{table}'].{k}"] = v
1428
+
1429
+ # link feature sets
1430
+ self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1431
+ self._host.save()
1432
+
1433
+
1343
1434
  # mypy: ignore-errors
1344
1435
  FeatureManager.__init__ = __init__
1345
1436
  ParamManager.__init__ = __init__
1346
1437
  FeatureManager.__repr__ = __repr__
1347
1438
  ParamManager.__repr__ = __repr__
1439
+ FeatureManager.describe = describe
1440
+ ParamManager.describe = describe
1348
1441
  FeatureManager.__getitem__ = __getitem__
1349
1442
  FeatureManager.get_values = get_values
1350
1443
  FeatureManager.slots = slots
1351
1444
  FeatureManager.add_values = add_values_features
1352
1445
  FeatureManager._add_schema = _add_schema
1353
- FeatureManager.add_schema = add_schema # deprecated
1354
- FeatureManager.add_feature_set = add_feature_set # deprecated
1355
- FeatureManager._schema_by_slot = _schema_by_slot # deprecated
1356
- FeatureManager._feature_set_by_slot = _feature_set_by_slot # deprecated
1357
1446
  FeatureManager._accessor_by_registry = _accessor_by_registry
1358
- FeatureManager._add_set_from_df = _add_set_from_df
1359
- FeatureManager._add_set_from_anndata = _add_set_from_anndata
1360
- FeatureManager._add_set_from_mudata = _add_set_from_mudata
1361
- FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
1362
1447
  FeatureManager._add_from = _add_from
1363
1448
  FeatureManager.filter = filter
1364
1449
  FeatureManager.get = get
@@ -1367,3 +1452,13 @@ FeatureManager.remove_values = remove_values
1367
1452
  ParamManager.add_values = add_values_params
1368
1453
  ParamManager.get_values = get_values
1369
1454
  ParamManager.filter = filter
1455
+
1456
+ # deprecated
1457
+ FeatureManager._add_set_from_df = _add_set_from_df
1458
+ FeatureManager._add_set_from_anndata = _add_set_from_anndata
1459
+ FeatureManager._add_set_from_mudata = _add_set_from_mudata
1460
+ FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
1461
+ FeatureManager.add_schema = add_schema
1462
+ FeatureManager.add_feature_set = add_feature_set
1463
+ FeatureManager._schema_by_slot = _schema_by_slot
1464
+ FeatureManager._feature_set_by_slot = _feature_set_by_slot