lamindb 1.10.2__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. lamindb/__init__.py +89 -49
  2. lamindb/_finish.py +17 -15
  3. lamindb/_tracked.py +2 -4
  4. lamindb/_view.py +1 -1
  5. lamindb/base/__init__.py +2 -1
  6. lamindb/base/dtypes.py +76 -0
  7. lamindb/core/_settings.py +2 -2
  8. lamindb/core/storage/_anndata_accessor.py +29 -9
  9. lamindb/curators/_legacy.py +16 -3
  10. lamindb/curators/core.py +442 -188
  11. lamindb/errors.py +6 -0
  12. lamindb/examples/cellxgene/__init__.py +8 -3
  13. lamindb/examples/cellxgene/_cellxgene.py +127 -13
  14. lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
  15. lamindb/examples/croissant/__init__.py +32 -6
  16. lamindb/examples/datasets/__init__.py +2 -2
  17. lamindb/examples/datasets/_core.py +9 -2
  18. lamindb/examples/datasets/_small.py +66 -22
  19. lamindb/examples/fixtures/sheets.py +8 -2
  20. lamindb/integrations/_croissant.py +34 -11
  21. lamindb/migrations/0119_squashed.py +5 -2
  22. lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
  23. lamindb/migrations/0121_recorduser.py +60 -0
  24. lamindb/models/__init__.py +4 -1
  25. lamindb/models/_describe.py +2 -2
  26. lamindb/models/_feature_manager.py +131 -71
  27. lamindb/models/_from_values.py +2 -2
  28. lamindb/models/_is_versioned.py +4 -4
  29. lamindb/models/_label_manager.py +4 -4
  30. lamindb/models/artifact.py +326 -172
  31. lamindb/models/artifact_set.py +45 -1
  32. lamindb/models/can_curate.py +1 -2
  33. lamindb/models/collection.py +3 -34
  34. lamindb/models/feature.py +111 -7
  35. lamindb/models/has_parents.py +11 -11
  36. lamindb/models/project.py +18 -0
  37. lamindb/models/query_manager.py +16 -7
  38. lamindb/models/query_set.py +191 -78
  39. lamindb/models/record.py +30 -5
  40. lamindb/models/run.py +10 -33
  41. lamindb/models/save.py +6 -8
  42. lamindb/models/schema.py +54 -26
  43. lamindb/models/sqlrecord.py +152 -40
  44. lamindb/models/storage.py +59 -14
  45. lamindb/models/transform.py +17 -17
  46. lamindb/models/ulabel.py +6 -1
  47. {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/METADATA +12 -18
  48. {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/RECORD +50 -47
  49. {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/WHEEL +1 -1
  50. {lamindb-1.10.2.dist-info/licenses → lamindb-1.11.0.dist-info}/LICENSE +0 -0
lamindb/curators/core.py CHANGED
@@ -5,6 +5,7 @@
5
5
 
6
6
  Curator
7
7
  SlotsCurator
8
+ ComponentCurator
8
9
  CatVector
9
10
  CatLookup
10
11
  DataFrameCatManager
@@ -15,7 +16,6 @@ from __future__ import annotations
15
16
 
16
17
  import copy
17
18
  import re
18
- from collections.abc import Iterable
19
19
  from typing import TYPE_CHECKING, Any, Callable
20
20
 
21
21
  import lamindb_setup as ln_setup
@@ -24,7 +24,9 @@ import pandas as pd
24
24
  import pandera.pandas as pandera
25
25
  from lamin_utils import colors, logger
26
26
  from lamindb_setup.core._docs import doc_args
27
+ from lamindb_setup.core.upath import LocalPathClasses
27
28
 
29
+ from lamindb.base.dtypes import check_dtype
28
30
  from lamindb.base.types import FieldAttr # noqa
29
31
  from lamindb.models import (
30
32
  Artifact,
@@ -48,6 +50,7 @@ from lamindb.models.feature import (
48
50
  from ..errors import InvalidArgument, ValidationError
49
51
 
50
52
  if TYPE_CHECKING:
53
+ from collections.abc import Iterable
51
54
  from typing import Any
52
55
 
53
56
  from anndata import AnnData
@@ -145,6 +148,7 @@ CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries."""
145
148
 
146
149
  SLOTS_DOCSTRING = """Access sub curators by slot."""
147
150
 
151
+ SLOTS_DETAILS_DOCSTRING = """Uses **slots** to specify which component contains which schema. Slots are keys that identify where features are stored within composite data structures."""
148
152
 
149
153
  VALIDATE_DOCSTRING = """Validate dataset against Schema.
150
154
 
@@ -197,7 +201,21 @@ class Curator:
197
201
  "MuData",
198
202
  "SpatialData",
199
203
  }:
200
- self._dataset = self._dataset.load(is_run_input=False)
204
+ # Open remote AnnData Artifacts
205
+ if not isinstance(self._artifact.path, LocalPathClasses):
206
+ if self._artifact.otype in {
207
+ "AnnData",
208
+ }:
209
+ try:
210
+ self._dataset = self._dataset.open(mode="r")
211
+ # open can raise various errors. Fall back to loading into memory if open fails
212
+ except Exception as e:
213
+ logger.warning(
214
+ f"Unable to open remote AnnData Artifact: {e}. Falling back to loading into memory."
215
+ )
216
+ self._dataset = self._dataset.load(is_run_input=False)
217
+ else:
218
+ self._dataset = self._dataset.load(is_run_input=False)
201
219
  self._schema: Schema | None = schema
202
220
  self._is_validated: bool = False
203
221
 
@@ -284,9 +302,12 @@ class Curator:
284
302
  )
285
303
 
286
304
 
305
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
287
306
  class SlotsCurator(Curator):
288
307
  """Curator for a dataset with slots.
289
308
 
309
+ {}
310
+
290
311
  Args:
291
312
  dataset: The dataset to validate & annotate.
292
313
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
@@ -298,7 +319,7 @@ class SlotsCurator(Curator):
298
319
  schema: Schema,
299
320
  ) -> None:
300
321
  super().__init__(dataset=dataset, schema=schema)
301
- self._slots: dict[str, DataFrameCurator] = {}
322
+ self._slots: dict[str, ComponentCurator] = {}
302
323
 
303
324
  # used for multimodal data structures (not AnnData)
304
325
  # in form of {table/modality_key: var_field}
@@ -308,7 +329,7 @@ class SlotsCurator(Curator):
308
329
 
309
330
  @property
310
331
  @doc_args(SLOTS_DOCSTRING)
311
- def slots(self) -> dict[str, DataFrameCurator]:
332
+ def slots(self) -> dict[str, ComponentCurator]:
312
333
  """{}""" # noqa: D415
313
334
  return self._slots
314
335
 
@@ -336,6 +357,10 @@ class SlotsCurator(Curator):
336
357
 
337
358
  if self._artifact is None:
338
359
  type_mapping = [
360
+ (
361
+ lambda dataset: isinstance(dataset, pd.DataFrame),
362
+ Artifact.from_dataframe,
363
+ ),
339
364
  (
340
365
  lambda dataset: data_is_scversedatastructure(dataset, "AnnData"),
341
366
  Artifact.from_anndata,
@@ -378,92 +403,21 @@ class SlotsCurator(Curator):
378
403
  )
379
404
 
380
405
 
381
- def is_list_of_type(value, expected_type):
382
- """Helper function to check if a value is either of expected_type or a list of that type, or a mix of both in a nested structure."""
383
- if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
384
- # handle nested lists recursively
385
- return all(is_list_of_type(item, expected_type) for item in value)
386
- return isinstance(value, expected_type)
387
-
388
-
389
- def check_dtype(expected_type) -> Callable:
390
- """Creates a check function for Pandera that validates a column's dtype.
391
-
392
- Supports both standard dtype checking and mixed list/single values for the same type.
393
- For example, a column with expected_type 'float' would also accept a mix of float values and lists of floats.
394
-
395
- Args:
396
- expected_type: String identifier for the expected type ('int', 'float', 'num', 'str')
397
-
398
- Returns:
399
- A function that checks if a series has the expected dtype or contains mixed types
400
- """
401
-
402
- def check_function(series):
403
- # first check if the series is entirely of the expected dtype (fast path)
404
- if expected_type == "int" and pd.api.types.is_integer_dtype(series.dtype):
405
- return True
406
- elif expected_type == "float" and pd.api.types.is_float_dtype(series.dtype):
407
- return True
408
- elif expected_type == "num" and pd.api.types.is_numeric_dtype(series.dtype):
409
- return True
410
- elif expected_type == "str" and pd.api.types.is_string_dtype(series.dtype):
411
- return True
412
- elif expected_type == "path" and pd.api.types.is_string_dtype(series.dtype):
413
- return True
414
-
415
- # if we're here, it might be a mixed column with object dtype
416
- # need to check each value individually
417
- if series.dtype == "object" and expected_type.startswith("list"):
418
- expected_type_member = expected_type.replace("list[", "").removesuffix("]")
419
- if expected_type_member == "int":
420
- return series.apply(lambda x: is_list_of_type(x, int)).all()
421
- elif expected_type_member == "float":
422
- return series.apply(lambda x: is_list_of_type(x, float)).all()
423
- elif expected_type_member == "num":
424
- # for numeric, accept either int or float
425
- return series.apply(lambda x: is_list_of_type(x, (int, float))).all()
426
- elif (
427
- expected_type_member == "str"
428
- or expected_type_member == "path"
429
- or expected_type_member.startswith("cat[")
430
- ):
431
- return series.apply(lambda x: is_list_of_type(x, str)).all()
432
-
433
- # if we get here, the validation failed
434
- return False
435
-
436
- return check_function
437
-
438
-
439
- # this is also currently used as DictCurator
440
- class DataFrameCurator(Curator):
441
- # the example in the docstring is tested in test_curators_quickstart_example
406
+ # This is also currently used as DictCurator by flattening dictionaries into wide DataFrames.
407
+ # Such an approach was never intended and there is room for a DictCurator in the future.
408
+ # For more context, read https://laminlabs.slack.com/archives/C07DB677JF6/p1753994077716099 and
409
+ # https://www.notion.so/laminlabs/Add-a-DictCurator-2422aeaa55e180b9a513f91d13970836
410
+ class ComponentCurator(Curator):
442
411
  """Curator for `DataFrame`.
443
412
 
413
+ Provides all key functionality to validate Pandas DataFrames.
414
+ This class is not user facing unlike :class:`~lamindb.curators.DataFrameCurator` which extends this
415
+ class with functionality to validate the `attrs` slot.
416
+
444
417
  Args:
445
418
  dataset: The DataFrame-like object to validate & annotate.
446
419
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
447
420
  slot: Indicate the slot in a composite curator for a composite data structure.
448
-
449
- Example:
450
-
451
- For simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_df`.
452
-
453
- Here is an example that enforces a minimal set of columns in the dataframe.
454
-
455
- .. literalinclude:: scripts/curate_dataframe_minimal_errors.py
456
- :language: python
457
-
458
- Under-the-hood, this used the following schema.
459
-
460
- .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
461
- :language: python
462
-
463
- Valid features & labels were defined as:
464
-
465
- .. literalinclude:: scripts/define_mini_immuno_features_labels.py
466
- :language: python
467
421
  """
468
422
 
469
423
  def __init__(
@@ -479,18 +433,18 @@ class DataFrameCurator(Curator):
479
433
  feature_ids: set[int] = set()
480
434
 
481
435
  if schema.flexible:
482
- features += Feature.filter(name__in=self._dataset.keys()).list()
436
+ features += Feature.filter(name__in=self._dataset.keys()).to_list()
483
437
  feature_ids = {feature.id for feature in features}
484
438
 
485
439
  if schema.n > 0:
486
440
  if schema._index_feature_uid is not None:
487
441
  schema_features = [
488
442
  feature
489
- for feature in schema.members.list()
443
+ for feature in schema.members.to_list()
490
444
  if feature.uid != schema._index_feature_uid # type: ignore
491
445
  ]
492
446
  else:
493
- schema_features = schema.members.list() # type: ignore
447
+ schema_features = schema.members.to_list() # type: ignore
494
448
  if feature_ids:
495
449
  features.extend(
496
450
  feature
@@ -581,9 +535,13 @@ class DataFrameCurator(Curator):
581
535
  # in the DataFrameCatManager, we use the
582
536
  # actual columns of the dataset, not the pandera columns
583
537
  # the pandera columns might have additional optional columns
538
+ if schema.itype == "Composite":
539
+ columns_field = Feature.name
540
+ else:
541
+ columns_field = parse_cat_dtype(schema.itype, is_itype=True)["field"]
584
542
  self._cat_manager = DataFrameCatManager(
585
543
  self._dataset,
586
- columns_field=parse_cat_dtype(schema.itype, is_itype=True)["field"],
544
+ columns_field=columns_field,
587
545
  categoricals=categoricals,
588
546
  index=schema.index,
589
547
  slot=slot,
@@ -602,6 +560,11 @@ class DataFrameCurator(Curator):
602
560
  - Adds missing columns for features
603
561
  - Fills missing values for features with default values
604
562
  """
563
+ if self._artifact is not None:
564
+ raise RuntimeError(
565
+ "Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator."
566
+ )
567
+
605
568
  for feature in self._schema.members:
606
569
  if feature.name not in self._dataset.columns:
607
570
  if feature.default_value is not None or feature.nullable:
@@ -680,7 +643,7 @@ class DataFrameCurator(Curator):
680
643
  if not self._is_validated:
681
644
  self.validate() # raises ValidationError if doesn't validate
682
645
  if self._artifact is None:
683
- self._artifact = Artifact.from_df(
646
+ self._artifact = Artifact.from_dataframe(
684
647
  self._dataset,
685
648
  key=key,
686
649
  description=description,
@@ -697,19 +660,245 @@ class DataFrameCurator(Curator):
697
660
  )
698
661
 
699
662
 
663
+ class DataFrameCurator(SlotsCurator):
664
+ # the example in the docstring is tested in test_curators_quickstart_example
665
+ """Curator for `DataFrame`.
666
+
667
+ Args:
668
+ dataset: The DataFrame-like object to validate & annotate.
669
+ schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
670
+ slot: Indicate the slot in a composite curator for a composite data structure.
671
+
672
+ Examples:
673
+
674
+ For a simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_dataframe`.
675
+
676
+ Here is an example that enforces a minimal set of columns in the dataframe.
677
+
678
+ .. literalinclude:: scripts/curate_dataframe_minimal_errors.py
679
+ :language: python
680
+
681
+ Under-the-hood, this used the following schema.
682
+
683
+ .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
684
+ :language: python
685
+
686
+ Valid features & labels were defined as:
687
+
688
+ .. literalinclude:: scripts/define_mini_immuno_features_labels.py
689
+ :language: python
690
+
691
+ It is also possible to curate the `attrs` slot.
692
+
693
+ .. literalinclude:: scripts/curate_dataframe_attrs.py
694
+ :language: python
695
+ """
696
+
697
+ def __init__(
698
+ self,
699
+ dataset: pd.DataFrame | Artifact,
700
+ schema: Schema,
701
+ slot: str | None = None,
702
+ ) -> None:
703
+ super().__init__(dataset=dataset, schema=schema)
704
+
705
+ # Create atomic curator for features only
706
+ if len(self._schema.features.all()) > 0:
707
+ self._atomic_curator = ComponentCurator(
708
+ dataset=dataset,
709
+ schema=schema,
710
+ slot=slot,
711
+ )
712
+
713
+ # Handle (nested) attrs
714
+ if slot is None and schema.slots:
715
+ for slot_name, slot_schema in schema.slots.items():
716
+ if slot_name.startswith("attrs"):
717
+ path_parts = slot_name.split(":")
718
+ attrs_dict = getattr(self._dataset, "attrs", None)
719
+ if attrs_dict is not None:
720
+ if len(path_parts) == 1:
721
+ data = attrs_dict
722
+ else:
723
+ deeper_keys = path_parts[1:]
724
+ data = _resolve_schema_slot_path(
725
+ attrs_dict, deeper_keys, slot_name, "attrs"
726
+ )
727
+ df = pd.DataFrame([data])
728
+ self._slots[slot_name] = ComponentCurator(
729
+ df, slot_schema, slot=slot_name
730
+ )
731
+ else:
732
+ raise ValueError(
733
+ f"Slot '{slot_name}' is not supported for DataFrameCurator. Must be 'attrs'."
734
+ )
735
+
736
+ @property
737
+ def cat(self) -> DataFrameCatManager:
738
+ """Manage categoricals by updating registries."""
739
+ if hasattr(self, "_atomic_curator"):
740
+ return self._atomic_curator.cat
741
+ raise AttributeError("cat is only available for slots DataFrameCurator")
742
+
743
+ def standardize(self) -> None:
744
+ """Standardize the dataset.
745
+
746
+ - Adds missing columns for features
747
+ - Fills missing values for features with default values
748
+ """
749
+ if hasattr(self, "_atomic_curator"):
750
+ self._atomic_curator.standardize()
751
+ else:
752
+ for slot_curator in self._slots.values():
753
+ slot_curator.standardize()
754
+
755
+ @doc_args(VALIDATE_DOCSTRING)
756
+ def validate(self) -> None:
757
+ """{}."""
758
+ if hasattr(self, "_atomic_curator"):
759
+ self._atomic_curator.validate()
760
+ self._is_validated = self._atomic_curator._is_validated
761
+ if self._schema.itype == "Composite":
762
+ super().validate()
763
+
764
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
765
+ def save_artifact(
766
+ self, *, key=None, description=None, revises=None, run=None
767
+ ) -> Artifact:
768
+ """{}."""
769
+ if not self._is_validated:
770
+ self.validate()
771
+
772
+ if self._slots:
773
+ self._slots["columns"] = self._atomic_curator
774
+ try:
775
+ return super().save_artifact(
776
+ key=key, description=description, revises=revises, run=run
777
+ )
778
+ finally:
779
+ del self._slots["columns"]
780
+ else:
781
+ return self._atomic_curator.save_artifact(
782
+ key=key, description=description, revises=revises, run=run
783
+ )
784
+
785
+
786
+ def _resolve_schema_slot_path(
787
+ target_dict: dict[str, Any], slot_keys: Iterable[str], slot: str, base_path: str
788
+ ) -> Any:
789
+ """Resolve a schema slot path by traversing nested dictionary keys.
790
+
791
+ Args:
792
+ target_dict: Root dictionary to traverse
793
+ slot_keys: Sequence of keys defining the paths to traverse
794
+ slot_name: Schema slot identifier for error context
795
+ base_path: Base path string for error context
796
+
797
+ Returns:
798
+ The value at the resolved path
799
+ """
800
+ current = target_dict
801
+
802
+ for key in slot_keys:
803
+ base_path += f"['{key}']"
804
+ try:
805
+ current = current[key]
806
+ except KeyError:
807
+ available = (
808
+ list(current.keys()) if isinstance(current, dict) else "not a dict"
809
+ )
810
+ raise InvalidArgument(
811
+ f"Schema slot '{slot}' requires keys {base_path} but key '{key}' "
812
+ f"not found. Available keys at this level: {available}"
813
+ ) from None
814
+
815
+ return current
816
+
817
+
818
+ def _handle_dict_slots(
819
+ dataset: ScverseDataStructures, slot: str
820
+ ) -> tuple[pd.DataFrame | None, str | None, str | None]:
821
+ """Handle dict-based slot paths (uns/attrs standalone or of modalities) for all ScverseCurators.
822
+
823
+ Supports two patterns:
824
+ - Direct dict access: "uns", "attrs", "uns:key1:key2", "attrs:key"
825
+ - Modality dict access: "modality:uns"
826
+
827
+ Args:
828
+ dataset: The scverse datastructure object
829
+ slot: The slot path string to parse like 'uns:path:to'.
830
+
831
+ Returns:
832
+ tuple: (dataframe, modality_key, remaining_slot_path)
833
+ - dataframe: Single-row DataFrame containing the resolved data
834
+ - modality_key: Modality identifier if slot targets modality dict, else None
835
+ - remaining_slot_path: The dict attribute and nested keys as string
836
+ """
837
+ path_parts = slot.split(":")
838
+
839
+ # Handle direct dict slots: "uns", "attrs", "uns:key1:key2:..."
840
+ if len(path_parts) >= 1 and path_parts[0] in ["uns", "attrs"]:
841
+ dict_attr = getattr(dataset, path_parts[0], None)
842
+ if dict_attr is not None:
843
+ if len(path_parts) == 1:
844
+ return pd.DataFrame([dict_attr]), None, path_parts[0]
845
+
846
+ deeper_keys = path_parts[1:]
847
+ data = _resolve_schema_slot_path(
848
+ dict_attr, deeper_keys, slot, path_parts[0]
849
+ )
850
+ return pd.DataFrame([data]), None, ":".join(path_parts[1:])
851
+
852
+ # Handle modality dict slots: "modality:uns", "modality:uns:key1:key2"
853
+ elif len(path_parts) >= 2 and path_parts[1] in ["uns", "attrs"]:
854
+ modality, dict_name = path_parts[0], path_parts[1]
855
+ try:
856
+ modality_dataset = dataset[modality]
857
+ dict_attr = getattr(modality_dataset, dict_name, None)
858
+ if dict_attr is not None:
859
+ if len(path_parts) == 2:
860
+ return pd.DataFrame([dict_attr]), modality, dict_name
861
+
862
+ deeper_keys = path_parts[2:]
863
+ data = _resolve_schema_slot_path(
864
+ dict_attr, deeper_keys, slot, f"{modality}.{dict_name}"
865
+ )
866
+ return pd.DataFrame([data]), modality, ":".join(path_parts[1:])
867
+ except (KeyError, AttributeError):
868
+ pass
869
+ else:
870
+ raise InvalidArgument(
871
+ f"Invalid dict slot pattern '{slot}'. Expected formats: "
872
+ f"'uns', 'attrs', 'uns:key', 'attrs:key', 'modality:uns'"
873
+ )
874
+
875
+ return None, None, None
876
+
877
+
878
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
700
879
  class AnnDataCurator(SlotsCurator):
701
880
  """Curator for `AnnData`.
702
881
 
882
+ {}
883
+
703
884
  Args:
704
885
  dataset: The AnnData-like object to validate & annotate.
705
886
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
706
887
 
707
- Example:
888
+ Examples:
889
+
890
+ Curate Ensembl gene IDs and valid features in obs:
708
891
 
709
892
  .. literalinclude:: scripts/curate_anndata_flexible.py
710
893
  :language: python
711
894
  :caption: curate_anndata_flexible.py
712
895
 
896
+ Curate `uns` dictionary:
897
+
898
+ .. literalinclude:: scripts/curate_anndata_uns.py
899
+ :language: python
900
+ :caption: curate_anndata_uns.py
901
+
713
902
  See Also:
714
903
  :meth:`~lamindb.Artifact.from_anndata`.
715
904
  """
@@ -722,34 +911,37 @@ class AnnDataCurator(SlotsCurator):
722
911
  super().__init__(dataset=dataset, schema=schema)
723
912
  if not data_is_scversedatastructure(self._dataset, "AnnData"):
724
913
  raise InvalidArgument("dataset must be AnnData-like.")
725
- if schema.otype != "AnnData":
914
+ if schema.otype and schema.otype != "AnnData":
726
915
  raise InvalidArgument("Schema otype must be 'AnnData'.")
727
- self._slots = {
728
- slot: DataFrameCurator(
729
- (
916
+
917
+ for slot, slot_schema in schema.slots.items():
918
+ if slot not in {"var", "var.T", "obs"} and not slot.startswith("uns"):
919
+ raise ValueError(
920
+ f"AnnDataCurator currently only supports the slots 'var', 'var.T', 'obs', and 'uns', not {slot}"
921
+ )
922
+ if slot.startswith("uns"):
923
+ df, _, _ = _handle_dict_slots(self._dataset, slot)
924
+ elif slot in {"obs", "var", "var.T"}:
925
+ df = (
730
926
  getattr(self._dataset, slot.strip(".T")).T
731
927
  if slot == "var.T"
732
928
  or (
733
- # backward compat
734
929
  slot == "var"
735
930
  and schema.slots["var"].itype not in {None, "Feature"}
736
931
  )
737
932
  else getattr(self._dataset, slot)
738
- ),
739
- slot_schema,
740
- slot=slot,
741
- )
742
- for slot, slot_schema in schema.slots.items()
743
- if slot in {"obs", "var", "var.T", "uns"}
744
- }
745
- if "var" in self._slots and schema.slots["var"].itype not in {None, "Feature"}:
746
- logger.warning(
747
- "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
748
- )
749
- self._slots["var"].cat._cat_vectors["var_index"] = self._slots[
750
- "var"
751
- ].cat._cat_vectors.pop("columns")
752
- self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index"
933
+ )
934
+ self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)
935
+
936
+ # Handle var index naming for backward compat
937
+ if slot == "var" and schema.slots["var"].itype not in {None, "Feature"}:
938
+ logger.warning(
939
+ "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
940
+ )
941
+ self._slots["var"].cat._cat_vectors["var_index"] = self._slots[
942
+ "var"
943
+ ].cat._cat_vectors.pop("columns")
944
+ self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index"
753
945
 
754
946
 
755
947
  def _assign_var_fields_categoricals_multimodal(
@@ -759,11 +951,10 @@ def _assign_var_fields_categoricals_multimodal(
759
951
  slot_schema: Schema,
760
952
  var_fields: dict[str, FieldAttr],
761
953
  cat_vectors: dict[str, dict[str, CatVector]],
762
- slots: dict[str, DataFrameCurator],
954
+ slots: dict[str, ComponentCurator],
763
955
  ) -> None:
764
956
  """Assigns var_fields and categoricals for multimodal data curators."""
765
957
  if modality is not None:
766
- # Makes sure that all tables are present
767
958
  var_fields[modality] = None
768
959
  cat_vectors[modality] = {}
769
960
 
@@ -784,15 +975,17 @@ def _assign_var_fields_categoricals_multimodal(
784
975
  cat_vectors[modality] = obs_fields
785
976
 
786
977
 
978
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
787
979
  class MuDataCurator(SlotsCurator):
788
980
  """Curator for `MuData`.
789
981
 
982
+ {}
983
+
790
984
  Args:
791
985
  dataset: The MuData-like object to validate & annotate.
792
986
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
793
987
 
794
988
  Example:
795
-
796
989
  .. literalinclude:: scripts/curate_mudata.py
797
990
  :language: python
798
991
  :caption: curate_mudata.py
@@ -813,12 +1006,32 @@ class MuDataCurator(SlotsCurator):
813
1006
  raise InvalidArgument("Schema otype must be 'MuData'.")
814
1007
 
815
1008
  for slot, slot_schema in schema.slots.items():
816
- if ":" in slot:
817
- modality, modality_slot = slot.split(":")
818
- schema_dataset = self._dataset.__getitem__(modality)
1009
+ # Handle slots: "mdata.uns", "modality:uns"
1010
+ if "uns" in slot:
1011
+ df, modality, modality_slot = _handle_dict_slots(self._dataset, slot)
819
1012
  else:
820
- modality, modality_slot = None, slot
821
- schema_dataset = self._dataset
1013
+ # Handle slots: "modality:obs", "modality:var"
1014
+ parts = slot.split(":")
1015
+ if len(parts) == 2:
1016
+ modality, modality_slot = parts
1017
+ try:
1018
+ schema_dataset = self._dataset[modality]
1019
+ df = getattr(schema_dataset, modality_slot.rstrip(".T"))
1020
+ except KeyError:
1021
+ raise InvalidArgument(
1022
+ f"Modality '{modality}' not found in MuData"
1023
+ ) from None
1024
+ except AttributeError:
1025
+ raise InvalidArgument(
1026
+ f"Attribute '{modality_slot}' not found on modality '{modality}'"
1027
+ ) from None
1028
+ else:
1029
+ # Handle slots: "mdata:obs", "mdata:var" (uns is a dictionary and gets handled above)
1030
+ modality, modality_slot = None, slot
1031
+ schema_dataset = self._dataset
1032
+ df = getattr(schema_dataset, modality_slot.rstrip(".T"))
1033
+
1034
+ # Transpose var if necessary
822
1035
  if modality_slot == "var" and schema.slots[slot].itype not in {
823
1036
  None,
824
1037
  "Feature",
@@ -826,19 +1039,12 @@ class MuDataCurator(SlotsCurator):
826
1039
  logger.warning(
827
1040
  "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
828
1041
  )
829
- self._slots[slot] = DataFrameCurator(
830
- (
831
- getattr(schema_dataset, modality_slot.rstrip(".T")).T
832
- if modality_slot == "var.T"
833
- or (
834
- # backward compat
835
- modality_slot == "var"
836
- and schema.slots[slot].itype not in {None, "Feature"}
837
- )
838
- else getattr(schema_dataset, modality_slot)
839
- ),
840
- slot_schema,
841
- )
1042
+ df = df.T
1043
+ elif modality_slot == "var.T":
1044
+ df = df.T
1045
+
1046
+ self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)
1047
+
842
1048
  _assign_var_fields_categoricals_multimodal(
843
1049
  modality=modality,
844
1050
  slot_type=modality_slot,
@@ -848,18 +1054,21 @@ class MuDataCurator(SlotsCurator):
848
1054
  cat_vectors=self._cat_vectors,
849
1055
  slots=self._slots,
850
1056
  )
1057
+
851
1058
  self._columns_field = self._var_fields
852
1059
 
853
1060
 
1061
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
854
1062
  class SpatialDataCurator(SlotsCurator):
855
1063
  """Curator for `SpatialData`.
856
1064
 
1065
+ {}
1066
+
857
1067
  Args:
858
1068
  dataset: The SpatialData-like object to validate & annotate.
859
1069
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
860
1070
 
861
1071
  Example:
862
-
863
1072
  .. literalinclude:: scripts/curate_spatialdata.py
864
1073
  :language: python
865
1074
  :caption: curate_spatialdata.py
@@ -880,69 +1089,75 @@ class SpatialDataCurator(SlotsCurator):
880
1089
  raise InvalidArgument("Schema otype must be 'SpatialData'.")
881
1090
 
882
1091
  for slot, slot_schema in schema.slots.items():
883
- split_result = slot.split(":")
884
- if (len(split_result) == 2 and split_result[0] == "table") or (
885
- len(split_result) == 3 and split_result[0] == "tables"
886
- ):
887
- if len(split_result) == 2:
888
- table_key, sub_slot = split_result
889
- logger.warning(
890
- f"please prefix slot {slot} with 'tables:' going forward"
891
- )
1092
+ # Handle slots: "sdata:attrs"
1093
+ if slot.startswith("attrs"):
1094
+ df, table_key, table_slot = _handle_dict_slots(self._dataset, slot)
1095
+ else:
1096
+ parts = slot.split(":")
1097
+ # Handle slots: "tables:table_key:obs", "tables:table_key:var"
1098
+ if len(parts) == 3 and parts[0] == "tables":
1099
+ table_key, table_slot = parts[1], parts[2]
1100
+ try:
1101
+ slot_object = self._dataset.tables[table_key]
1102
+ df = getattr(slot_object, table_slot.rstrip(".T"))
1103
+ except KeyError:
1104
+ raise InvalidArgument(
1105
+ f"Table '{table_key}' not found in sdata.tables"
1106
+ ) from None
1107
+ except AttributeError:
1108
+ raise InvalidArgument(
1109
+ f"Attribute '{table_slot}' not found on table '{table_key}'"
1110
+ ) from None
892
1111
  else:
893
- table_key, sub_slot = split_result[1], split_result[2]
894
- slot_object = self._dataset.tables.__getitem__(table_key)
895
- if sub_slot == "var" and schema.slots[slot].itype not in {
896
- None,
897
- "Feature",
898
- }:
899
- logger.warning(
900
- "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
901
- )
902
- data_object = (
903
- getattr(slot_object, sub_slot.rstrip(".T")).T
904
- if sub_slot == "var.T"
905
- or (
906
- # backward compat
907
- sub_slot == "var"
908
- and schema.slots[slot].itype not in {None, "Feature"}
909
- )
910
- else getattr(slot_object, sub_slot)
911
- )
912
- elif len(split_result) == 1 or (
913
- len(split_result) > 1 and split_result[0] == "attrs"
914
- ):
915
- table_key = None
916
- if len(split_result) == 1:
917
- if split_result[0] != "attrs":
1112
+ # Handle legacy single keys for backward compatibility
1113
+ if len(parts) == 1 and parts[0] != "attrs":
918
1114
  logger.warning(
919
1115
  f"please prefix slot {slot} with 'attrs:' going forward"
920
1116
  )
921
- sub_slot = slot
922
- data_object = self._dataset.attrs[slot]
1117
+ try:
1118
+ df = pd.DataFrame([self._dataset.attrs[slot]])
1119
+ table_key = None
1120
+ table_slot = slot
1121
+ except KeyError:
1122
+ raise InvalidArgument(
1123
+ f"Slot '{slot}' not found in sdata.attrs"
1124
+ ) from None
923
1125
  else:
924
- sub_slot = "attrs"
925
- data_object = self._dataset.attrs
926
- elif len(split_result) == 2:
927
- sub_slot = split_result[1]
928
- data_object = self._dataset.attrs[split_result[1]]
929
- data_object = pd.DataFrame([data_object])
930
- self._slots[slot] = DataFrameCurator(data_object, slot_schema, slot)
1126
+ raise InvalidArgument(f"Unrecognized slot format: {slot}")
1127
+
1128
+ # Handle var transposition logic
1129
+ if table_slot == "var" and schema.slots[slot].itype not in {
1130
+ None,
1131
+ "Feature",
1132
+ }:
1133
+ logger.warning(
1134
+ "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
1135
+ )
1136
+ df = df.T
1137
+ elif table_slot == "var.T":
1138
+ df = df.T
1139
+
1140
+ self._slots[slot] = ComponentCurator(df, slot_schema, slot)
1141
+
931
1142
  _assign_var_fields_categoricals_multimodal(
932
1143
  modality=table_key,
933
- slot_type=sub_slot,
1144
+ slot_type=table_slot,
934
1145
  slot=slot,
935
1146
  slot_schema=slot_schema,
936
1147
  var_fields=self._var_fields,
937
1148
  cat_vectors=self._cat_vectors,
938
1149
  slots=self._slots,
939
1150
  )
1151
+
940
1152
  self._columns_field = self._var_fields
941
1153
 
942
1154
 
1155
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
943
1156
  class TiledbsomaExperimentCurator(SlotsCurator):
944
1157
  """Curator for `tiledbsoma.Experiment`.
945
1158
 
1159
+ {}
1160
+
946
1161
  Args:
947
1162
  dataset: The `tiledbsoma.Experiment` object.
948
1163
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
@@ -979,7 +1194,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
979
1194
  .drop("soma_joinid", axis=1, errors="ignore")
980
1195
  )
981
1196
 
982
- self._slots[slot] = DataFrameCurator(
1197
+ self._slots[slot] = ComponentCurator(
983
1198
  (schema_dataset.T if modality_slot == "var.T" else schema_dataset),
984
1199
  slot_schema,
985
1200
  )
@@ -992,7 +1207,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
992
1207
  .to_pandas()
993
1208
  .drop(["soma_joinid", "obs_id"], axis=1, errors="ignore")
994
1209
  )
995
- self._slots[slot] = DataFrameCurator(
1210
+ self._slots[slot] = ComponentCurator(
996
1211
  schema_dataset,
997
1212
  slot_schema,
998
1213
  )
@@ -1042,9 +1257,12 @@ class CatVector:
1042
1257
  self._maximal_set = maximal_set
1043
1258
 
1044
1259
  self._all_filters = {"source": self._source, "organism": self._organism}
1260
+
1045
1261
  if self._subtype_str and "=" in self._subtype_str:
1046
1262
  self._all_filters.update(
1047
- resolve_relation_filters(parse_filter_string(self._subtype_str), self) # type: ignore
1263
+ resolve_relation_filters(
1264
+ parse_filter_string(self._subtype_str), self._field.field.model
1265
+ ) # type: ignore
1048
1266
  )
1049
1267
 
1050
1268
  if hasattr(field.field.model, "_name_field"):
@@ -1243,7 +1461,7 @@ class CatVector:
1243
1461
  type_record = registry.get(name=self._subtype_str)
1244
1462
  if df is not None and registry == Feature:
1245
1463
  nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
1246
- non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
1464
+ non_validated_records = Feature.from_dataframe(df.loc[:, nonval_columns])
1247
1465
  else:
1248
1466
  if (
1249
1467
  self._organism
@@ -1345,7 +1563,7 @@ class CatVector:
1345
1563
  warning_message += "\n for remaining terms:\n"
1346
1564
  warning_message += f" → fix typos, remove non-existent values, or save terms via: {colors.cyan(non_validated_hint_print)}"
1347
1565
  if self._subtype_query_set is not None:
1348
- warning_message += f"\n → a valid label for subtype '{self._subtype_str}' has to be one of {self._subtype_query_set.list('name')}"
1566
+ warning_message += f"\n → a valid label for subtype '{self._subtype_str}' has to be one of {self._subtype_query_set.to_list('name')}"
1349
1567
  logger.info(f'mapping "{self._key}" on {colors.italic(model_field)}')
1350
1568
  logger.warning(warning_message)
1351
1569
  if self._cat_manager is not None:
@@ -1495,6 +1713,30 @@ class DataFrameCatManager:
1495
1713
  """The categorical features."""
1496
1714
  return self._categoricals
1497
1715
 
1716
def __repr__(self) -> str:
    """Return a compact, colorized one-line summary of this manager.

    The result has the form ``ClassName(<details>, <status>)``, where
    ``<details>`` lists the number of categorical features (only when it is
    greater than zero) and the slot (only when it is truthy), and
    ``<status>`` is ``validated`` or ``unvalidated`` depending on
    ``self._is_validated``. When there are no details, only the status is
    shown inside the parentheses.
    """
    colored_name = colors.green(self.__class__.__name__)

    if self._is_validated:
        status = f"{colors.green('validated')}"
    else:
        status = f"{colors.yellow('unvalidated')}"

    # Collect the optional details first; the status is always the last entry.
    pieces = []
    n_categoricals = len(self._categoricals)
    if n_categoricals > 0:
        pieces.append(f"categorical_features={n_categoricals}")
    if self._slot:
        pieces.append(f"slot: {colors.italic(self._slot)}")
    pieces.append(status)

    return f"{colored_name}({', '.join(pieces)})"
1739
+
1498
1740
  def lookup(self, public: bool = False) -> CatLookup:
1499
1741
  """Lookup categories.
1500
1742
 
@@ -1539,7 +1781,9 @@ class DataFrameCatManager:
1539
1781
  key: The key referencing the column in the DataFrame to standardize.
1540
1782
  """
1541
1783
  if self._artifact is not None:
1542
- raise RuntimeError("can't mutate the dataset when an artifact is passed!")
1784
+ raise RuntimeError(
1785
+ "Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator."
1786
+ )
1543
1787
 
1544
1788
  if key == "all":
1545
1789
  logger.warning(
@@ -1612,7 +1856,7 @@ def get_organism_kwargs(
1612
1856
  def annotate_artifact(
1613
1857
  artifact: Artifact,
1614
1858
  *,
1615
- curator: AnnDataCurator | SlotsCurator | None = None,
1859
+ curator: SlotsCurator | None = None,
1616
1860
  cat_vectors: dict[str, CatVector] | None = None,
1617
1861
  ) -> Artifact:
1618
1862
  from .. import settings
@@ -1645,7 +1889,9 @@ def annotate_artifact(
1645
1889
  )
1646
1890
 
1647
1891
  # annotate with inferred schemas aka feature sets
1648
- if artifact.otype == "DataFrame":
1892
+ if (
1893
+ artifact.otype == "DataFrame" and getattr(curator, "_schema", None) is None
1894
+ ): # Prevent overwriting user-defined schemas that contain slots
1649
1895
  features = cat_vectors["columns"].records
1650
1896
  if features is not None:
1651
1897
  index_feature = artifact.schema.index
@@ -1665,7 +1911,11 @@ def annotate_artifact(
1665
1911
  logger.important(
1666
1912
  f"not annotating with {len(features)} features as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)"
1667
1913
  )
1668
- itype = parse_cat_dtype(artifact.schema.itype, is_itype=True)["field"]
1914
+ itype = (
1915
+ Feature.name
1916
+ if artifact.schema.itype == "Composite"
1917
+ else parse_cat_dtype(artifact.schema.itype, is_itype=True)["field"]
1918
+ )
1669
1919
  feature_set = Schema(itype=itype, n=len(features))
1670
1920
  artifact.feature_sets.add(
1671
1921
  feature_set.save(), through_defaults={"slot": "columns"}
@@ -1700,9 +1950,13 @@ def annotate_artifact(
1700
1950
  logger.important(
1701
1951
  f"not annotating with {len(features)} features for slot {slot} as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)"
1702
1952
  )
1703
- itype = parse_cat_dtype(
1704
- artifact.schema.slots[slot].itype, is_itype=True
1705
- )["field"]
1953
+ itype = (
1954
+ Feature.name
1955
+ if artifact.schema.slots[slot].itype == "Composite"
1956
+ else parse_cat_dtype(
1957
+ artifact.schema.slots[slot].itype, is_itype=True
1958
+ )["field"]
1959
+ )
1706
1960
  feature_set = Schema(itype=itype, n=len(features))
1707
1961
  artifact.feature_sets.add(
1708
1962
  feature_set.save(), through_defaults={"slot": slot}