lamindb 1.10.1__py3-none-any.whl → 1.11a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. lamindb/__init__.py +89 -49
  2. lamindb/_finish.py +14 -12
  3. lamindb/_tracked.py +2 -4
  4. lamindb/_view.py +1 -1
  5. lamindb/base/__init__.py +2 -1
  6. lamindb/base/dtypes.py +76 -0
  7. lamindb/core/_settings.py +45 -2
  8. lamindb/core/storage/_anndata_accessor.py +118 -26
  9. lamindb/core/storage/_backed_access.py +10 -7
  10. lamindb/core/storage/_spatialdata_accessor.py +15 -4
  11. lamindb/core/storage/_zarr.py +3 -0
  12. lamindb/curators/_legacy.py +16 -3
  13. lamindb/curators/core.py +439 -191
  14. lamindb/examples/cellxgene/__init__.py +8 -3
  15. lamindb/examples/cellxgene/_cellxgene.py +127 -13
  16. lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
  17. lamindb/examples/croissant/__init__.py +12 -2
  18. lamindb/examples/datasets/__init__.py +2 -2
  19. lamindb/examples/datasets/_core.py +1 -1
  20. lamindb/examples/datasets/_small.py +66 -22
  21. lamindb/examples/datasets/mini_immuno.py +1 -0
  22. lamindb/migrations/0118_alter_recordproject_value_projectrecord.py +99 -0
  23. lamindb/migrations/0119_rename_records_project_linked_in_records.py +26 -0
  24. lamindb/migrations/{0117_squashed.py → 0119_squashed.py} +92 -5
  25. lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
  26. lamindb/migrations/0121_recorduser.py +53 -0
  27. lamindb/models/__init__.py +3 -1
  28. lamindb/models/_describe.py +2 -2
  29. lamindb/models/_feature_manager.py +53 -53
  30. lamindb/models/_from_values.py +2 -2
  31. lamindb/models/_is_versioned.py +4 -4
  32. lamindb/models/_label_manager.py +4 -4
  33. lamindb/models/artifact.py +336 -136
  34. lamindb/models/artifact_set.py +36 -1
  35. lamindb/models/can_curate.py +1 -2
  36. lamindb/models/collection.py +3 -34
  37. lamindb/models/feature.py +111 -7
  38. lamindb/models/has_parents.py +11 -11
  39. lamindb/models/project.py +42 -2
  40. lamindb/models/query_manager.py +16 -7
  41. lamindb/models/query_set.py +59 -34
  42. lamindb/models/record.py +25 -4
  43. lamindb/models/run.py +8 -6
  44. lamindb/models/schema.py +54 -26
  45. lamindb/models/sqlrecord.py +123 -25
  46. lamindb/models/storage.py +59 -14
  47. lamindb/models/transform.py +17 -17
  48. lamindb/models/ulabel.py +6 -1
  49. {lamindb-1.10.1.dist-info → lamindb-1.11a1.dist-info}/METADATA +3 -3
  50. {lamindb-1.10.1.dist-info → lamindb-1.11a1.dist-info}/RECORD +52 -47
  51. {lamindb-1.10.1.dist-info → lamindb-1.11a1.dist-info}/LICENSE +0 -0
  52. {lamindb-1.10.1.dist-info → lamindb-1.11a1.dist-info}/WHEEL +0 -0
lamindb/curators/core.py CHANGED
@@ -5,6 +5,7 @@
5
5
 
6
6
  Curator
7
7
  SlotsCurator
8
+ ComponentCurator
8
9
  CatVector
9
10
  CatLookup
10
11
  DataFrameCatManager
@@ -15,7 +16,6 @@ from __future__ import annotations
15
16
 
16
17
  import copy
17
18
  import re
18
- from collections.abc import Iterable
19
19
  from typing import TYPE_CHECKING, Any, Callable
20
20
 
21
21
  import lamindb_setup as ln_setup
@@ -24,7 +24,9 @@ import pandas as pd
24
24
  import pandera.pandas as pandera
25
25
  from lamin_utils import colors, logger
26
26
  from lamindb_setup.core._docs import doc_args
27
+ from lamindb_setup.core.upath import LocalPathClasses
27
28
 
29
+ from lamindb.base.dtypes import check_dtype
28
30
  from lamindb.base.types import FieldAttr # noqa
29
31
  from lamindb.models import (
30
32
  Artifact,
@@ -48,6 +50,7 @@ from lamindb.models.feature import (
48
50
  from ..errors import InvalidArgument, ValidationError
49
51
 
50
52
  if TYPE_CHECKING:
53
+ from collections.abc import Iterable
51
54
  from typing import Any
52
55
 
53
56
  from anndata import AnnData
@@ -145,6 +148,7 @@ CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries."""
145
148
 
146
149
  SLOTS_DOCSTRING = """Access sub curators by slot."""
147
150
 
151
+ SLOTS_DETAILS_DOCSTRING = """Uses **slots** to specify which component contains which schema. Slots are keys that identify where features are stored within composite data structures."""
148
152
 
149
153
  VALIDATE_DOCSTRING = """Validate dataset against Schema.
150
154
 
@@ -197,7 +201,21 @@ class Curator:
197
201
  "MuData",
198
202
  "SpatialData",
199
203
  }:
200
- self._dataset = self._dataset.load(is_run_input=False)
204
+ # Open remote AnnData Artifacts
205
+ if not isinstance(self._artifact.path, LocalPathClasses):
206
+ if self._artifact.otype in {
207
+ "AnnData",
208
+ }:
209
+ try:
210
+ self._dataset = self._dataset.open(mode="r")
211
+ # open can raise various errors. Fall back to loading into memory if open fails
212
+ except Exception as e:
213
+ logger.warning(
214
+ f"Unable to open remote AnnData Artifact: {e}. Falling back to loading into memory."
215
+ )
216
+ self._dataset = self._dataset.load(is_run_input=False)
217
+ else:
218
+ self._dataset = self._dataset.load(is_run_input=False)
201
219
  self._schema: Schema | None = schema
202
220
  self._is_validated: bool = False
203
221
 
@@ -284,9 +302,12 @@ class Curator:
284
302
  )
285
303
 
286
304
 
305
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
287
306
  class SlotsCurator(Curator):
288
307
  """Curator for a dataset with slots.
289
308
 
309
+ {}
310
+
290
311
  Args:
291
312
  dataset: The dataset to validate & annotate.
292
313
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
@@ -298,7 +319,7 @@ class SlotsCurator(Curator):
298
319
  schema: Schema,
299
320
  ) -> None:
300
321
  super().__init__(dataset=dataset, schema=schema)
301
- self._slots: dict[str, DataFrameCurator] = {}
322
+ self._slots: dict[str, ComponentCurator] = {}
302
323
 
303
324
  # used for multimodal data structures (not AnnData)
304
325
  # in form of {table/modality_key: var_field}
@@ -308,7 +329,7 @@ class SlotsCurator(Curator):
308
329
 
309
330
  @property
310
331
  @doc_args(SLOTS_DOCSTRING)
311
- def slots(self) -> dict[str, DataFrameCurator]:
332
+ def slots(self) -> dict[str, ComponentCurator]:
312
333
  """{}""" # noqa: D415
313
334
  return self._slots
314
335
 
@@ -336,6 +357,10 @@ class SlotsCurator(Curator):
336
357
 
337
358
  if self._artifact is None:
338
359
  type_mapping = [
360
+ (
361
+ lambda dataset: isinstance(dataset, pd.DataFrame),
362
+ Artifact.from_dataframe,
363
+ ),
339
364
  (
340
365
  lambda dataset: data_is_scversedatastructure(dataset, "AnnData"),
341
366
  Artifact.from_anndata,
@@ -364,12 +389,13 @@ class SlotsCurator(Curator):
364
389
  )
365
390
  break
366
391
 
367
- self._artifact.schema = self._schema
368
- self._artifact.save()
369
392
  cat_vectors = {}
370
393
  for curator in self._slots.values():
371
394
  for key, cat_vector in curator.cat._cat_vectors.items():
372
395
  cat_vectors[key] = cat_vector
396
+
397
+ self._artifact.schema = self._schema
398
+ self._artifact.save()
373
399
  return annotate_artifact( # type: ignore
374
400
  self._artifact,
375
401
  curator=self,
@@ -377,92 +403,21 @@ class SlotsCurator(Curator):
377
403
  )
378
404
 
379
405
 
380
- def is_list_of_type(value, expected_type):
381
- """Helper function to check if a value is either of expected_type or a list of that type, or a mix of both in a nested structure."""
382
- if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
383
- # handle nested lists recursively
384
- return all(is_list_of_type(item, expected_type) for item in value)
385
- return isinstance(value, expected_type)
386
-
387
-
388
- def check_dtype(expected_type) -> Callable:
389
- """Creates a check function for Pandera that validates a column's dtype.
390
-
391
- Supports both standard dtype checking and mixed list/single values for the same type.
392
- For example, a column with expected_type 'float' would also accept a mix of float values and lists of floats.
393
-
394
- Args:
395
- expected_type: String identifier for the expected type ('int', 'float', 'num', 'str')
396
-
397
- Returns:
398
- A function that checks if a series has the expected dtype or contains mixed types
399
- """
400
-
401
- def check_function(series):
402
- # first check if the series is entirely of the expected dtype (fast path)
403
- if expected_type == "int" and pd.api.types.is_integer_dtype(series.dtype):
404
- return True
405
- elif expected_type == "float" and pd.api.types.is_float_dtype(series.dtype):
406
- return True
407
- elif expected_type == "num" and pd.api.types.is_numeric_dtype(series.dtype):
408
- return True
409
- elif expected_type == "str" and pd.api.types.is_string_dtype(series.dtype):
410
- return True
411
- elif expected_type == "path" and pd.api.types.is_string_dtype(series.dtype):
412
- return True
413
-
414
- # if we're here, it might be a mixed column with object dtype
415
- # need to check each value individually
416
- if series.dtype == "object" and expected_type.startswith("list"):
417
- expected_type_member = expected_type.replace("list[", "").removesuffix("]")
418
- if expected_type_member == "int":
419
- return series.apply(lambda x: is_list_of_type(x, int)).all()
420
- elif expected_type_member == "float":
421
- return series.apply(lambda x: is_list_of_type(x, float)).all()
422
- elif expected_type_member == "num":
423
- # for numeric, accept either int or float
424
- return series.apply(lambda x: is_list_of_type(x, (int, float))).all()
425
- elif (
426
- expected_type_member == "str"
427
- or expected_type_member == "path"
428
- or expected_type_member.startswith("cat[")
429
- ):
430
- return series.apply(lambda x: is_list_of_type(x, str)).all()
431
-
432
- # if we get here, the validation failed
433
- return False
434
-
435
- return check_function
436
-
437
-
438
- # this is also currently used as DictCurator
439
- class DataFrameCurator(Curator):
440
- # the example in the docstring is tested in test_curators_quickstart_example
406
+ # This is also currently used as DictCurator by flattening dictionaries into wide DataFrames.
407
+ # Such an approach was never intended and there is room for a DictCurator in the future.
408
+ # For more context, read https://laminlabs.slack.com/archives/C07DB677JF6/p1753994077716099 and
409
+ # https://www.notion.so/laminlabs/Add-a-DictCurator-2422aeaa55e180b9a513f91d13970836
410
+ class ComponentCurator(Curator):
441
411
  """Curator for `DataFrame`.
442
412
 
413
+ Provides all key functionality to validate Pandas DataFrames.
414
+ This class is not user facing unlike :class:`~lamindb.DataFrameCurator` which extends this
415
+ class with functionality to validate the `attrs` slot.
416
+
443
417
  Args:
444
418
  dataset: The DataFrame-like object to validate & annotate.
445
419
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
446
420
  slot: Indicate the slot in a composite curator for a composite data structure.
447
-
448
- Example:
449
-
450
- For simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_df`.
451
-
452
- Here is an example that enforces a minimal set of columns in the dataframe.
453
-
454
- .. literalinclude:: scripts/curate_dataframe_minimal_errors.py
455
- :language: python
456
-
457
- Under-the-hood, this used the following schema.
458
-
459
- .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
460
- :language: python
461
-
462
- Valid features & labels were defined as:
463
-
464
- .. literalinclude:: scripts/define_mini_immuno_features_labels.py
465
- :language: python
466
421
  """
467
422
 
468
423
  def __init__(
@@ -478,18 +433,18 @@ class DataFrameCurator(Curator):
478
433
  feature_ids: set[int] = set()
479
434
 
480
435
  if schema.flexible:
481
- features += Feature.filter(name__in=self._dataset.keys()).list()
436
+ features += Feature.filter(name__in=self._dataset.keys()).to_list()
482
437
  feature_ids = {feature.id for feature in features}
483
438
 
484
439
  if schema.n > 0:
485
440
  if schema._index_feature_uid is not None:
486
441
  schema_features = [
487
442
  feature
488
- for feature in schema.members.list()
443
+ for feature in schema.members.to_list()
489
444
  if feature.uid != schema._index_feature_uid # type: ignore
490
445
  ]
491
446
  else:
492
- schema_features = schema.members.list() # type: ignore
447
+ schema_features = schema.members.to_list() # type: ignore
493
448
  if feature_ids:
494
449
  features.extend(
495
450
  feature
@@ -580,9 +535,13 @@ class DataFrameCurator(Curator):
580
535
  # in the DataFrameCatManager, we use the
581
536
  # actual columns of the dataset, not the pandera columns
582
537
  # the pandera columns might have additional optional columns
538
+ if schema.itype == "Composite":
539
+ columns_field = Feature.name
540
+ else:
541
+ columns_field = parse_cat_dtype(schema.itype, is_itype=True)["field"]
583
542
  self._cat_manager = DataFrameCatManager(
584
543
  self._dataset,
585
- columns_field=parse_cat_dtype(schema.itype, is_itype=True)["field"],
544
+ columns_field=columns_field,
586
545
  categoricals=categoricals,
587
546
  index=schema.index,
588
547
  slot=slot,
@@ -601,6 +560,11 @@ class DataFrameCurator(Curator):
601
560
  - Adds missing columns for features
602
561
  - Fills missing values for features with default values
603
562
  """
563
+ if self._artifact is not None:
564
+ raise RuntimeError(
565
+ "Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator."
566
+ )
567
+
604
568
  for feature in self._schema.members:
605
569
  if feature.name not in self._dataset.columns:
606
570
  if feature.default_value is not None or feature.nullable:
@@ -679,25 +643,244 @@ class DataFrameCurator(Curator):
679
643
  if not self._is_validated:
680
644
  self.validate() # raises ValidationError if doesn't validate
681
645
  if self._artifact is None:
682
- self._artifact = Artifact.from_df(
646
+ self._artifact = Artifact.from_dataframe(
683
647
  self._dataset,
684
648
  key=key,
685
649
  description=description,
686
650
  revises=revises,
687
651
  run=run,
688
- format=".csv" if key.endswith(".csv") else None,
652
+ format=".csv" if key is not None and key.endswith(".csv") else None,
689
653
  )
690
- self._artifact.schema = self._schema
691
- self._artifact.save()
654
+
655
+ self._artifact.schema = self._schema
656
+ self._artifact.save()
692
657
  return annotate_artifact( # type: ignore
693
658
  self._artifact,
694
659
  cat_vectors=self.cat._cat_vectors,
695
660
  )
696
661
 
697
662
 
663
+ class DataFrameCurator(SlotsCurator):
664
+ # the example in the docstring is tested in test_curators_quickstart_example
665
+ """Curator for `DataFrame`.
666
+
667
+ Args:
668
+ dataset: The DataFrame-like object to validate & annotate.
669
+ schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
670
+ slot: Indicate the slot in a composite curator for a composite data structure.
671
+
672
+ Examples:
673
+
674
+ For a simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_df`.
675
+
676
+ Here is an example that enforces a minimal set of columns in the dataframe.
677
+
678
+ .. literalinclude:: scripts/curate_dataframe_minimal_errors.py
679
+ :language: python
680
+
681
+ Under-the-hood, this used the following schema.
682
+
683
+ .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
684
+ :language: python
685
+
686
+ Valid features & labels were defined as:
687
+
688
+ .. literalinclude:: scripts/define_mini_immuno_features_labels.py
689
+ :language: python
690
+
691
+ It is also possible to curate the `attrs` slot.
692
+
693
+ .. literalinclude:: scripts/curate_dataframe_attrs.py
694
+ :language: python
695
+ """
696
+
697
+ def __init__(
698
+ self,
699
+ dataset: pd.DataFrame | Artifact,
700
+ schema: Schema,
701
+ slot: str | None = None,
702
+ ) -> None:
703
+ super().__init__(dataset=dataset, schema=schema)
704
+
705
+ # Create atomic curator for features only
706
+ if len(self._schema.features.all()) > 0:
707
+ self._atomic_curator = ComponentCurator(
708
+ dataset=dataset,
709
+ schema=schema,
710
+ slot=slot,
711
+ )
712
+
713
+ # Handle (nested) attrs
714
+ if slot is None and schema.slots:
715
+ for slot_name, slot_schema in schema.slots.items():
716
+ if slot_name.startswith("attrs"):
717
+ path_parts = slot_name.split(":")
718
+ attrs_dict = getattr(self._dataset, "attrs", None)
719
+ if attrs_dict is not None:
720
+ if len(path_parts) == 1:
721
+ data = attrs_dict
722
+ else:
723
+ deeper_keys = path_parts[1:]
724
+ data = _resolve_schema_slot_path(
725
+ attrs_dict, deeper_keys, slot_name, "attrs"
726
+ )
727
+ df = pd.DataFrame([data])
728
+ self._slots[slot_name] = ComponentCurator(
729
+ df, slot_schema, slot=slot_name
730
+ )
731
+ else:
732
+ raise ValueError(
733
+ f"Slot '{slot_name}' is not supported for DataFrameCurator. Must be 'attrs'."
734
+ )
735
+
736
+ @property
737
+ def cat(self) -> DataFrameCatManager:
738
+ """Manage categoricals by updating registries."""
739
+ if hasattr(self, "_atomic_curator"):
740
+ return self._atomic_curator.cat
741
+ raise AttributeError("cat is only available for slots DataFrameCurator")
742
+
743
+ def standardize(self) -> None:
744
+ """Standardize the dataset.
745
+
746
+ - Adds missing columns for features
747
+ - Fills missing values for features with default values
748
+ """
749
+ if hasattr(self, "_atomic_curator"):
750
+ self._atomic_curator.standardize()
751
+ else:
752
+ for slot_curator in self._slots.values():
753
+ slot_curator.standardize()
754
+
755
+ @doc_args(VALIDATE_DOCSTRING)
756
+ def validate(self) -> None:
757
+ """{}."""
758
+ if hasattr(self, "_atomic_curator"):
759
+ self._atomic_curator.validate()
760
+ self._is_validated = self._atomic_curator._is_validated
761
+ if self._schema.itype == "Composite":
762
+ super().validate()
763
+
764
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
765
+ def save_artifact(
766
+ self, *, key=None, description=None, revises=None, run=None
767
+ ) -> Artifact:
768
+ """{}."""
769
+ if not self._is_validated:
770
+ self.validate()
771
+
772
+ if self._slots:
773
+ self._slots["columns"] = self._atomic_curator
774
+ try:
775
+ return super().save_artifact(
776
+ key=key, description=description, revises=revises, run=run
777
+ )
778
+ finally:
779
+ del self._slots["columns"]
780
+ else:
781
+ return self._atomic_curator.save_artifact(
782
+ key=key, description=description, revises=revises, run=run
783
+ )
784
+
785
+
786
+ def _resolve_schema_slot_path(
787
+ target_dict: dict[str, Any], slot_keys: Iterable[str], slot: str, base_path: str
788
+ ) -> Any:
789
+ """Resolve a schema slot path by traversing nested dictionary keys.
790
+
791
+ Args:
792
+ target_dict: Root dictionary to traverse
793
+ slot_keys: Sequence of keys defining the paths to traverse
794
+ slot_name: Schema slot identifier for error context
795
+ base_path: Base path string for error context
796
+
797
+ Returns:
798
+ The value at the resolved path
799
+ """
800
+ current = target_dict
801
+
802
+ for key in slot_keys:
803
+ base_path += f"['{key}']"
804
+ try:
805
+ current = current[key]
806
+ except KeyError:
807
+ available = (
808
+ list(current.keys()) if isinstance(current, dict) else "not a dict"
809
+ )
810
+ raise InvalidArgument(
811
+ f"Schema slot '{slot}' requires keys {base_path} but key '{key}' "
812
+ f"not found. Available keys at this level: {available}"
813
+ ) from None
814
+
815
+ return current
816
+
817
+
818
+ def _handle_dict_slots(
819
+ dataset: ScverseDataStructures, slot: str
820
+ ) -> tuple[pd.DataFrame | None, str | None, str | None]:
821
+ """Handle dict-based slot paths (uns/attrs standalone or of modalities) for all ScverseCurators.
822
+
823
+ Supports two patterns:
824
+ - Direct dict access: "uns", "attrs", "uns:key1:key2", "attrs:key"
825
+ - Modality dict access: "modality:uns"
826
+
827
+ Args:
828
+ dataset: The scverse datastructure object
829
+ slot: The slot path string to parse like 'uns:path:to'.
830
+
831
+ Returns:
832
+ tuple: (dataframe, modality_key, remaining_slot_path)
833
+ - dataframe: Single-row DataFrame containing the resolved data
834
+ - modality_key: Modality identifier if slot targets modality dict, else None
835
+ - remaining_slot_path: The dict attribute and nested keys as string
836
+ """
837
+ path_parts = slot.split(":")
838
+
839
+ # Handle direct dict slots: "uns", "attrs", "uns:key1:key2:..."
840
+ if len(path_parts) >= 1 and path_parts[0] in ["uns", "attrs"]:
841
+ dict_attr = getattr(dataset, path_parts[0], None)
842
+ if dict_attr is not None:
843
+ if len(path_parts) == 1:
844
+ return pd.DataFrame([dict_attr]), None, path_parts[0]
845
+
846
+ deeper_keys = path_parts[1:]
847
+ data = _resolve_schema_slot_path(
848
+ dict_attr, deeper_keys, slot, path_parts[0]
849
+ )
850
+ return pd.DataFrame([data]), None, ":".join(path_parts[1:])
851
+
852
+ # Handle modality dict slots: "modality:uns", "modality:uns:key1:key2"
853
+ elif len(path_parts) >= 2 and path_parts[1] in ["uns", "attrs"]:
854
+ modality, dict_name = path_parts[0], path_parts[1]
855
+ try:
856
+ modality_dataset = dataset[modality]
857
+ dict_attr = getattr(modality_dataset, dict_name, None)
858
+ if dict_attr is not None:
859
+ if len(path_parts) == 2:
860
+ return pd.DataFrame([dict_attr]), modality, dict_name
861
+
862
+ deeper_keys = path_parts[2:]
863
+ data = _resolve_schema_slot_path(
864
+ dict_attr, deeper_keys, slot, f"{modality}.{dict_name}"
865
+ )
866
+ return pd.DataFrame([data]), modality, ":".join(path_parts[1:])
867
+ except (KeyError, AttributeError):
868
+ pass
869
+ else:
870
+ raise InvalidArgument(
871
+ f"Invalid dict slot pattern '{slot}'. Expected formats: "
872
+ f"'uns', 'attrs', 'uns:key', 'attrs:key', 'modality:uns'"
873
+ )
874
+
875
+ return None, None, None
876
+
877
+
878
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
698
879
  class AnnDataCurator(SlotsCurator):
699
880
  """Curator for `AnnData`.
700
881
 
882
+ {}
883
+
701
884
  Args:
702
885
  dataset: The AnnData-like object to validate & annotate.
703
886
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
@@ -722,32 +905,35 @@ class AnnDataCurator(SlotsCurator):
722
905
  raise InvalidArgument("dataset must be AnnData-like.")
723
906
  if schema.otype != "AnnData":
724
907
  raise InvalidArgument("Schema otype must be 'AnnData'.")
725
- self._slots = {
726
- slot: DataFrameCurator(
727
- (
908
+
909
+ for slot, slot_schema in schema.slots.items():
910
+ if slot not in {"var", "var.T", "obs"} and not slot.startswith("uns"):
911
+ raise ValueError(
912
+ f"AnnDataCurator currently only supports the slots 'var', 'var.T', 'obs', and 'uns', not {slot}"
913
+ )
914
+ if slot.startswith("uns"):
915
+ df, _, _ = _handle_dict_slots(self._dataset, slot)
916
+ elif slot in {"obs", "var", "var.T"}:
917
+ df = (
728
918
  getattr(self._dataset, slot.strip(".T")).T
729
919
  if slot == "var.T"
730
920
  or (
731
- # backward compat
732
921
  slot == "var"
733
922
  and schema.slots["var"].itype not in {None, "Feature"}
734
923
  )
735
924
  else getattr(self._dataset, slot)
736
- ),
737
- slot_schema,
738
- slot=slot,
739
- )
740
- for slot, slot_schema in schema.slots.items()
741
- if slot in {"obs", "var", "var.T", "uns"}
742
- }
743
- if "var" in self._slots and schema.slots["var"].itype not in {None, "Feature"}:
744
- logger.warning(
745
- "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
746
- )
747
- self._slots["var"].cat._cat_vectors["var_index"] = self._slots[
748
- "var"
749
- ].cat._cat_vectors.pop("columns")
750
- self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index"
925
+ )
926
+ self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)
927
+
928
+ # Handle var index naming for backward compat
929
+ if slot == "var" and schema.slots["var"].itype not in {None, "Feature"}:
930
+ logger.warning(
931
+ "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
932
+ )
933
+ self._slots["var"].cat._cat_vectors["var_index"] = self._slots[
934
+ "var"
935
+ ].cat._cat_vectors.pop("columns")
936
+ self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index"
751
937
 
752
938
 
753
939
  def _assign_var_fields_categoricals_multimodal(
@@ -757,11 +943,10 @@ def _assign_var_fields_categoricals_multimodal(
757
943
  slot_schema: Schema,
758
944
  var_fields: dict[str, FieldAttr],
759
945
  cat_vectors: dict[str, dict[str, CatVector]],
760
- slots: dict[str, DataFrameCurator],
946
+ slots: dict[str, ComponentCurator],
761
947
  ) -> None:
762
948
  """Assigns var_fields and categoricals for multimodal data curators."""
763
949
  if modality is not None:
764
- # Makes sure that all tables are present
765
950
  var_fields[modality] = None
766
951
  cat_vectors[modality] = {}
767
952
 
@@ -782,15 +967,17 @@ def _assign_var_fields_categoricals_multimodal(
782
967
  cat_vectors[modality] = obs_fields
783
968
 
784
969
 
970
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
785
971
  class MuDataCurator(SlotsCurator):
786
972
  """Curator for `MuData`.
787
973
 
974
+ {}
975
+
788
976
  Args:
789
977
  dataset: The MuData-like object to validate & annotate.
790
978
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
791
979
 
792
980
  Example:
793
-
794
981
  .. literalinclude:: scripts/curate_mudata.py
795
982
  :language: python
796
983
  :caption: curate_mudata.py
@@ -811,12 +998,32 @@ class MuDataCurator(SlotsCurator):
811
998
  raise InvalidArgument("Schema otype must be 'MuData'.")
812
999
 
813
1000
  for slot, slot_schema in schema.slots.items():
814
- if ":" in slot:
815
- modality, modality_slot = slot.split(":")
816
- schema_dataset = self._dataset.__getitem__(modality)
1001
+ # Handle slots: "mdata.uns", "modality:uns"
1002
+ if "uns" in slot:
1003
+ df, modality, modality_slot = _handle_dict_slots(self._dataset, slot)
817
1004
  else:
818
- modality, modality_slot = None, slot
819
- schema_dataset = self._dataset
1005
+ # Handle slots: "modality:obs", "modality:var"
1006
+ parts = slot.split(":")
1007
+ if len(parts) == 2:
1008
+ modality, modality_slot = parts
1009
+ try:
1010
+ schema_dataset = self._dataset[modality]
1011
+ df = getattr(schema_dataset, modality_slot.rstrip(".T"))
1012
+ except KeyError:
1013
+ raise InvalidArgument(
1014
+ f"Modality '{modality}' not found in MuData"
1015
+ ) from None
1016
+ except AttributeError:
1017
+ raise InvalidArgument(
1018
+ f"Attribute '{modality_slot}' not found on modality '{modality}'"
1019
+ ) from None
1020
+ else:
1021
+ # Handle slots: "mdata:obs", "mdata:var" (uns is a dictionary and gets handled above)
1022
+ modality, modality_slot = None, slot
1023
+ schema_dataset = self._dataset
1024
+ df = getattr(schema_dataset, modality_slot.rstrip(".T"))
1025
+
1026
+ # Transpose var if necessary
820
1027
  if modality_slot == "var" and schema.slots[slot].itype not in {
821
1028
  None,
822
1029
  "Feature",
@@ -824,19 +1031,12 @@ class MuDataCurator(SlotsCurator):
824
1031
  logger.warning(
825
1032
  "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
826
1033
  )
827
- self._slots[slot] = DataFrameCurator(
828
- (
829
- getattr(schema_dataset, modality_slot.rstrip(".T")).T
830
- if modality_slot == "var.T"
831
- or (
832
- # backward compat
833
- modality_slot == "var"
834
- and schema.slots[slot].itype not in {None, "Feature"}
835
- )
836
- else getattr(schema_dataset, modality_slot)
837
- ),
838
- slot_schema,
839
- )
1034
+ df = df.T
1035
+ elif modality_slot == "var.T":
1036
+ df = df.T
1037
+
1038
+ self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)
1039
+
840
1040
  _assign_var_fields_categoricals_multimodal(
841
1041
  modality=modality,
842
1042
  slot_type=modality_slot,
@@ -846,18 +1046,21 @@ class MuDataCurator(SlotsCurator):
846
1046
  cat_vectors=self._cat_vectors,
847
1047
  slots=self._slots,
848
1048
  )
1049
+
849
1050
  self._columns_field = self._var_fields
850
1051
 
851
1052
 
1053
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
852
1054
  class SpatialDataCurator(SlotsCurator):
853
1055
  """Curator for `SpatialData`.
854
1056
 
1057
+ {}
1058
+
855
1059
  Args:
856
1060
  dataset: The SpatialData-like object to validate & annotate.
857
1061
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
858
1062
 
859
1063
  Example:
860
-
861
1064
  .. literalinclude:: scripts/curate_spatialdata.py
862
1065
  :language: python
863
1066
  :caption: curate_spatialdata.py
@@ -878,69 +1081,75 @@ class SpatialDataCurator(SlotsCurator):
878
1081
  raise InvalidArgument("Schema otype must be 'SpatialData'.")
879
1082
 
880
1083
  for slot, slot_schema in schema.slots.items():
881
- split_result = slot.split(":")
882
- if (len(split_result) == 2 and split_result[0] == "table") or (
883
- len(split_result) == 3 and split_result[0] == "tables"
884
- ):
885
- if len(split_result) == 2:
886
- table_key, sub_slot = split_result
887
- logger.warning(
888
- f"please prefix slot {slot} with 'tables:' going forward"
889
- )
1084
+ # Handle slots: "sdata:attrs"
1085
+ if slot.startswith("attrs"):
1086
+ df, table_key, table_slot = _handle_dict_slots(self._dataset, slot)
1087
+ else:
1088
+ parts = slot.split(":")
1089
+ # Handle slots: "tables:table_key:obs", "tables:table_key:var"
1090
+ if len(parts) == 3 and parts[0] == "tables":
1091
+ table_key, table_slot = parts[1], parts[2]
1092
+ try:
1093
+ slot_object = self._dataset.tables[table_key]
1094
+ df = getattr(slot_object, table_slot.rstrip(".T"))
1095
+ except KeyError:
1096
+ raise InvalidArgument(
1097
+ f"Table '{table_key}' not found in sdata.tables"
1098
+ ) from None
1099
+ except AttributeError:
1100
+ raise InvalidArgument(
1101
+ f"Attribute '{table_slot}' not found on table '{table_key}'"
1102
+ ) from None
890
1103
  else:
891
- table_key, sub_slot = split_result[1], split_result[2]
892
- slot_object = self._dataset.tables.__getitem__(table_key)
893
- if sub_slot == "var" and schema.slots[slot].itype not in {
894
- None,
895
- "Feature",
896
- }:
897
- logger.warning(
898
- "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
899
- )
900
- data_object = (
901
- getattr(slot_object, sub_slot.rstrip(".T")).T
902
- if sub_slot == "var.T"
903
- or (
904
- # backward compat
905
- sub_slot == "var"
906
- and schema.slots[slot].itype not in {None, "Feature"}
907
- )
908
- else getattr(slot_object, sub_slot)
909
- )
910
- elif len(split_result) == 1 or (
911
- len(split_result) > 1 and split_result[0] == "attrs"
912
- ):
913
- table_key = None
914
- if len(split_result) == 1:
915
- if split_result[0] != "attrs":
1104
+ # Handle legacy single keys for backward compatibility
1105
+ if len(parts) == 1 and parts[0] != "attrs":
916
1106
  logger.warning(
917
1107
  f"please prefix slot {slot} with 'attrs:' going forward"
918
1108
  )
919
- sub_slot = slot
920
- data_object = self._dataset.attrs[slot]
1109
+ try:
1110
+ df = pd.DataFrame([self._dataset.attrs[slot]])
1111
+ table_key = None
1112
+ table_slot = slot
1113
+ except KeyError:
1114
+ raise InvalidArgument(
1115
+ f"Slot '{slot}' not found in sdata.attrs"
1116
+ ) from None
921
1117
  else:
922
- sub_slot = "attrs"
923
- data_object = self._dataset.attrs
924
- elif len(split_result) == 2:
925
- sub_slot = split_result[1]
926
- data_object = self._dataset.attrs[split_result[1]]
927
- data_object = pd.DataFrame([data_object])
928
- self._slots[slot] = DataFrameCurator(data_object, slot_schema, slot)
1118
+ raise InvalidArgument(f"Unrecognized slot format: {slot}")
1119
+
1120
+ # Handle var transposition logic
1121
+ if table_slot == "var" and schema.slots[slot].itype not in {
1122
+ None,
1123
+ "Feature",
1124
+ }:
1125
+ logger.warning(
1126
+ "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
1127
+ )
1128
+ df = df.T
1129
+ elif table_slot == "var.T":
1130
+ df = df.T
1131
+
1132
+ self._slots[slot] = ComponentCurator(df, slot_schema, slot)
1133
+
929
1134
  _assign_var_fields_categoricals_multimodal(
930
1135
  modality=table_key,
931
- slot_type=sub_slot,
1136
+ slot_type=table_slot,
932
1137
  slot=slot,
933
1138
  slot_schema=slot_schema,
934
1139
  var_fields=self._var_fields,
935
1140
  cat_vectors=self._cat_vectors,
936
1141
  slots=self._slots,
937
1142
  )
1143
+
938
1144
  self._columns_field = self._var_fields
939
1145
 
940
1146
 
1147
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
941
1148
  class TiledbsomaExperimentCurator(SlotsCurator):
942
1149
  """Curator for `tiledbsoma.Experiment`.
943
1150
 
1151
+ {}
1152
+
944
1153
  Args:
945
1154
  dataset: The `tiledbsoma.Experiment` object.
946
1155
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
@@ -977,7 +1186,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
977
1186
  .drop("soma_joinid", axis=1, errors="ignore")
978
1187
  )
979
1188
 
980
- self._slots[slot] = DataFrameCurator(
1189
+ self._slots[slot] = ComponentCurator(
981
1190
  (schema_dataset.T if modality_slot == "var.T" else schema_dataset),
982
1191
  slot_schema,
983
1192
  )
@@ -990,7 +1199,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
990
1199
  .to_pandas()
991
1200
  .drop(["soma_joinid", "obs_id"], axis=1, errors="ignore")
992
1201
  )
993
- self._slots[slot] = DataFrameCurator(
1202
+ self._slots[slot] = ComponentCurator(
994
1203
  schema_dataset,
995
1204
  slot_schema,
996
1205
  )
@@ -1040,9 +1249,12 @@ class CatVector:
1040
1249
  self._maximal_set = maximal_set
1041
1250
 
1042
1251
  self._all_filters = {"source": self._source, "organism": self._organism}
1252
+
1043
1253
  if self._subtype_str and "=" in self._subtype_str:
1044
1254
  self._all_filters.update(
1045
- resolve_relation_filters(parse_filter_string(self._subtype_str), self) # type: ignore
1255
+ resolve_relation_filters(
1256
+ parse_filter_string(self._subtype_str), self._field.field.model
1257
+ ) # type: ignore
1046
1258
  )
1047
1259
 
1048
1260
  if hasattr(field.field.model, "_name_field"):
@@ -1241,7 +1453,7 @@ class CatVector:
1241
1453
  type_record = registry.get(name=self._subtype_str)
1242
1454
  if df is not None and registry == Feature:
1243
1455
  nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
1244
- non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
1456
+ non_validated_records = Feature.from_dataframe(df.loc[:, nonval_columns])
1245
1457
  else:
1246
1458
  if (
1247
1459
  self._organism
@@ -1343,7 +1555,7 @@ class CatVector:
1343
1555
  warning_message += "\n for remaining terms:\n"
1344
1556
  warning_message += f" → fix typos, remove non-existent values, or save terms via: {colors.cyan(non_validated_hint_print)}"
1345
1557
  if self._subtype_query_set is not None:
1346
- warning_message += f"\n → a valid label for subtype '{self._subtype_str}' has to be one of {self._subtype_query_set.list('name')}"
1558
+ warning_message += f"\n → a valid label for subtype '{self._subtype_str}' has to be one of {self._subtype_query_set.to_list('name')}"
1347
1559
  logger.info(f'mapping "{self._key}" on {colors.italic(model_field)}')
1348
1560
  logger.warning(warning_message)
1349
1561
  if self._cat_manager is not None:
@@ -1493,6 +1705,30 @@ class DataFrameCatManager:
1493
1705
  """The categorical features."""
1494
1706
  return self._categoricals
1495
1707
 
1708
+ def __repr__(self) -> str:
1709
+ cls_name = colors.green(self.__class__.__name__)
1710
+
1711
+ status_str = (
1712
+ f"{colors.green('validated')}"
1713
+ if self._is_validated
1714
+ else f"{colors.yellow('unvalidated')}"
1715
+ )
1716
+
1717
+ info_parts = []
1718
+
1719
+ cat_count = len(self._categoricals)
1720
+ if cat_count > 0:
1721
+ info_parts.append(f"categorical_features={cat_count}")
1722
+
1723
+ if self._slot:
1724
+ info_parts.append(f"slot: {colors.italic(self._slot)}")
1725
+
1726
+ info_str = ", ".join(info_parts)
1727
+ if info_str:
1728
+ return f"{cls_name}({info_str}, {status_str})"
1729
+ else:
1730
+ return f"{cls_name}({status_str})"
1731
+
1496
1732
  def lookup(self, public: bool = False) -> CatLookup:
1497
1733
  """Lookup categories.
1498
1734
 
@@ -1537,7 +1773,9 @@ class DataFrameCatManager:
1537
1773
  key: The key referencing the column in the DataFrame to standardize.
1538
1774
  """
1539
1775
  if self._artifact is not None:
1540
- raise RuntimeError("can't mutate the dataset when an artifact is passed!")
1776
+ raise RuntimeError(
1777
+ "Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator."
1778
+ )
1541
1779
 
1542
1780
  if key == "all":
1543
1781
  logger.warning(
@@ -1610,7 +1848,7 @@ def get_organism_kwargs(
1610
1848
  def annotate_artifact(
1611
1849
  artifact: Artifact,
1612
1850
  *,
1613
- curator: AnnDataCurator | SlotsCurator | None = None,
1851
+ curator: SlotsCurator | None = None,
1614
1852
  cat_vectors: dict[str, CatVector] | None = None,
1615
1853
  ) -> Artifact:
1616
1854
  from .. import settings
@@ -1643,7 +1881,9 @@ def annotate_artifact(
1643
1881
  )
1644
1882
 
1645
1883
  # annotate with inferred schemas aka feature sets
1646
- if artifact.otype == "DataFrame":
1884
+ if (
1885
+ artifact.otype == "DataFrame" and getattr(curator, "_schema", None) is None
1886
+ ): # Prevent overwriting user-defined schemas that contain slots
1647
1887
  features = cat_vectors["columns"].records
1648
1888
  if features is not None:
1649
1889
  index_feature = artifact.schema.index
@@ -1663,7 +1903,11 @@ def annotate_artifact(
1663
1903
  logger.important(
1664
1904
  f"not annotating with {len(features)} features as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)"
1665
1905
  )
1666
- itype = parse_cat_dtype(artifact.schema.itype, is_itype=True)["field"]
1906
+ itype = (
1907
+ Feature.name
1908
+ if artifact.schema.itype == "Composite"
1909
+ else parse_cat_dtype(artifact.schema.itype, is_itype=True)["field"]
1910
+ )
1667
1911
  feature_set = Schema(itype=itype, n=len(features))
1668
1912
  artifact.feature_sets.add(
1669
1913
  feature_set.save(), through_defaults={"slot": "columns"}
@@ -1698,9 +1942,13 @@ def annotate_artifact(
1698
1942
  logger.important(
1699
1943
  f"not annotating with {len(features)} features for slot {slot} as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)"
1700
1944
  )
1701
- itype = parse_cat_dtype(
1702
- artifact.schema.slots[slot].itype, is_itype=True
1703
- )["field"]
1945
+ itype = (
1946
+ Feature.name
1947
+ if artifact.schema.slots[slot].itype == "Composite"
1948
+ else parse_cat_dtype(
1949
+ artifact.schema.slots[slot].itype, is_itype=True
1950
+ )["field"]
1951
+ )
1704
1952
  feature_set = Schema(itype=itype, n=len(features))
1705
1953
  artifact.feature_sets.add(
1706
1954
  feature_set.save(), through_defaults={"slot": slot}