lamindb 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. lamindb/__init__.py +89 -49
  2. lamindb/_finish.py +17 -15
  3. lamindb/_tracked.py +2 -4
  4. lamindb/_view.py +1 -1
  5. lamindb/base/__init__.py +2 -1
  6. lamindb/base/dtypes.py +76 -0
  7. lamindb/core/_settings.py +45 -2
  8. lamindb/core/storage/_anndata_accessor.py +118 -26
  9. lamindb/core/storage/_backed_access.py +10 -7
  10. lamindb/core/storage/_spatialdata_accessor.py +15 -4
  11. lamindb/core/storage/_zarr.py +3 -0
  12. lamindb/curators/_legacy.py +16 -3
  13. lamindb/curators/core.py +449 -193
  14. lamindb/errors.py +6 -0
  15. lamindb/examples/cellxgene/__init__.py +8 -3
  16. lamindb/examples/cellxgene/_cellxgene.py +127 -13
  17. lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
  18. lamindb/examples/croissant/__init__.py +32 -6
  19. lamindb/examples/datasets/__init__.py +2 -2
  20. lamindb/examples/datasets/_core.py +9 -2
  21. lamindb/examples/datasets/_small.py +66 -22
  22. lamindb/examples/fixtures/sheets.py +8 -2
  23. lamindb/integrations/_croissant.py +34 -11
  24. lamindb/migrations/0118_alter_recordproject_value_projectrecord.py +99 -0
  25. lamindb/migrations/0119_rename_records_project_linked_in_records.py +26 -0
  26. lamindb/migrations/{0117_squashed.py → 0119_squashed.py} +92 -5
  27. lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
  28. lamindb/migrations/0121_recorduser.py +60 -0
  29. lamindb/models/__init__.py +4 -1
  30. lamindb/models/_describe.py +2 -2
  31. lamindb/models/_feature_manager.py +131 -71
  32. lamindb/models/_from_values.py +2 -2
  33. lamindb/models/_is_versioned.py +4 -4
  34. lamindb/models/_label_manager.py +4 -4
  35. lamindb/models/artifact.py +357 -192
  36. lamindb/models/artifact_set.py +45 -1
  37. lamindb/models/can_curate.py +1 -2
  38. lamindb/models/collection.py +3 -34
  39. lamindb/models/feature.py +111 -7
  40. lamindb/models/has_parents.py +11 -11
  41. lamindb/models/project.py +42 -2
  42. lamindb/models/query_manager.py +16 -7
  43. lamindb/models/query_set.py +191 -78
  44. lamindb/models/record.py +30 -5
  45. lamindb/models/run.py +10 -33
  46. lamindb/models/save.py +6 -8
  47. lamindb/models/schema.py +54 -26
  48. lamindb/models/sqlrecord.py +152 -40
  49. lamindb/models/storage.py +59 -14
  50. lamindb/models/transform.py +17 -17
  51. lamindb/models/ulabel.py +6 -1
  52. {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/METADATA +11 -16
  53. {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/RECORD +55 -50
  54. {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/LICENSE +0 -0
  55. {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/WHEEL +0 -0
lamindb/curators/core.py CHANGED
@@ -5,6 +5,7 @@
5
5
 
6
6
  Curator
7
7
  SlotsCurator
8
+ ComponentCurator
8
9
  CatVector
9
10
  CatLookup
10
11
  DataFrameCatManager
@@ -15,7 +16,6 @@ from __future__ import annotations
15
16
 
16
17
  import copy
17
18
  import re
18
- from collections.abc import Iterable
19
19
  from typing import TYPE_CHECKING, Any, Callable
20
20
 
21
21
  import lamindb_setup as ln_setup
@@ -24,7 +24,9 @@ import pandas as pd
24
24
  import pandera.pandas as pandera
25
25
  from lamin_utils import colors, logger
26
26
  from lamindb_setup.core._docs import doc_args
27
+ from lamindb_setup.core.upath import LocalPathClasses
27
28
 
29
+ from lamindb.base.dtypes import check_dtype
28
30
  from lamindb.base.types import FieldAttr # noqa
29
31
  from lamindb.models import (
30
32
  Artifact,
@@ -48,6 +50,7 @@ from lamindb.models.feature import (
48
50
  from ..errors import InvalidArgument, ValidationError
49
51
 
50
52
  if TYPE_CHECKING:
53
+ from collections.abc import Iterable
51
54
  from typing import Any
52
55
 
53
56
  from anndata import AnnData
@@ -145,6 +148,7 @@ CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries."""
145
148
 
146
149
  SLOTS_DOCSTRING = """Access sub curators by slot."""
147
150
 
151
+ SLOTS_DETAILS_DOCSTRING = """Uses **slots** to specify which component contains which schema. Slots are keys that identify where features are stored within composite data structures."""
148
152
 
149
153
  VALIDATE_DOCSTRING = """Validate dataset against Schema.
150
154
 
@@ -197,7 +201,21 @@ class Curator:
197
201
  "MuData",
198
202
  "SpatialData",
199
203
  }:
200
- self._dataset = self._dataset.load(is_run_input=False)
204
+ # Open remote AnnData Artifacts
205
+ if not isinstance(self._artifact.path, LocalPathClasses):
206
+ if self._artifact.otype in {
207
+ "AnnData",
208
+ }:
209
+ try:
210
+ self._dataset = self._dataset.open(mode="r")
211
+ # open can raise various errors. Fall back to loading into memory if open fails
212
+ except Exception as e:
213
+ logger.warning(
214
+ f"Unable to open remote AnnData Artifact: {e}. Falling back to loading into memory."
215
+ )
216
+ self._dataset = self._dataset.load(is_run_input=False)
217
+ else:
218
+ self._dataset = self._dataset.load(is_run_input=False)
201
219
  self._schema: Schema | None = schema
202
220
  self._is_validated: bool = False
203
221
 
@@ -284,9 +302,12 @@ class Curator:
284
302
  )
285
303
 
286
304
 
305
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
287
306
  class SlotsCurator(Curator):
288
307
  """Curator for a dataset with slots.
289
308
 
309
+ {}
310
+
290
311
  Args:
291
312
  dataset: The dataset to validate & annotate.
292
313
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
@@ -298,7 +319,7 @@ class SlotsCurator(Curator):
298
319
  schema: Schema,
299
320
  ) -> None:
300
321
  super().__init__(dataset=dataset, schema=schema)
301
- self._slots: dict[str, DataFrameCurator] = {}
322
+ self._slots: dict[str, ComponentCurator] = {}
302
323
 
303
324
  # used for multimodal data structures (not AnnData)
304
325
  # in form of {table/modality_key: var_field}
@@ -308,7 +329,7 @@ class SlotsCurator(Curator):
308
329
 
309
330
  @property
310
331
  @doc_args(SLOTS_DOCSTRING)
311
- def slots(self) -> dict[str, DataFrameCurator]:
332
+ def slots(self) -> dict[str, ComponentCurator]:
312
333
  """{}""" # noqa: D415
313
334
  return self._slots
314
335
 
@@ -336,6 +357,10 @@ class SlotsCurator(Curator):
336
357
 
337
358
  if self._artifact is None:
338
359
  type_mapping = [
360
+ (
361
+ lambda dataset: isinstance(dataset, pd.DataFrame),
362
+ Artifact.from_dataframe,
363
+ ),
339
364
  (
340
365
  lambda dataset: data_is_scversedatastructure(dataset, "AnnData"),
341
366
  Artifact.from_anndata,
@@ -364,12 +389,13 @@ class SlotsCurator(Curator):
364
389
  )
365
390
  break
366
391
 
367
- self._artifact.schema = self._schema
368
- self._artifact.save()
369
392
  cat_vectors = {}
370
393
  for curator in self._slots.values():
371
394
  for key, cat_vector in curator.cat._cat_vectors.items():
372
395
  cat_vectors[key] = cat_vector
396
+
397
+ self._artifact.schema = self._schema
398
+ self._artifact.save()
373
399
  return annotate_artifact( # type: ignore
374
400
  self._artifact,
375
401
  curator=self,
@@ -377,92 +403,21 @@ class SlotsCurator(Curator):
377
403
  )
378
404
 
379
405
 
380
- def is_list_of_type(value, expected_type):
381
- """Helper function to check if a value is either of expected_type or a list of that type, or a mix of both in a nested structure."""
382
- if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
383
- # handle nested lists recursively
384
- return all(is_list_of_type(item, expected_type) for item in value)
385
- return isinstance(value, expected_type)
386
-
387
-
388
- def check_dtype(expected_type) -> Callable:
389
- """Creates a check function for Pandera that validates a column's dtype.
390
-
391
- Supports both standard dtype checking and mixed list/single values for the same type.
392
- For example, a column with expected_type 'float' would also accept a mix of float values and lists of floats.
393
-
394
- Args:
395
- expected_type: String identifier for the expected type ('int', 'float', 'num', 'str')
396
-
397
- Returns:
398
- A function that checks if a series has the expected dtype or contains mixed types
399
- """
400
-
401
- def check_function(series):
402
- # first check if the series is entirely of the expected dtype (fast path)
403
- if expected_type == "int" and pd.api.types.is_integer_dtype(series.dtype):
404
- return True
405
- elif expected_type == "float" and pd.api.types.is_float_dtype(series.dtype):
406
- return True
407
- elif expected_type == "num" and pd.api.types.is_numeric_dtype(series.dtype):
408
- return True
409
- elif expected_type == "str" and pd.api.types.is_string_dtype(series.dtype):
410
- return True
411
- elif expected_type == "path" and pd.api.types.is_string_dtype(series.dtype):
412
- return True
413
-
414
- # if we're here, it might be a mixed column with object dtype
415
- # need to check each value individually
416
- if series.dtype == "object" and expected_type.startswith("list"):
417
- expected_type_member = expected_type.replace("list[", "").removesuffix("]")
418
- if expected_type_member == "int":
419
- return series.apply(lambda x: is_list_of_type(x, int)).all()
420
- elif expected_type_member == "float":
421
- return series.apply(lambda x: is_list_of_type(x, float)).all()
422
- elif expected_type_member == "num":
423
- # for numeric, accept either int or float
424
- return series.apply(lambda x: is_list_of_type(x, (int, float))).all()
425
- elif (
426
- expected_type_member == "str"
427
- or expected_type_member == "path"
428
- or expected_type_member.startswith("cat[")
429
- ):
430
- return series.apply(lambda x: is_list_of_type(x, str)).all()
431
-
432
- # if we get here, the validation failed
433
- return False
434
-
435
- return check_function
436
-
437
-
438
- # this is also currently used as DictCurator
439
- class DataFrameCurator(Curator):
440
- # the example in the docstring is tested in test_curators_quickstart_example
406
+ # This is also currently used as DictCurator by flattening dictionaries into wide DataFrames.
407
+ # Such an approach was never intended and there is room for a DictCurator in the future.
408
+ # For more context, read https://laminlabs.slack.com/archives/C07DB677JF6/p1753994077716099 and
409
+ # https://www.notion.so/laminlabs/Add-a-DictCurator-2422aeaa55e180b9a513f91d13970836
410
+ class ComponentCurator(Curator):
441
411
  """Curator for `DataFrame`.
442
412
 
413
+ Provides all key functionality to validate Pandas DataFrames.
414
+ This class is not user facing unlike :class:`~lamindb.curators.DataFrameCurator` which extends this
415
+ class with functionality to validate the `attrs` slot.
416
+
443
417
  Args:
444
418
  dataset: The DataFrame-like object to validate & annotate.
445
419
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
446
420
  slot: Indicate the slot in a composite curator for a composite data structure.
447
-
448
- Example:
449
-
450
- For simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_df`.
451
-
452
- Here is an example that enforces a minimal set of columns in the dataframe.
453
-
454
- .. literalinclude:: scripts/curate_dataframe_minimal_errors.py
455
- :language: python
456
-
457
- Under-the-hood, this used the following schema.
458
-
459
- .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
460
- :language: python
461
-
462
- Valid features & labels were defined as:
463
-
464
- .. literalinclude:: scripts/define_mini_immuno_features_labels.py
465
- :language: python
466
421
  """
467
422
 
468
423
  def __init__(
@@ -478,18 +433,18 @@ class DataFrameCurator(Curator):
478
433
  feature_ids: set[int] = set()
479
434
 
480
435
  if schema.flexible:
481
- features += Feature.filter(name__in=self._dataset.keys()).list()
436
+ features += Feature.filter(name__in=self._dataset.keys()).to_list()
482
437
  feature_ids = {feature.id for feature in features}
483
438
 
484
439
  if schema.n > 0:
485
440
  if schema._index_feature_uid is not None:
486
441
  schema_features = [
487
442
  feature
488
- for feature in schema.members.list()
443
+ for feature in schema.members.to_list()
489
444
  if feature.uid != schema._index_feature_uid # type: ignore
490
445
  ]
491
446
  else:
492
- schema_features = schema.members.list() # type: ignore
447
+ schema_features = schema.members.to_list() # type: ignore
493
448
  if feature_ids:
494
449
  features.extend(
495
450
  feature
@@ -580,9 +535,13 @@ class DataFrameCurator(Curator):
580
535
  # in the DataFrameCatManager, we use the
581
536
  # actual columns of the dataset, not the pandera columns
582
537
  # the pandera columns might have additional optional columns
538
+ if schema.itype == "Composite":
539
+ columns_field = Feature.name
540
+ else:
541
+ columns_field = parse_cat_dtype(schema.itype, is_itype=True)["field"]
583
542
  self._cat_manager = DataFrameCatManager(
584
543
  self._dataset,
585
- columns_field=parse_cat_dtype(schema.itype, is_itype=True)["field"],
544
+ columns_field=columns_field,
586
545
  categoricals=categoricals,
587
546
  index=schema.index,
588
547
  slot=slot,
@@ -601,6 +560,11 @@ class DataFrameCurator(Curator):
601
560
  - Adds missing columns for features
602
561
  - Fills missing values for features with default values
603
562
  """
563
+ if self._artifact is not None:
564
+ raise RuntimeError(
565
+ "Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator."
566
+ )
567
+
604
568
  for feature in self._schema.members:
605
569
  if feature.name not in self._dataset.columns:
606
570
  if feature.default_value is not None or feature.nullable:
@@ -679,35 +643,262 @@ class DataFrameCurator(Curator):
679
643
  if not self._is_validated:
680
644
  self.validate() # raises ValidationError if doesn't validate
681
645
  if self._artifact is None:
682
- self._artifact = Artifact.from_df(
646
+ self._artifact = Artifact.from_dataframe(
683
647
  self._dataset,
684
648
  key=key,
685
649
  description=description,
686
650
  revises=revises,
687
651
  run=run,
688
- format=".csv" if key.endswith(".csv") else None,
652
+ format=".csv" if key is not None and key.endswith(".csv") else None,
689
653
  )
690
- self._artifact.schema = self._schema
691
- self._artifact.save()
654
+
655
+ self._artifact.schema = self._schema
656
+ self._artifact.save()
692
657
  return annotate_artifact( # type: ignore
693
658
  self._artifact,
694
659
  cat_vectors=self.cat._cat_vectors,
695
660
  )
696
661
 
697
662
 
663
class DataFrameCurator(SlotsCurator):
    # the example in the docstring is tested in test_curators_quickstart_example
    """Curator for `DataFrame`.

    The columns of the dataframe are validated by an internal column curator, which is
    only created if the schema has features. If the schema defines `attrs` slots,
    the (nested) `attrs` dictionary of the dataframe is validated by one slot curator per slot.

    Args:
        dataset: The DataFrame-like object to validate & annotate.
        schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
        slot: Indicate the slot in a composite curator for a composite data structure.

    Examples:

        For a simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_dataframe`.

        Here is an example that enforces a minimal set of columns in the dataframe.

        .. literalinclude:: scripts/curate_dataframe_minimal_errors.py
            :language: python

        Under-the-hood, this used the following schema.

        .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
            :language: python

        Valid features & labels were defined as:

        .. literalinclude:: scripts/define_mini_immuno_features_labels.py
            :language: python

        It is also possible to curate the `attrs` slot.

        .. literalinclude:: scripts/curate_dataframe_attrs.py
            :language: python
    """

    def __init__(
        self,
        dataset: pd.DataFrame | Artifact,
        schema: Schema,
        slot: str | None = None,
    ) -> None:
        super().__init__(dataset=dataset, schema=schema)

        # Create atomic curator for features only
        # (the attribute stays unset otherwise; all other methods must guard for that)
        if len(self._schema.features.all()) > 0:
            self._atomic_curator = ComponentCurator(
                dataset=dataset,
                schema=schema,
                slot=slot,
            )

        # Handle (nested) attrs
        # NOTE(review): any slot name that merely starts with "attrs" (e.g. "attrs_x")
        # is accepted here -- confirm whether an exact "attrs" / "attrs:" prefix is intended
        if slot is None and schema.slots:
            for slot_name, slot_schema in schema.slots.items():
                if slot_name.startswith("attrs"):
                    path_parts = slot_name.split(":")
                    attrs_dict = getattr(self._dataset, "attrs", None)
                    if attrs_dict is not None:
                        if len(path_parts) == 1:
                            data = attrs_dict
                        else:
                            deeper_keys = path_parts[1:]
                            data = _resolve_schema_slot_path(
                                attrs_dict, deeper_keys, slot_name, "attrs"
                            )
                        # the dictionary is validated as a single-row, wide dataframe
                        df = pd.DataFrame([data])
                        self._slots[slot_name] = ComponentCurator(
                            df, slot_schema, slot=slot_name
                        )
                else:
                    raise ValueError(
                        f"Slot '{slot_name}' is not supported for DataFrameCurator. Must be 'attrs'."
                    )

    @property
    def cat(self) -> DataFrameCatManager:
        """Manage categoricals by updating registries."""
        if hasattr(self, "_atomic_curator"):
            return self._atomic_curator.cat
        raise AttributeError("cat is only available for slots DataFrameCurator")

    def standardize(self) -> None:
        """Standardize the dataset.

        - Adds missing columns for features
        - Fills missing values for features with default values
        """
        if hasattr(self, "_atomic_curator"):
            self._atomic_curator.standardize()
        else:
            for slot_curator in self._slots.values():
                slot_curator.standardize()

    @doc_args(VALIDATE_DOCSTRING)
    def validate(self) -> None:
        """{}."""
        if hasattr(self, "_atomic_curator"):
            self._atomic_curator.validate()
            self._is_validated = self._atomic_curator._is_validated
        if self._schema.itype == "Composite":
            super().validate()

    @doc_args(SAVE_ARTIFACT_DOCSTRING)
    def save_artifact(
        self, *, key=None, description=None, revises=None, run=None
    ) -> Artifact:
        """{}."""
        if not self._is_validated:
            self.validate()

        if not self._slots:
            # no attrs slots: the column curator alone creates & annotates the artifact
            return self._atomic_curator.save_artifact(
                key=key, description=description, revises=revises, run=run
            )

        # the column curator only exists if the schema has features (see __init__);
        # an attrs-only schema must not fail with an AttributeError here
        atomic_curator = getattr(self, "_atomic_curator", None)
        if atomic_curator is None:
            return super().save_artifact(
                key=key, description=description, revises=revises, run=run
            )

        # temporarily expose the column curator as a slot so that the slots-based
        # save also collects its categoricals; always undo this afterwards
        self._slots["columns"] = atomic_curator
        try:
            return super().save_artifact(
                key=key, description=description, revises=revises, run=run
            )
        finally:
            del self._slots["columns"]
784
+
785
+
786
+ def _resolve_schema_slot_path(
787
+ target_dict: dict[str, Any], slot_keys: Iterable[str], slot: str, base_path: str
788
+ ) -> Any:
789
+ """Resolve a schema slot path by traversing nested dictionary keys.
790
+
791
+ Args:
792
+ target_dict: Root dictionary to traverse
793
+ slot_keys: Sequence of keys defining the paths to traverse
794
+ slot_name: Schema slot identifier for error context
795
+ base_path: Base path string for error context
796
+
797
+ Returns:
798
+ The value at the resolved path
799
+ """
800
+ current = target_dict
801
+
802
+ for key in slot_keys:
803
+ base_path += f"['{key}']"
804
+ try:
805
+ current = current[key]
806
+ except KeyError:
807
+ available = (
808
+ list(current.keys()) if isinstance(current, dict) else "not a dict"
809
+ )
810
+ raise InvalidArgument(
811
+ f"Schema slot '{slot}' requires keys {base_path} but key '{key}' "
812
+ f"not found. Available keys at this level: {available}"
813
+ ) from None
814
+
815
+ return current
816
+
817
+
818
+ def _handle_dict_slots(
819
+ dataset: ScverseDataStructures, slot: str
820
+ ) -> tuple[pd.DataFrame | None, str | None, str | None]:
821
+ """Handle dict-based slot paths (uns/attrs standalone or of modalities) for all ScverseCurators.
822
+
823
+ Supports two patterns:
824
+ - Direct dict access: "uns", "attrs", "uns:key1:key2", "attrs:key"
825
+ - Modality dict access: "modality:uns"
826
+
827
+ Args:
828
+ dataset: The scverse datastructure object
829
+ slot: The slot path string to parse like 'uns:path:to'.
830
+
831
+ Returns:
832
+ tuple: (dataframe, modality_key, remaining_slot_path)
833
+ - dataframe: Single-row DataFrame containing the resolved data
834
+ - modality_key: Modality identifier if slot targets modality dict, else None
835
+ - remaining_slot_path: The dict attribute and nested keys as string
836
+ """
837
+ path_parts = slot.split(":")
838
+
839
+ # Handle direct dict slots: "uns", "attrs", "uns:key1:key2:..."
840
+ if len(path_parts) >= 1 and path_parts[0] in ["uns", "attrs"]:
841
+ dict_attr = getattr(dataset, path_parts[0], None)
842
+ if dict_attr is not None:
843
+ if len(path_parts) == 1:
844
+ return pd.DataFrame([dict_attr]), None, path_parts[0]
845
+
846
+ deeper_keys = path_parts[1:]
847
+ data = _resolve_schema_slot_path(
848
+ dict_attr, deeper_keys, slot, path_parts[0]
849
+ )
850
+ return pd.DataFrame([data]), None, ":".join(path_parts[1:])
851
+
852
+ # Handle modality dict slots: "modality:uns", "modality:uns:key1:key2"
853
+ elif len(path_parts) >= 2 and path_parts[1] in ["uns", "attrs"]:
854
+ modality, dict_name = path_parts[0], path_parts[1]
855
+ try:
856
+ modality_dataset = dataset[modality]
857
+ dict_attr = getattr(modality_dataset, dict_name, None)
858
+ if dict_attr is not None:
859
+ if len(path_parts) == 2:
860
+ return pd.DataFrame([dict_attr]), modality, dict_name
861
+
862
+ deeper_keys = path_parts[2:]
863
+ data = _resolve_schema_slot_path(
864
+ dict_attr, deeper_keys, slot, f"{modality}.{dict_name}"
865
+ )
866
+ return pd.DataFrame([data]), modality, ":".join(path_parts[1:])
867
+ except (KeyError, AttributeError):
868
+ pass
869
+ else:
870
+ raise InvalidArgument(
871
+ f"Invalid dict slot pattern '{slot}'. Expected formats: "
872
+ f"'uns', 'attrs', 'uns:key', 'attrs:key', 'modality:uns'"
873
+ )
874
+
875
+ return None, None, None
876
+
877
+
878
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
698
879
  class AnnDataCurator(SlotsCurator):
699
880
  """Curator for `AnnData`.
700
881
 
882
+ {}
883
+
701
884
  Args:
702
885
  dataset: The AnnData-like object to validate & annotate.
703
886
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
704
887
 
705
- Example:
888
+ Examples:
889
+
890
+ Curate Ensembl gene IDs and valid features in obs:
706
891
 
707
892
  .. literalinclude:: scripts/curate_anndata_flexible.py
708
893
  :language: python
709
894
  :caption: curate_anndata_flexible.py
710
895
 
896
+ Curate `uns` dictionary:
897
+
898
+ .. literalinclude:: scripts/curate_anndata_uns.py
899
+ :language: python
900
+ :caption: curate_anndata_uns.py
901
+
711
902
  See Also:
712
903
  :meth:`~lamindb.Artifact.from_anndata`.
713
904
  """
@@ -720,34 +911,37 @@ class AnnDataCurator(SlotsCurator):
720
911
  super().__init__(dataset=dataset, schema=schema)
721
912
  if not data_is_scversedatastructure(self._dataset, "AnnData"):
722
913
  raise InvalidArgument("dataset must be AnnData-like.")
723
- if schema.otype != "AnnData":
914
+ if schema.otype and schema.otype != "AnnData":
724
915
  raise InvalidArgument("Schema otype must be 'AnnData'.")
725
- self._slots = {
726
- slot: DataFrameCurator(
727
- (
916
+
917
+ for slot, slot_schema in schema.slots.items():
918
+ if slot not in {"var", "var.T", "obs"} and not slot.startswith("uns"):
919
+ raise ValueError(
920
+ f"AnnDataCurator currently only supports the slots 'var', 'var.T', 'obs', and 'uns', not {slot}"
921
+ )
922
+ if slot.startswith("uns"):
923
+ df, _, _ = _handle_dict_slots(self._dataset, slot)
924
+ elif slot in {"obs", "var", "var.T"}:
925
+ df = (
728
926
  getattr(self._dataset, slot.strip(".T")).T
729
927
  if slot == "var.T"
730
928
  or (
731
- # backward compat
732
929
  slot == "var"
733
930
  and schema.slots["var"].itype not in {None, "Feature"}
734
931
  )
735
932
  else getattr(self._dataset, slot)
736
- ),
737
- slot_schema,
738
- slot=slot,
739
- )
740
- for slot, slot_schema in schema.slots.items()
741
- if slot in {"obs", "var", "var.T", "uns"}
742
- }
743
- if "var" in self._slots and schema.slots["var"].itype not in {None, "Feature"}:
744
- logger.warning(
745
- "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
746
- )
747
- self._slots["var"].cat._cat_vectors["var_index"] = self._slots[
748
- "var"
749
- ].cat._cat_vectors.pop("columns")
750
- self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index"
933
+ )
934
+ self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)
935
+
936
+ # Handle var index naming for backward compat
937
+ if slot == "var" and schema.slots["var"].itype not in {None, "Feature"}:
938
+ logger.warning(
939
+ "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
940
+ )
941
+ self._slots["var"].cat._cat_vectors["var_index"] = self._slots[
942
+ "var"
943
+ ].cat._cat_vectors.pop("columns")
944
+ self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index"
751
945
 
752
946
 
753
947
  def _assign_var_fields_categoricals_multimodal(
@@ -757,11 +951,10 @@ def _assign_var_fields_categoricals_multimodal(
757
951
  slot_schema: Schema,
758
952
  var_fields: dict[str, FieldAttr],
759
953
  cat_vectors: dict[str, dict[str, CatVector]],
760
- slots: dict[str, DataFrameCurator],
954
+ slots: dict[str, ComponentCurator],
761
955
  ) -> None:
762
956
  """Assigns var_fields and categoricals for multimodal data curators."""
763
957
  if modality is not None:
764
- # Makes sure that all tables are present
765
958
  var_fields[modality] = None
766
959
  cat_vectors[modality] = {}
767
960
 
@@ -782,15 +975,17 @@ def _assign_var_fields_categoricals_multimodal(
782
975
  cat_vectors[modality] = obs_fields
783
976
 
784
977
 
978
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
785
979
  class MuDataCurator(SlotsCurator):
786
980
  """Curator for `MuData`.
787
981
 
982
+ {}
983
+
788
984
  Args:
789
985
  dataset: The MuData-like object to validate & annotate.
790
986
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
791
987
 
792
988
  Example:
793
-
794
989
  .. literalinclude:: scripts/curate_mudata.py
795
990
  :language: python
796
991
  :caption: curate_mudata.py
@@ -811,12 +1006,32 @@ class MuDataCurator(SlotsCurator):
811
1006
  raise InvalidArgument("Schema otype must be 'MuData'.")
812
1007
 
813
1008
  for slot, slot_schema in schema.slots.items():
814
- if ":" in slot:
815
- modality, modality_slot = slot.split(":")
816
- schema_dataset = self._dataset.__getitem__(modality)
1009
+ # Handle slots: "mdata.uns", "modality:uns"
1010
+ if "uns" in slot:
1011
+ df, modality, modality_slot = _handle_dict_slots(self._dataset, slot)
817
1012
  else:
818
- modality, modality_slot = None, slot
819
- schema_dataset = self._dataset
1013
+ # Handle slots: "modality:obs", "modality:var"
1014
+ parts = slot.split(":")
1015
+ if len(parts) == 2:
1016
+ modality, modality_slot = parts
1017
+ try:
1018
+ schema_dataset = self._dataset[modality]
1019
+ df = getattr(schema_dataset, modality_slot.rstrip(".T"))
1020
+ except KeyError:
1021
+ raise InvalidArgument(
1022
+ f"Modality '{modality}' not found in MuData"
1023
+ ) from None
1024
+ except AttributeError:
1025
+ raise InvalidArgument(
1026
+ f"Attribute '{modality_slot}' not found on modality '{modality}'"
1027
+ ) from None
1028
+ else:
1029
+ # Handle slots: "mdata:obs", "mdata:var" (uns is a dictionary and gets handled above)
1030
+ modality, modality_slot = None, slot
1031
+ schema_dataset = self._dataset
1032
+ df = getattr(schema_dataset, modality_slot.rstrip(".T"))
1033
+
1034
+ # Transpose var if necessary
820
1035
  if modality_slot == "var" and schema.slots[slot].itype not in {
821
1036
  None,
822
1037
  "Feature",
@@ -824,19 +1039,12 @@ class MuDataCurator(SlotsCurator):
824
1039
  logger.warning(
825
1040
  "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
826
1041
  )
827
- self._slots[slot] = DataFrameCurator(
828
- (
829
- getattr(schema_dataset, modality_slot.rstrip(".T")).T
830
- if modality_slot == "var.T"
831
- or (
832
- # backward compat
833
- modality_slot == "var"
834
- and schema.slots[slot].itype not in {None, "Feature"}
835
- )
836
- else getattr(schema_dataset, modality_slot)
837
- ),
838
- slot_schema,
839
- )
1042
+ df = df.T
1043
+ elif modality_slot == "var.T":
1044
+ df = df.T
1045
+
1046
+ self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)
1047
+
840
1048
  _assign_var_fields_categoricals_multimodal(
841
1049
  modality=modality,
842
1050
  slot_type=modality_slot,
@@ -846,18 +1054,21 @@ class MuDataCurator(SlotsCurator):
846
1054
  cat_vectors=self._cat_vectors,
847
1055
  slots=self._slots,
848
1056
  )
1057
+
849
1058
  self._columns_field = self._var_fields
850
1059
 
851
1060
 
1061
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
852
1062
  class SpatialDataCurator(SlotsCurator):
853
1063
  """Curator for `SpatialData`.
854
1064
 
1065
+ {}
1066
+
855
1067
  Args:
856
1068
  dataset: The SpatialData-like object to validate & annotate.
857
1069
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
858
1070
 
859
1071
  Example:
860
-
861
1072
  .. literalinclude:: scripts/curate_spatialdata.py
862
1073
  :language: python
863
1074
  :caption: curate_spatialdata.py
@@ -878,69 +1089,75 @@ class SpatialDataCurator(SlotsCurator):
878
1089
  raise InvalidArgument("Schema otype must be 'SpatialData'.")
879
1090
 
880
1091
  for slot, slot_schema in schema.slots.items():
881
- split_result = slot.split(":")
882
- if (len(split_result) == 2 and split_result[0] == "table") or (
883
- len(split_result) == 3 and split_result[0] == "tables"
884
- ):
885
- if len(split_result) == 2:
886
- table_key, sub_slot = split_result
887
- logger.warning(
888
- f"please prefix slot {slot} with 'tables:' going forward"
889
- )
1092
+ # Handle slots: "sdata:attrs"
1093
+ if slot.startswith("attrs"):
1094
+ df, table_key, table_slot = _handle_dict_slots(self._dataset, slot)
1095
+ else:
1096
+ parts = slot.split(":")
1097
+ # Handle slots: "tables:table_key:obs", "tables:table_key:var"
1098
+ if len(parts) == 3 and parts[0] == "tables":
1099
+ table_key, table_slot = parts[1], parts[2]
1100
+ try:
1101
+ slot_object = self._dataset.tables[table_key]
1102
+ df = getattr(slot_object, table_slot.rstrip(".T"))
1103
+ except KeyError:
1104
+ raise InvalidArgument(
1105
+ f"Table '{table_key}' not found in sdata.tables"
1106
+ ) from None
1107
+ except AttributeError:
1108
+ raise InvalidArgument(
1109
+ f"Attribute '{table_slot}' not found on table '{table_key}'"
1110
+ ) from None
890
1111
  else:
891
- table_key, sub_slot = split_result[1], split_result[2]
892
- slot_object = self._dataset.tables.__getitem__(table_key)
893
- if sub_slot == "var" and schema.slots[slot].itype not in {
894
- None,
895
- "Feature",
896
- }:
897
- logger.warning(
898
- "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
899
- )
900
- data_object = (
901
- getattr(slot_object, sub_slot.rstrip(".T")).T
902
- if sub_slot == "var.T"
903
- or (
904
- # backward compat
905
- sub_slot == "var"
906
- and schema.slots[slot].itype not in {None, "Feature"}
907
- )
908
- else getattr(slot_object, sub_slot)
909
- )
910
- elif len(split_result) == 1 or (
911
- len(split_result) > 1 and split_result[0] == "attrs"
912
- ):
913
- table_key = None
914
- if len(split_result) == 1:
915
- if split_result[0] != "attrs":
1112
+ # Handle legacy single keys for backward compatibility
1113
+ if len(parts) == 1 and parts[0] != "attrs":
916
1114
  logger.warning(
917
1115
  f"please prefix slot {slot} with 'attrs:' going forward"
918
1116
  )
919
- sub_slot = slot
920
- data_object = self._dataset.attrs[slot]
1117
+ try:
1118
+ df = pd.DataFrame([self._dataset.attrs[slot]])
1119
+ table_key = None
1120
+ table_slot = slot
1121
+ except KeyError:
1122
+ raise InvalidArgument(
1123
+ f"Slot '{slot}' not found in sdata.attrs"
1124
+ ) from None
921
1125
  else:
922
- sub_slot = "attrs"
923
- data_object = self._dataset.attrs
924
- elif len(split_result) == 2:
925
- sub_slot = split_result[1]
926
- data_object = self._dataset.attrs[split_result[1]]
927
- data_object = pd.DataFrame([data_object])
928
- self._slots[slot] = DataFrameCurator(data_object, slot_schema, slot)
1126
+ raise InvalidArgument(f"Unrecognized slot format: {slot}")
1127
+
1128
+ # Handle var transposition logic
1129
+ if table_slot == "var" and schema.slots[slot].itype not in {
1130
+ None,
1131
+ "Feature",
1132
+ }:
1133
+ logger.warning(
1134
+ "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
1135
+ )
1136
+ df = df.T
1137
+ elif table_slot == "var.T":
1138
+ df = df.T
1139
+
1140
+ self._slots[slot] = ComponentCurator(df, slot_schema, slot)
1141
+
929
1142
  _assign_var_fields_categoricals_multimodal(
930
1143
  modality=table_key,
931
- slot_type=sub_slot,
1144
+ slot_type=table_slot,
932
1145
  slot=slot,
933
1146
  slot_schema=slot_schema,
934
1147
  var_fields=self._var_fields,
935
1148
  cat_vectors=self._cat_vectors,
936
1149
  slots=self._slots,
937
1150
  )
1151
+
938
1152
  self._columns_field = self._var_fields
939
1153
 
940
1154
 
1155
+ @doc_args(SLOTS_DETAILS_DOCSTRING)
941
1156
  class TiledbsomaExperimentCurator(SlotsCurator):
942
1157
  """Curator for `tiledbsoma.Experiment`.
943
1158
 
1159
+ {}
1160
+
944
1161
  Args:
945
1162
  dataset: The `tiledbsoma.Experiment` object.
946
1163
  schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
@@ -977,7 +1194,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
977
1194
  .drop("soma_joinid", axis=1, errors="ignore")
978
1195
  )
979
1196
 
980
- self._slots[slot] = DataFrameCurator(
1197
+ self._slots[slot] = ComponentCurator(
981
1198
  (schema_dataset.T if modality_slot == "var.T" else schema_dataset),
982
1199
  slot_schema,
983
1200
  )
@@ -990,7 +1207,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
990
1207
  .to_pandas()
991
1208
  .drop(["soma_joinid", "obs_id"], axis=1, errors="ignore")
992
1209
  )
993
- self._slots[slot] = DataFrameCurator(
1210
+ self._slots[slot] = ComponentCurator(
994
1211
  schema_dataset,
995
1212
  slot_schema,
996
1213
  )
@@ -1040,9 +1257,12 @@ class CatVector:
1040
1257
  self._maximal_set = maximal_set
1041
1258
 
1042
1259
  self._all_filters = {"source": self._source, "organism": self._organism}
1260
+
1043
1261
  if self._subtype_str and "=" in self._subtype_str:
1044
1262
  self._all_filters.update(
1045
- resolve_relation_filters(parse_filter_string(self._subtype_str), self) # type: ignore
1263
+ resolve_relation_filters(
1264
+ parse_filter_string(self._subtype_str), self._field.field.model
1265
+ ) # type: ignore
1046
1266
  )
1047
1267
 
1048
1268
  if hasattr(field.field.model, "_name_field"):
@@ -1241,7 +1461,7 @@ class CatVector:
1241
1461
  type_record = registry.get(name=self._subtype_str)
1242
1462
  if df is not None and registry == Feature:
1243
1463
  nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
1244
- non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
1464
+ non_validated_records = Feature.from_dataframe(df.loc[:, nonval_columns])
1245
1465
  else:
1246
1466
  if (
1247
1467
  self._organism
@@ -1343,7 +1563,7 @@ class CatVector:
1343
1563
  warning_message += "\n for remaining terms:\n"
1344
1564
  warning_message += f" → fix typos, remove non-existent values, or save terms via: {colors.cyan(non_validated_hint_print)}"
1345
1565
  if self._subtype_query_set is not None:
1346
- warning_message += f"\n → a valid label for subtype '{self._subtype_str}' has to be one of {self._subtype_query_set.list('name')}"
1566
+ warning_message += f"\n → a valid label for subtype '{self._subtype_str}' has to be one of {self._subtype_query_set.to_list('name')}"
1347
1567
  logger.info(f'mapping "{self._key}" on {colors.italic(model_field)}')
1348
1568
  logger.warning(warning_message)
1349
1569
  if self._cat_manager is not None:
@@ -1493,6 +1713,30 @@ class DataFrameCatManager:
1493
1713
  """The categorical features."""
1494
1714
  return self._categoricals
1495
1715
 
1716
def __repr__(self) -> str:
    """Return a one-line, color-coded summary of this manager.

    The summary is the class name followed, in parentheses, by the number of
    categorical features (omitted when there are none), the slot (omitted
    when it is unset or empty) and the validation status.
    """
    header = colors.green(type(self).__name__)

    if self._is_validated:
        state = f"{colors.green('validated')}"
    else:
        state = f"{colors.yellow('unvalidated')}"

    # Optional details come first; the validation status always closes the list.
    fields = []
    n_categoricals = len(self._categoricals)
    if n_categoricals > 0:
        fields.append(f"categorical_features={n_categoricals}")
    if self._slot:
        fields.append(f"slot: {colors.italic(self._slot)}")
    fields.append(state)

    return f"{header}({', '.join(fields)})"
1739
+
1496
1740
  def lookup(self, public: bool = False) -> CatLookup:
1497
1741
  """Lookup categories.
1498
1742
 
@@ -1537,7 +1781,9 @@ class DataFrameCatManager:
1537
1781
  key: The key referencing the column in the DataFrame to standardize.
1538
1782
  """
1539
1783
  if self._artifact is not None:
1540
- raise RuntimeError("can't mutate the dataset when an artifact is passed!")
1784
+ raise RuntimeError(
1785
+ "Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator."
1786
+ )
1541
1787
 
1542
1788
  if key == "all":
1543
1789
  logger.warning(
@@ -1610,7 +1856,7 @@ def get_organism_kwargs(
1610
1856
  def annotate_artifact(
1611
1857
  artifact: Artifact,
1612
1858
  *,
1613
- curator: AnnDataCurator | SlotsCurator | None = None,
1859
+ curator: SlotsCurator | None = None,
1614
1860
  cat_vectors: dict[str, CatVector] | None = None,
1615
1861
  ) -> Artifact:
1616
1862
  from .. import settings
@@ -1643,7 +1889,9 @@ def annotate_artifact(
1643
1889
  )
1644
1890
 
1645
1891
  # annotate with inferred schemas aka feature sets
1646
- if artifact.otype == "DataFrame":
1892
+ if (
1893
+ artifact.otype == "DataFrame" and getattr(curator, "_schema", None) is None
1894
+ ): # Prevent overwriting user-defined schemas that contain slots
1647
1895
  features = cat_vectors["columns"].records
1648
1896
  if features is not None:
1649
1897
  index_feature = artifact.schema.index
@@ -1663,7 +1911,11 @@ def annotate_artifact(
1663
1911
  logger.important(
1664
1912
  f"not annotating with {len(features)} features as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)"
1665
1913
  )
1666
- itype = parse_cat_dtype(artifact.schema.itype, is_itype=True)["field"]
1914
+ itype = (
1915
+ Feature.name
1916
+ if artifact.schema.itype == "Composite"
1917
+ else parse_cat_dtype(artifact.schema.itype, is_itype=True)["field"]
1918
+ )
1667
1919
  feature_set = Schema(itype=itype, n=len(features))
1668
1920
  artifact.feature_sets.add(
1669
1921
  feature_set.save(), through_defaults={"slot": "columns"}
@@ -1698,9 +1950,13 @@ def annotate_artifact(
1698
1950
  logger.important(
1699
1951
  f"not annotating with {len(features)} features for slot {slot} as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)"
1700
1952
  )
1701
- itype = parse_cat_dtype(
1702
- artifact.schema.slots[slot].itype, is_itype=True
1703
- )["field"]
1953
+ itype = (
1954
+ Feature.name
1955
+ if artifact.schema.slots[slot].itype == "Composite"
1956
+ else parse_cat_dtype(
1957
+ artifact.schema.slots[slot].itype, is_itype=True
1958
+ )["field"]
1959
+ )
1704
1960
  feature_set = Schema(itype=itype, n=len(features))
1705
1961
  artifact.feature_sets.add(
1706
1962
  feature_set.save(), through_defaults={"slot": slot}