lamindb 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. lamindb/__init__.py +14 -5
  2. lamindb/_artifact.py +174 -57
  3. lamindb/_can_curate.py +27 -8
  4. lamindb/_collection.py +85 -51
  5. lamindb/_feature.py +177 -41
  6. lamindb/_finish.py +222 -81
  7. lamindb/_from_values.py +83 -98
  8. lamindb/_parents.py +4 -4
  9. lamindb/_query_set.py +59 -17
  10. lamindb/_record.py +171 -53
  11. lamindb/_run.py +4 -4
  12. lamindb/_save.py +33 -10
  13. lamindb/_schema.py +135 -38
  14. lamindb/_storage.py +1 -1
  15. lamindb/_tracked.py +106 -0
  16. lamindb/_transform.py +21 -8
  17. lamindb/_ulabel.py +5 -14
  18. lamindb/base/validation.py +2 -6
  19. lamindb/core/__init__.py +13 -14
  20. lamindb/core/_context.py +39 -36
  21. lamindb/core/_data.py +29 -25
  22. lamindb/core/_describe.py +1 -1
  23. lamindb/core/_django.py +1 -1
  24. lamindb/core/_feature_manager.py +54 -44
  25. lamindb/core/_label_manager.py +4 -4
  26. lamindb/core/_mapped_collection.py +20 -7
  27. lamindb/core/datasets/__init__.py +6 -1
  28. lamindb/core/datasets/_core.py +12 -11
  29. lamindb/core/datasets/_small.py +66 -20
  30. lamindb/core/exceptions.py +1 -90
  31. lamindb/core/loaders.py +7 -13
  32. lamindb/core/relations.py +6 -4
  33. lamindb/core/storage/_anndata_accessor.py +41 -0
  34. lamindb/core/storage/_backed_access.py +2 -2
  35. lamindb/core/storage/_pyarrow_dataset.py +25 -15
  36. lamindb/core/storage/_tiledbsoma.py +56 -12
  37. lamindb/core/storage/paths.py +41 -22
  38. lamindb/core/subsettings/_creation_settings.py +4 -16
  39. lamindb/curators/__init__.py +2168 -833
  40. lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
  41. lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
  42. lamindb/errors.py +96 -0
  43. lamindb/integrations/_vitessce.py +3 -3
  44. lamindb/migrations/0069_squashed.py +76 -75
  45. lamindb/migrations/0075_lamindbv1_part5.py +4 -5
  46. lamindb/migrations/0082_alter_feature_dtype.py +21 -0
  47. lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
  48. lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
  49. lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
  50. lamindb/migrations/0086_various.py +95 -0
  51. lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
  52. lamindb/migrations/0088_schema_components.py +273 -0
  53. lamindb/migrations/0088_squashed.py +4372 -0
  54. lamindb/models.py +423 -156
  55. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/METADATA +10 -7
  56. lamindb-1.1.0.dist-info/RECORD +95 -0
  57. lamindb/curators/_spatial.py +0 -528
  58. lamindb/migrations/0052_squashed.py +0 -1261
  59. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
  60. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
  61. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
  62. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
  63. lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
  64. lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
  65. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
  66. lamindb/migrations/0060_alter_artifact__actions.py +0 -22
  67. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
  68. lamindb/migrations/0062_add_is_latest_field.py +0 -32
  69. lamindb/migrations/0063_populate_latest_field.py +0 -45
  70. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
  71. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
  72. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
  73. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
  74. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
  75. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
  76. lamindb-1.0.4.dist-info/RECORD +0 -102
  77. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
  78. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
@@ -1,21 +1,52 @@
1
+ """Curators.
2
+
3
+ .. autosummary::
4
+ :toctree: .
5
+
6
+ Curator
7
+ DataFrameCurator
8
+ AnnDataCurator
9
+
10
+ """
11
+
1
12
  from __future__ import annotations
2
13
 
3
14
  import copy
4
- import warnings
15
+ import random
16
+ import re
17
+ from importlib import resources
5
18
  from itertools import chain
6
- from typing import TYPE_CHECKING
19
+ from typing import TYPE_CHECKING, Any, Literal
7
20
 
8
21
  import anndata as ad
9
22
  import lamindb_setup as ln_setup
10
23
  import pandas as pd
24
+ import pandera
11
25
  import pyarrow as pa
12
26
  from lamin_utils import colors, logger
27
+ from lamindb_setup.core import deprecated, upath
13
28
  from lamindb_setup.core._docs import doc_args
14
29
  from lamindb_setup.core.upath import UPath
15
30
 
31
+ from lamindb.core.storage._backed_access import backed_access
32
+
33
+ from ._cellxgene_schemas import _read_schema_versions
34
+
35
+ if TYPE_CHECKING:
36
+ from anndata import AnnData
37
+ from lamindb_setup.core.types import UPathStr
38
+
39
+ from lamindb.base.types import FieldAttr
40
+ from lamindb.models import Record
41
+ from lamindb._feature import parse_dtype, parse_dtype_single_cat
16
42
  from lamindb.base.types import FieldAttr # noqa
43
+ from lamindb.core._data import add_labels
44
+ from lamindb.core._feature_manager import parse_staged_feature_sets_from_anndata
45
+ from lamindb.core._settings import settings
17
46
  from lamindb.models import (
18
47
  Artifact,
48
+ CanCurate,
49
+ Collection,
19
50
  Feature,
20
51
  Record,
21
52
  Run,
@@ -23,15 +54,25 @@ from lamindb.models import (
23
54
  ULabel,
24
55
  )
25
56
 
57
+ from .._artifact import data_is_anndata
26
58
  from .._from_values import _format_values
27
- from ..core.exceptions import ValidationError
59
+ from ..errors import InvalidArgument, ValidationError
28
60
 
29
61
  if TYPE_CHECKING:
30
- from collections.abc import Iterable
62
+ from collections.abc import Iterable, MutableMapping
31
63
  from typing import Any
32
64
 
33
65
  from lamindb_setup.core.types import UPathStr
34
66
  from mudata import MuData
67
+ from spatialdata import SpatialData
68
+
69
+ from lamindb._query_set import RecordList
70
+
71
+
72
+ def strip_ansi_codes(text):
73
+ # This pattern matches ANSI escape sequences
74
+ ansi_pattern = re.compile(r"\x1b\[[0-9;]*m")
75
+ return ansi_pattern.sub("", text)
35
76
 
36
77
 
37
78
  class CurateLookup:
@@ -40,8 +81,6 @@ class CurateLookup:
40
81
  Args:
41
82
  categoricals: A dictionary of categorical fields to lookup.
42
83
  slots: A dictionary of slot fields to lookup.
43
- using_key: The key of the instance to lookup from. Defaults to the
44
- current instance if not specified.
45
84
  public: Whether to lookup from the public instance. Defaults to False.
46
85
 
47
86
  Example:
@@ -55,48 +94,43 @@ class CurateLookup:
55
94
  self,
56
95
  categoricals: dict[str, FieldAttr],
57
96
  slots: dict[str, FieldAttr] = None,
58
- using_key: str | None = None,
59
97
  public: bool = False,
60
98
  ) -> None:
61
99
  slots = slots or {}
62
- self._fields = {**categoricals, **slots}
63
- self._using_key = None if using_key == "default" else using_key
64
- self._using_key_name = self._using_key or ln_setup.settings.instance.slug
100
+ self._categoricals = {**categoricals, **slots}
65
101
  self._public = public
66
- debug_message = f"Lookup objects from {colors.italic(self._using_key_name)}"
67
- logger.debug(debug_message)
68
102
 
69
103
  def __getattr__(self, name):
70
- if name in self._fields:
71
- registry = self._fields[name].field.model
104
+ if name in self._categoricals:
105
+ registry = self._categoricals[name].field.model
72
106
  if self._public and hasattr(registry, "public"):
73
107
  return registry.public().lookup()
74
108
  else:
75
- return get_registry_instance(registry, self._using_key).lookup()
109
+ return registry.lookup()
76
110
  raise AttributeError(
77
111
  f'"{self.__class__.__name__}" object has no attribute "{name}"'
78
112
  )
79
113
 
80
114
  def __getitem__(self, name):
81
- if name in self._fields:
82
- registry = self._fields[name].field.model
115
+ if name in self._categoricals:
116
+ registry = self._categoricals[name].field.model
83
117
  if self._public and hasattr(registry, "public"):
84
118
  return registry.public().lookup()
85
119
  else:
86
- return get_registry_instance(registry, self._using_key).lookup()
120
+ return registry.lookup()
87
121
  raise AttributeError(
88
122
  f'"{self.__class__.__name__}" object has no attribute "{name}"'
89
123
  )
90
124
 
91
125
  def __repr__(self) -> str:
92
- if len(self._fields) > 0:
126
+ if len(self._categoricals) > 0:
93
127
  getattr_keys = "\n ".join(
94
- [f".{key}" for key in self._fields if key.isidentifier()]
128
+ [f".{key}" for key in self._categoricals if key.isidentifier()]
95
129
  )
96
130
  getitem_keys = "\n ".join(
97
- [str([key]) for key in self._fields if not key.isidentifier()]
131
+ [str([key]) for key in self._categoricals if not key.isidentifier()]
98
132
  )
99
- ref = "public" if self._public else self._using_key_name
133
+ ref = "public" if self._public else "registries"
100
134
  return (
101
135
  f"Lookup objects from the {colors.italic(ref)}:\n "
102
136
  f"{colors.green(getattr_keys)}\n "
@@ -105,21 +139,422 @@ class CurateLookup:
105
139
  " → categories.alveolar_type_1_fibroblast_cell\n\n"
106
140
  "To look up public ontologies, use .lookup(public=True)"
107
141
  )
108
- else: # pragma: no cover
142
+ else: # pragma: no cover
109
143
  return colors.warning("No fields are found!")
110
144
 
111
145
 
112
- class BaseCurator:
113
- """Curate a dataset."""
146
+ CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries."""
147
+
148
+
149
+ VALIDATE_DOCSTRING = """Validate dataset.
150
+
151
+ Raises:
152
+ lamindb.errors.ValidationError: If validation fails.
153
+ """
154
+
155
+ SAVE_ARTIFACT_DOCSTRING = """Save an annotated artifact.
156
+
157
+ Args:
158
+ key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
159
+ description: A description.
160
+ revises: Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
161
+ run: The run that creates the artifact.
162
+
163
+ Returns:
164
+ A saved artifact record.
165
+ """
166
+
167
+
168
+ class Curator:
169
+ """Dataset curator.
170
+
171
+ A `Curator` object makes it easy to validate, standardize & annotate datasets.
172
+
173
+ See:
174
+ - :class:`~lamindb.curators.DataFrameCurator`
175
+ - :class:`~lamindb.curators.AnnDataCurator`
176
+ """
177
+
178
+ def __init__(self, dataset: Any, schema: Schema | None = None):
179
+ self._artifact: Artifact = None # pass the dataset as an artifact
180
+ self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
181
+ if isinstance(self._dataset, Artifact):
182
+ self._artifact = self._dataset
183
+ if self._artifact.otype in {"DataFrame", "AnnData"}:
184
+ self._dataset = self._dataset.load()
185
+ self._schema: Schema | None = schema
186
+ self._is_validated: bool = False
187
+ self._cat_manager: CatManager = None # is None for CatManager curators
188
+
189
+ @doc_args(VALIDATE_DOCSTRING)
190
+ def validate(self) -> bool | str:
191
+ """{}""" # noqa: D415
192
+ pass # pragma: no cover
193
+
194
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
195
+ def save_artifact(
196
+ self,
197
+ *,
198
+ key: str | None = None,
199
+ description: str | None = None,
200
+ revises: Artifact | None = None,
201
+ run: Run | None = None,
202
+ ) -> Artifact:
203
+ """{}""" # noqa: D415
204
+ # Note that this docstring has to be consistent with the Artifact()
205
+ # constructor signature
206
+ pass
207
+
208
+
209
+ class DataFrameCurator(Curator):
210
+ # the example in the docstring is tested in test_curators_quickstart_example
211
+ """Curator for a DataFrame object.
212
+
213
+ See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
214
+
215
+ Args:
216
+ dataset: The DataFrame-like object to validate & annotate.
217
+ schema: A `Schema` object that defines the validation constraints.
218
+
219
+ Example::
220
+
221
+ import lamindb as ln
222
+ import bionty as bt
223
+
224
+ # define valid labels
225
+ cell_medium = ln.ULabel(name="CellMedium", is_type=True).save()
226
+ ln.ULabel(name="DMSO", type=cell_medium).save()
227
+ ln.ULabel(name="IFNG", type=cell_medium).save()
228
+ bt.CellType.from_source(name="B cell").save()
229
+ bt.CellType.from_source(name="T cell").save()
230
+
231
+ # define schema
232
+ schema = ln.Schema(
233
+ name="small_dataset1_obs_level_metadata",
234
+ features=[
235
+ ln.Feature(name="cell_medium", dtype="cat[ULabel[CellMedium]]").save(),
236
+ ln.Feature(name="sample_note", dtype=str).save(),
237
+ ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
238
+ ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
239
+ ],
240
+ ).save()
241
+
242
+ # curate a DataFrame
243
+ df = datasets.small_dataset1(otype="DataFrame")
244
+ curator = ln.curators.DataFrameCurator(df, schema)
245
+ artifact = curator.save_artifact(key="example_datasets/dataset1.parquet")
246
+ assert artifact.schema == schema
247
+ """
248
+
249
+ def __init__(
250
+ self,
251
+ dataset: pd.DataFrame | Artifact,
252
+ schema: Schema,
253
+ ) -> None:
254
+ super().__init__(dataset=dataset, schema=schema)
255
+ if schema.n > 0:
256
+ # populate features
257
+ pandera_columns = {}
258
+ categoricals = {}
259
+ for feature in schema.features.all():
260
+ pandera_dtype = (
261
+ feature.dtype if not feature.dtype.startswith("cat") else "category"
262
+ )
263
+ pandera_columns[feature.name] = pandera.Column(
264
+ pandera_dtype, nullable=feature.nullable
265
+ )
266
+ if feature.dtype.startswith("cat"):
267
+ categoricals[feature.name] = parse_dtype(feature.dtype)[0]["field"]
268
+ self._pandera_schema = pandera.DataFrameSchema(
269
+ pandera_columns, coerce=schema.coerce_dtype
270
+ )
271
+ # now deal with detailed validation of categoricals
272
+ self._cat_manager = DataFrameCatManager(
273
+ self._dataset,
274
+ categoricals=categoricals,
275
+ )
276
+ else:
277
+ assert schema.itype is not None # noqa: S101
278
+
279
+ @property
280
+ @doc_args(CAT_MANAGER_DOCSTRING)
281
+ def cat(self) -> CatManager:
282
+ """{}""" # noqa: D415
283
+ return self._cat_manager
284
+
285
+ def standardize(self) -> None:
286
+ """Standardize the dataset.
287
+
288
+ - Adds missing columns if a default value for a feature is defined.
289
+ - Fills missing values with the default value if a default value for a feature is defined.
290
+ """
291
+ for feature in self._schema.members:
292
+ if feature.name not in self._dataset.columns:
293
+ if feature.default_value is not None:
294
+ self._dataset[feature.name] = feature.default_value
295
+ else:
296
+ raise ValidationError(
297
+ f"Missing column {feature.name} cannot be added because no default value is defined for this feature"
298
+ )
299
+ else:
300
+ if feature.default_value is not None:
301
+ if isinstance(
302
+ self._dataset[feature.name].dtype, pd.CategoricalDtype
303
+ ):
304
+ if (
305
+ feature.default_value
306
+ not in self._dataset[feature.name].cat.categories
307
+ ):
308
+ self._dataset[feature.name] = self._dataset[
309
+ feature.name
310
+ ].cat.add_categories(feature.default_value)
311
+ self._dataset[feature.name] = self._dataset[feature.name].fillna(
312
+ feature.default_value
313
+ )
314
+
315
+ @doc_args(VALIDATE_DOCSTRING)
316
+ def validate(self) -> None:
317
+ """{}""" # noqa: D415
318
+ if self._schema.n > 0:
319
+ self._cat_manager.validate()
320
+ try:
321
+ self._pandera_schema.validate(self._dataset)
322
+ if self._cat_manager._is_validated:
323
+ self._is_validated = True
324
+ else:
325
+ self._is_validated = False
326
+ raise ValidationError(
327
+ self._cat_manager._validate_category_error_messages
328
+ )
329
+ except pandera.errors.SchemaError as err:
330
+ self._is_validated = False
331
+ # .exconly() doesn't exist on SchemaError
332
+ raise ValidationError(str(err)) from err
333
+ else:
334
+ result = parse_dtype_single_cat(self._schema.itype, is_itype=True)
335
+ registry: CanCurate = result["registry"]
336
+ inspector = registry.inspect(
337
+ self._dataset.columns,
338
+ result["field"],
339
+ mute=True,
340
+ )
341
+ if len(inspector.non_validated) > 0:
342
+ # also check public ontology
343
+ if hasattr(registry, "public"):
344
+ registry.from_values(
345
+ inspector.non_validated, result["field"], mute=True
346
+ ).save()
347
+ inspector = registry.inspect(
348
+ inspector.non_validated, result["field"], mute=True
349
+ )
350
+ if len(inspector.non_validated) > 0:
351
+ self._is_validated = False
352
+ raise ValidationError(
353
+ f"Invalid identifiers for {self._schema.itype}: {inspector.non_validated}"
354
+ )
355
+
356
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
357
+ def save_artifact(
358
+ self,
359
+ *,
360
+ key: str | None = None,
361
+ description: str | None = None,
362
+ revises: Artifact | None = None,
363
+ run: Run | None = None,
364
+ ):
365
+ """{}""" # noqa: D415
366
+ if not self._is_validated:
367
+ self.validate() # raises ValidationError if doesn't validate
368
+ result = parse_dtype_single_cat(self._schema.itype, is_itype=True)
369
+ return save_artifact( # type: ignore
370
+ self._dataset,
371
+ description=description,
372
+ fields=self._cat_manager.categoricals,
373
+ columns_field=result["field"],
374
+ key=key,
375
+ artifact=self._artifact,
376
+ revises=revises,
377
+ run=run,
378
+ schema=self._schema,
379
+ )
380
+
381
+
382
+ class AnnDataCurator(Curator):
383
+ # the example in the docstring is tested in test_curators_quickstart_example
384
+ """Curator for an AnnData object.
385
+
386
+ See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
387
+
388
+ Args:
389
+ dataset: The AnnData-like object to validate & annotate.
390
+ schema: A `Schema` object that defines the validation constraints.
391
+
392
+ Example::
393
+
394
+ import lamindb as ln
395
+ import bionty as bt
396
+
397
+ # define valid labels
398
+ cell_medium = ln.ULabel(name="CellMedium", is_type=True).save()
399
+ ln.ULabel(name="DMSO", type=cell_medium).save()
400
+ ln.ULabel(name="IFNG", type=cell_medium).save()
401
+ bt.CellType.from_source(name="B cell").save()
402
+ bt.CellType.from_source(name="T cell").save()
403
+
404
+ # define obs schema
405
+ obs_schema = ln.Schema(
406
+ name="small_dataset1_obs_level_metadata",
407
+ features=[
408
+ ln.Feature(name="cell_medium", dtype="cat[ULabel[CellMedium]]").save(),
409
+ ln.Feature(name="sample_note", dtype=str).save(),
410
+ ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
411
+ ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
412
+ ],
413
+ ).save()
414
+
415
+ # define var schema
416
+ var_schema = ln.Schema(
417
+ name="scRNA_seq_var_schema",
418
+ itype=bt.Gene.ensembl_gene_id,
419
+ dtype="num",
420
+ ).save()
421
+
422
+ # define composite schema
423
+ anndata_schema = ln.Schema(
424
+ name="small_dataset1_anndata_schema",
425
+ otype="AnnData",
426
+ components={"obs": obs_schema, "var": var_schema},
427
+ ).save()
428
+
429
+ # curate an AnnData
430
+ adata = datasets.small_dataset1(otype="AnnData")
431
+ curator = ln.curators.AnnDataCurator(adata, anndata_schema)
432
+ artifact = curator.save_artifact(key="example_datasets/dataset1.h5ad")
433
+ assert artifact.schema == anndata_schema
434
+ """
435
+
436
+ def __init__(
437
+ self,
438
+ dataset: AnnData | Artifact,
439
+ schema: Schema,
440
+ ) -> None:
441
+ super().__init__(dataset=dataset, schema=schema)
442
+ if not data_is_anndata(self._dataset):
443
+ raise InvalidArgument("dataset must be AnnData-like.")
444
+ if schema.otype != "AnnData":
445
+ raise InvalidArgument("Schema otype must be 'AnnData'.")
446
+ self._obs_curator = DataFrameCurator(
447
+ self._dataset.obs, schema._get_component("obs")
448
+ )
449
+ self._var_curator = DataFrameCurator(
450
+ self._dataset.var.T, schema._get_component("var")
451
+ )
452
+
453
+ @doc_args(VALIDATE_DOCSTRING)
454
+ def validate(self) -> None:
455
+ """{}""" # noqa: D415
456
+ self._obs_curator.validate()
457
+ self._var_curator.validate()
458
+ self._is_validated = True
459
+
460
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
461
+ def save_artifact(self, *, key=None, description=None, revises=None, run=None):
462
+ """{}""" # noqa: D415
463
+ if not self._is_validated:
464
+ self.validate() # raises ValidationError if doesn't validate
465
+ result = parse_dtype_single_cat(self._var_curator._schema.itype, is_itype=True)
466
+ return save_artifact( # type: ignore
467
+ self._dataset,
468
+ description=description,
469
+ fields=self._obs_curator._cat_manager.categoricals,
470
+ columns_field=result["field"],
471
+ key=key,
472
+ artifact=self._artifact,
473
+ revises=revises,
474
+ run=run,
475
+ schema=self._schema,
476
+ )
477
+
478
+
479
+ class CatManager:
480
+ """Manage valid categoricals by updating registries.
481
+
482
+ A `CatManager` object makes it easy to validate, standardize & annotate datasets.
483
+
484
+ Example:
485
+
486
+ >>> cat_manager = ln.CatManager(
487
+ >>> dataset,
488
+ >>> # define validation criteria as mappings
489
+ >>> columns=Feature.name, # map column names
490
+ >>> categoricals={"perturbation": ULabel.name}, # map categories
491
+ >>> )
492
+ >>> cat_manager.validate() # validate the dataframe
493
+ >>> artifact = cat_manager.save_artifact(description="my RNA-seq")
494
+ >>> artifact.describe() # see annotations
495
+
496
+ `cat_manager.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
497
+
498
+ If you find non-validated values, you have several options:
499
+
500
+ - new values found in the data can be registered using :meth:`~lamindb.core.DataFrameCatManager.add_new_from`
501
+ - non-validated values can be accessed using :meth:`~lamindb.core.DataFrameCatManager.non_validated` and addressed manually
502
+ """
503
+
504
+ def __init__(
505
+ self, *, dataset, categoricals, sources, organism, exclude, columns_field=None
506
+ ):
507
+ # the below is shared with Curator
508
+ self._artifact: Artifact = None # pass the dataset as an artifact
509
+ self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
510
+ if isinstance(self._dataset, Artifact):
511
+ self._artifact = self._dataset
512
+ if self._artifact.otype in {"DataFrame", "AnnData"}:
513
+ self._dataset = self._dataset.load()
514
+ self._is_validated: bool = False
515
+ # shared until here
516
+ self._categoricals = categoricals or {}
517
+ self._non_validated = None
518
+ self._organism = organism
519
+ self._sources = sources or {}
520
+ self._exclude = exclude or {}
521
+ self._columns_field = columns_field
522
+ self._validate_category_error_messages: str = ""
523
+
524
+ @property
525
+ def non_validated(self) -> dict[str, list[str]]:
526
+ """Return the non-validated features and labels."""
527
+ if self._non_validated is None:
528
+ raise ValidationError("Please run validate() first!")
529
+ return self._non_validated
114
530
 
115
- def __init_subclass__(cls, **kwargs):
116
- super().__init_subclass__(**kwargs)
117
- import sys
531
+ @property
532
+ def categoricals(self) -> dict:
533
+ """Return the columns fields to validate against."""
534
+ return self._categoricals
118
535
 
119
- # Deprecated methods
120
- if "sphinx" not in sys.modules:
121
- if hasattr(cls, "_add_new_from_columns"):
122
- cls.add_new_from_columns = cls._add_new_from_columns
536
+ def _replace_synonyms(
537
+ self, key: str, syn_mapper: dict, values: pd.Series | pd.Index
538
+ ):
539
+ # replace the values in df
540
+ std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val))
541
+ # remove the standardized values from self.non_validated
542
+ non_validated = [i for i in self.non_validated[key] if i not in syn_mapper]
543
+ if len(non_validated) == 0:
544
+ self._non_validated.pop(key, None) # type: ignore
545
+ else:
546
+ self._non_validated[key] = non_validated # type: ignore
547
+ # logging
548
+ n = len(syn_mapper)
549
+ if n > 0:
550
+ syn_mapper_print = _format_values(
551
+ [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
552
+ )
553
+ s = "s" if n > 1 else ""
554
+ logger.success(
555
+ f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}'
556
+ )
557
+ return std_values
123
558
 
124
559
  def validate(self) -> bool:
125
560
  """Validate dataset.
@@ -127,9 +562,9 @@ class BaseCurator:
127
562
  This method also registers the validated records in the current instance.
128
563
 
129
564
  Returns:
130
- Boolean indicating whether the dataset is validated.
565
+ The boolean `True` if the dataset is validated. Otherwise, a string with the error message.
131
566
  """
132
- pass # pragma: no cover
567
+ pass
133
568
 
134
569
  def standardize(self, key: str) -> None:
135
570
  """Replace synonyms with standardized values.
@@ -142,30 +577,48 @@ class BaseCurator:
142
577
  Returns:
143
578
  None
144
579
  """
145
- pass # pragma: no cover
580
+ pass # pragma: no cover
146
581
 
582
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
147
583
  def save_artifact(
148
584
  self,
149
- description: str | None = None,
585
+ *,
150
586
  key: str | None = None,
587
+ description: str | None = None,
151
588
  revises: Artifact | None = None,
152
589
  run: Run | None = None,
153
590
  ) -> Artifact:
154
- """Save the dataset as artifact.
591
+ """{}""" # noqa: D415
592
+ from lamindb.core._settings import settings
155
593
 
156
- Args:
157
- description: A description of the DataFrame object.
158
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
159
- revises: Previous version of the artifact. Triggers a revision.
160
- run: The run that creates the artifact.
594
+ if not self._is_validated:
595
+ self.validate() # returns True or False
596
+ if not self._is_validated: # need to raise error manually
597
+ raise ValidationError("Dataset does not validate. Please curate.")
161
598
 
162
- Returns:
163
- A saved artifact record.
164
- """
165
- pass # pragma: no cover
599
+ # Make sure all labels are saved in the current instance
600
+ verbosity = settings.verbosity
601
+ try:
602
+ settings.verbosity = "warning"
603
+ self._artifact = save_artifact( # type: ignore
604
+ self._dataset,
605
+ description=description,
606
+ fields=self.categoricals,
607
+ columns_field=self._columns_field,
608
+ key=key,
609
+ artifact=self._artifact,
610
+ revises=revises,
611
+ run=run,
612
+ schema=None,
613
+ organism=self._organism,
614
+ )
615
+ finally:
616
+ settings.verbosity = verbosity
617
+
618
+ return self._artifact
166
619
 
167
620
 
168
- class DataFrameCurator(BaseCurator):
621
+ class DataFrameCatManager(CatManager):
169
622
  """Curation flow for a DataFrame object.
170
623
 
171
624
  See also :class:`~lamindb.Curator`.
@@ -174,7 +627,6 @@ class DataFrameCurator(BaseCurator):
174
627
  df: The DataFrame object to curate.
175
628
  columns: The field attribute for the feature column.
176
629
  categoricals: A dictionary mapping column names to registry_field.
177
- using_key: The reference instance containing registries to validate against.
178
630
  verbosity: The verbosity level.
179
631
  organism: The organism name.
180
632
  sources: A dictionary mapping column names to Source records.
@@ -191,165 +643,103 @@ class DataFrameCurator(BaseCurator):
191
643
  ... df,
192
644
  ... categoricals={
193
645
  ... "cell_type_ontology_id": bt.CellType.ontology_id,
194
- ... "donor_id": ln.ULabel.name
646
+ ... "donor_id": ULabel.name
195
647
  ... }
196
648
  ... )
197
649
  """
198
650
 
199
651
  def __init__(
200
652
  self,
201
- df: pd.DataFrame,
653
+ df: pd.DataFrame | Artifact,
202
654
  columns: FieldAttr = Feature.name,
203
655
  categoricals: dict[str, FieldAttr] | None = None,
204
- using_key: str | None = None,
205
656
  verbosity: str = "hint",
206
657
  organism: str | None = None,
207
658
  sources: dict[str, Record] | None = None,
208
659
  exclude: dict | None = None,
209
- check_valid_keys: bool = True,
210
660
  ) -> None:
211
661
  from lamindb.core._settings import settings
212
662
 
213
663
  if organism is not None and not isinstance(organism, str):
214
664
  raise ValueError("organism must be a string such as 'human' or 'mouse'!")
215
665
 
216
- self._df = df
217
- self._fields = categoricals or {}
218
- self._columns_field = columns
219
- self._using_key = using_key
220
- # TODO: change verbosity back
221
666
  settings.verbosity = verbosity
222
- self._artifact = None
223
- self._collection = None
224
- self._validated = False
225
- self._kwargs = {"organism": organism} if organism else {}
226
- self._sources = sources or {}
227
- self._exclude = exclude or {}
228
667
  self._non_validated = None
229
- if check_valid_keys:
230
- self._check_valid_keys()
668
+ super().__init__(
669
+ dataset=df,
670
+ columns_field=columns,
671
+ organism=organism,
672
+ categoricals=categoricals,
673
+ sources=sources,
674
+ exclude=exclude,
675
+ )
231
676
  self._save_columns()
232
677
 
233
- @property
234
- def non_validated(self) -> dict[str, list[str]]:
235
- """Return the non-validated features and labels."""
236
- if self._non_validated is None:
237
- raise ValidationError("Please run validate() first!")
238
- return self._non_validated
239
-
240
- @property
241
- def fields(self) -> dict:
242
- """Return the columns fields to validate against."""
243
- return self._fields
244
-
245
- def lookup(
246
- self, using_key: str | None = None, public: bool = False
247
- ) -> CurateLookup:
678
+ def lookup(self, public: bool = False) -> CurateLookup:
248
679
  """Lookup categories.
249
680
 
250
681
  Args:
251
- using_key: The instance where the lookup is performed.
252
- if "public", the lookup is performed on the public reference.
682
+ public: If "public", the lookup is performed on the public reference.
253
683
  """
254
684
  return CurateLookup(
255
- categoricals=self._fields,
685
+ categoricals=self._categoricals,
256
686
  slots={"columns": self._columns_field},
257
- using_key=using_key or self._using_key,
258
687
  public=public,
259
688
  )
260
689
 
261
- def _check_valid_keys(self, extra: set | None = None) -> None:
262
- extra = extra or set()
263
- for name, d in {
264
- "categoricals": self._fields,
265
- "sources": self._sources,
266
- "exclude": self._exclude,
267
- }.items():
268
- if not isinstance(d, dict):
269
- raise TypeError(f"{name} must be a dictionary!")
270
- valid_keys = set(self._df.columns) | {"columns"} | extra
271
- nonval_keys = [key for key in d.keys() if key not in valid_keys]
272
- n = len(nonval_keys)
273
- s = "s" if n > 1 else ""
274
- are = "are" if n > 1 else "is"
275
- if len(nonval_keys) > 0:
276
- raise ValidationError(
277
- f"key{s} passed to {name} {are} not present in columns: {colors.yellow(_format_values(nonval_keys))}"
278
- )
279
-
280
690
  def _save_columns(self, validated_only: bool = True) -> None:
281
691
  """Save column name records."""
282
692
  # Always save features specified as the fields keys
283
693
  update_registry(
284
- values=list(self.fields.keys()),
694
+ values=list(self.categoricals.keys()),
285
695
  field=self._columns_field,
286
696
  key="columns",
287
- using_key=self._using_key,
288
697
  validated_only=False,
289
698
  source=self._sources.get("columns"),
290
699
  exclude=self._exclude.get("columns"),
291
- **self._kwargs, # type: ignore
292
700
  )
293
701
 
294
702
  # Save the rest of the columns based on validated_only
295
- additional_columns = set(self._df.columns) - set(self.fields.keys())
703
+ additional_columns = set(self._dataset.columns) - set(self.categoricals.keys())
296
704
  if additional_columns:
297
705
  update_registry(
298
706
  values=list(additional_columns),
299
707
  field=self._columns_field,
300
708
  key="columns",
301
- using_key=self._using_key,
302
709
  validated_only=validated_only,
303
- df=self._df, # Get the Feature type from df
710
+ df=self._dataset, # Get the Feature type from df
304
711
  source=self._sources.get("columns"),
305
712
  exclude=self._exclude.get("columns"),
306
- **self._kwargs, # type: ignore
307
713
  )
308
714
 
309
- def add_new_from(self, key: str, organism: str | None = None, **kwargs):
310
- """Add validated & new categories.
715
+ @deprecated(new_name="is run by default")
716
+ def add_new_from_columns(self, organism: str | None = None, **kwargs):
717
+ pass
718
+
719
+ def validate(self) -> bool:
720
+ """Validate variables and categorical observations.
721
+
722
+ This method also registers the validated records in the current instance:
723
+ - from public sources
311
724
 
312
725
  Args:
313
- key: The key referencing the slot in the DataFrame from which to draw terms.
314
726
  organism: The organism name.
315
- **kwargs: Additional keyword arguments to pass to create new records
727
+
728
+ Returns:
729
+ Whether the DataFrame is validated.
316
730
  """
317
- if len(kwargs) > 0 and key == "all":
318
- raise ValueError("Cannot pass additional arguments to 'all' key!")
319
- self._kwargs.update({"organism": organism} if organism else {})
320
- self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)
321
-
322
- def _add_new_from_columns(self, organism: str | None = None, **kwargs):
323
- """Deprecated to run by default during init."""
324
- warnings.warn(
325
- "`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
326
- DeprecationWarning,
327
- stacklevel=2,
731
+ # add all validated records to the current instance
732
+ self._update_registry_all()
733
+ self._validate_category_error_messages = "" # reset the error messages
734
+ self._is_validated, self._non_validated = validate_categories_in_df( # type: ignore
735
+ self._dataset,
736
+ fields=self.categoricals,
737
+ sources=self._sources,
738
+ exclude=self._exclude,
739
+ curator=self,
740
+ organism=self._organism,
328
741
  )
329
- pass
330
-
331
- def _replace_synonyms(
332
- self, key: str, syn_mapper: dict, values: pd.Series | pd.Index
333
- ):
334
- # replace the values in df
335
- std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val))
336
- # remove the standardized values from self.non_validated
337
- non_validated = [i for i in self.non_validated[key] if i not in syn_mapper]
338
- if len(non_validated) == 0:
339
- self._non_validated.pop(key, None) # type: ignore
340
- else:
341
- self._non_validated[key] = non_validated # type: ignore
342
- # logging
343
- n = len(syn_mapper)
344
- if n > 0:
345
- syn_mapper_print = _format_values(
346
- [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
347
- )
348
- s = "s" if n > 1 else ""
349
- logger.success(
350
- f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}'
351
- )
352
- return std_values
742
+ return self._is_validated
353
743
 
354
744
  def standardize(self, key: str) -> None:
355
745
  """Replace synonyms with standardized values.
@@ -359,6 +749,8 @@ class DataFrameCurator(BaseCurator):
359
749
  Args:
360
750
  key: The key referencing the column in the DataFrame to standardize.
361
751
  """
752
+ if self._artifact is not None:
753
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
362
754
  # list is needed to avoid RuntimeError: dictionary changed size during iteration
363
755
  avail_keys = list(self.non_validated.keys())
364
756
  if len(avail_keys) == 0:
@@ -367,137 +759,74 @@ class DataFrameCurator(BaseCurator):
367
759
 
368
760
  if key == "all":
369
761
  for k in avail_keys:
370
- if k in self._fields: # needed to exclude var_index
762
+ if k in self._categoricals: # needed to exclude var_index
371
763
  syn_mapper = standardize_categories(
372
764
  self.non_validated[k],
373
- field=self._fields[k],
374
- using_key=self._using_key,
765
+ field=self._categoricals[k],
375
766
  source=self._sources.get(k),
376
- **self._kwargs,
377
767
  )
378
- self._df[k] = self._replace_synonyms(k, syn_mapper, self._df[k])
768
+ self._dataset[k] = self._replace_synonyms(
769
+ k, syn_mapper, self._dataset[k]
770
+ )
379
771
  else:
380
772
  if key not in avail_keys:
381
- if key in self._fields:
773
+ if key in self._categoricals:
382
774
  logger.info(f"No unstandardized values found for {key!r}")
383
775
  else:
384
776
  raise KeyError(
385
777
  f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys)}!"
386
778
  )
387
779
  else:
388
- if key in self._fields: # needed to exclude var_index
780
+ if key in self._categoricals: # needed to exclude var_index
389
781
  syn_mapper = standardize_categories(
390
782
  self.non_validated[key],
391
- field=self._fields[key],
392
- using_key=self._using_key,
783
+ field=self._categoricals[key],
393
784
  source=self._sources.get(key),
394
- **self._kwargs,
785
+ organism=self._organism,
395
786
  )
396
- self._df[key] = self._replace_synonyms(
397
- key, syn_mapper, self._df[key]
787
+ self._dataset[key] = self._replace_synonyms(
788
+ key, syn_mapper, self._dataset[key]
398
789
  )
399
790
 
791
+ def _update_registry_all(self, validated_only: bool = True, **kwargs):
792
+ """Save labels for all features."""
793
+ for name in self.categoricals.keys():
794
+ self._update_registry(name, validated_only=validated_only, **kwargs)
795
+
400
796
  def _update_registry(
401
797
  self, categorical: str, validated_only: bool = True, **kwargs
402
798
  ) -> None:
403
799
  if categorical == "all":
404
800
  self._update_registry_all(validated_only=validated_only, **kwargs)
405
801
  else:
406
- if categorical not in self.fields:
802
+ if categorical not in self.categoricals:
407
803
  raise ValidationError(
408
804
  f"Feature {categorical} is not part of the fields!"
409
805
  )
410
806
  update_registry(
411
- values=_flatten_unique(self._df[categorical]),
412
- field=self.fields[categorical],
807
+ values=_flatten_unique(self._dataset[categorical]),
808
+ field=self.categoricals[categorical],
413
809
  key=categorical,
414
- using_key=self._using_key,
415
810
  validated_only=validated_only,
416
811
  source=self._sources.get(categorical),
417
812
  exclude=self._exclude.get(categorical),
418
- **kwargs,
813
+ organism=self._organism,
419
814
  )
420
815
  # adding new records removes them from non_validated
421
816
  if not validated_only and self._non_validated:
422
817
  self._non_validated.pop(categorical, None) # type: ignore
423
818
 
424
- def _update_registry_all(self, validated_only: bool = True, **kwargs):
425
- """Save labels for all features."""
426
- for name in self.fields.keys():
427
- self._update_registry(name, validated_only=validated_only, **kwargs)
428
-
429
- def validate(self, organism: str | None = None) -> bool:
430
- """Validate variables and categorical observations.
431
-
432
- This method also registers the validated records in the current instance:
433
- - from public sources
434
- - from the using_key instance
819
+ def add_new_from(self, key: str, **kwargs):
820
+ """Add validated & new categories.
435
821
 
436
822
  Args:
823
+ key: The key referencing the slot in the DataFrame from which to draw terms.
437
824
  organism: The organism name.
438
-
439
- Returns:
440
- Whether the DataFrame is validated.
441
- """
442
- self._kwargs.update({"organism": organism} if organism else {})
443
-
444
- # add all validated records to the current instance
445
- self._update_registry_all()
446
-
447
- self._validated, self._non_validated = validate_categories_in_df( # type: ignore
448
- self._df,
449
- fields=self.fields,
450
- using_key=self._using_key,
451
- sources=self._sources,
452
- exclude=self._exclude,
453
- **self._kwargs,
454
- )
455
- return self._validated
456
-
457
- def save_artifact(
458
- self,
459
- description: str | None = None,
460
- key: str | None = None,
461
- revises: Artifact | None = None,
462
- run: Run | None = None,
463
- ) -> Artifact:
464
- """Save the validated DataFrame and metadata.
465
-
466
- Args:
467
- description: Description of the DataFrame object.
468
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
469
- Artifacts with the same key form a revision family.
470
- revises: Previous version of the artifact. Triggers a revision.
471
- run: The run that creates the artifact.
472
-
473
- Returns:
474
- A saved artifact record.
825
+ **kwargs: Additional keyword arguments to pass to create new records
475
826
  """
476
- from lamindb.core._settings import settings
477
-
478
- if not self._validated:
479
- self.validate()
480
- if not self._validated:
481
- raise ValidationError("Dataset does not validate. Please curate.")
482
-
483
- # Make sure all labels are saved in the current instance
484
- verbosity = settings.verbosity
485
- try:
486
- settings.verbosity = "warning"
487
- self._artifact = save_artifact(
488
- self._df,
489
- description=description,
490
- fields=self.fields,
491
- columns_field=self._columns_field,
492
- key=key,
493
- revises=revises,
494
- run=run,
495
- **self._kwargs,
496
- )
497
- finally:
498
- settings.verbosity = verbosity
499
-
500
- return self._artifact
827
+ if len(kwargs) > 0 and key == "all":
828
+ raise ValueError("Cannot pass additional arguments to 'all' key!")
829
+ self._update_registry(key, validated_only=False, **kwargs)
501
830
 
502
831
  def clean_up_failed_runs(self):
503
832
  """Clean up previous failed runs that don't save any outputs."""
@@ -509,21 +838,14 @@ class DataFrameCurator(BaseCurator):
509
838
  ).delete()
510
839
 
511
840
 
512
- class AnnDataCurator(DataFrameCurator):
513
- """Curation flow for ``AnnData``.
514
-
515
- See also :class:`~lamindb.Curator`.
516
-
517
- Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curator.from_anndata`.
518
-
519
- See :doc:`docs:cellxgene-curate` for instructions on how to curate against a specific cellxgene schema version.
841
+ class AnnDataCatManager(CatManager):
842
+ """Manage categorical curation.
520
843
 
521
844
  Args:
522
845
  data: The AnnData object or an AnnData-like path.
523
846
  var_index: The registry field for mapping the ``.var`` index.
524
847
  categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
525
848
  obs_columns: The registry field for mapping the ``.obs.columns``.
526
- using_key: A reference LaminDB instance.
527
849
  verbosity: The verbosity level.
528
850
  organism: The organism name.
529
851
  sources: A dictionary mapping ``.obs.columns`` to Source records.
@@ -538,7 +860,7 @@ class AnnDataCurator(DataFrameCurator):
538
860
  ... var_index=bt.Gene.ensembl_gene_id,
539
861
  ... categoricals={
540
862
  ... "cell_type_ontology_id": bt.CellType.ontology_id,
541
- ... "donor_id": ln.ULabel.name
863
+ ... "donor_id": ULabel.name
542
864
  ... },
543
865
  ... organism="human",
544
866
  ... )
@@ -546,56 +868,48 @@ class AnnDataCurator(DataFrameCurator):
546
868
 
547
869
  def __init__(
548
870
  self,
549
- data: ad.AnnData | UPathStr,
871
+ data: ad.AnnData | Artifact,
550
872
  var_index: FieldAttr,
551
873
  categoricals: dict[str, FieldAttr] | None = None,
552
874
  obs_columns: FieldAttr = Feature.name,
553
- using_key: str | None = None,
554
875
  verbosity: str = "hint",
555
876
  organism: str | None = None,
556
877
  sources: dict[str, Record] | None = None,
557
878
  exclude: dict | None = None,
558
879
  ) -> None:
559
- from lamindb_setup.core import upath
560
-
561
880
  if isinstance(var_index, str):
562
881
  raise TypeError("var_index parameter has to be a bionty field")
563
882
 
564
- from .._artifact import data_is_anndata
565
-
566
883
  if sources is None:
567
884
  sources = {}
568
885
  if not data_is_anndata(data):
569
- raise TypeError(
570
- "data has to be an AnnData object or a path to AnnData-like"
571
- )
572
- if isinstance(data, ad.AnnData):
573
- self._adata = data
574
- else: # pragma: no cover
575
- from lamindb.core.storage._backed_access import backed_access
576
-
577
- self._adata = backed_access(upath.create_path(data))
886
+ raise TypeError("data has to be an AnnData object")
578
887
 
579
888
  if "symbol" in str(var_index):
580
889
  logger.warning(
581
890
  "indexing datasets with gene symbols can be problematic: https://docs.lamin.ai/faq/symbol-mapping"
582
891
  )
583
892
 
584
- self._data = data
893
+ self._obs_fields = categoricals or {}
585
894
  self._var_field = var_index
586
895
  super().__init__(
587
- df=self._adata.obs,
896
+ dataset=data,
588
897
  categoricals=categoricals,
898
+ sources=sources,
899
+ organism=organism,
900
+ exclude=exclude,
901
+ columns_field=var_index,
902
+ )
903
+ self._adata = self._dataset
904
+ self._obs_df_curator = DataFrameCatManager(
905
+ df=self._adata.obs,
906
+ categoricals=self.categoricals,
589
907
  columns=obs_columns,
590
- using_key=using_key,
591
908
  verbosity=verbosity,
592
- organism=organism,
909
+ organism=None,
593
910
  sources=sources,
594
911
  exclude=exclude,
595
- check_valid_keys=False,
596
912
  )
597
- self._obs_fields = categoricals or {}
598
- self._check_valid_keys(extra={"var_index"})
599
913
 
600
914
  @property
601
915
  def var_index(self) -> FieldAttr:
@@ -607,54 +921,53 @@ class AnnDataCurator(DataFrameCurator):
607
921
  """Return the obs fields to validate against."""
608
922
  return self._obs_fields
609
923
 
610
- def lookup(
611
- self, using_key: str | None = None, public: bool = False
612
- ) -> CurateLookup:
924
+ def lookup(self, public: bool = False) -> CurateLookup:
613
925
  """Lookup categories.
614
926
 
615
927
  Args:
616
- using_key: The instance where the lookup is performed.
617
- if "public", the lookup is performed on the public reference.
928
+ public: If "public", the lookup is performed on the public reference.
618
929
  """
619
930
  return CurateLookup(
620
931
  categoricals=self._obs_fields,
621
932
  slots={"columns": self._columns_field, "var_index": self._var_field},
622
- using_key=using_key or self._using_key,
623
933
  public=public,
624
934
  )
625
935
 
626
936
  def _save_from_var_index(
627
- self, validated_only: bool = True, organism: str | None = None
937
+ self,
938
+ validated_only: bool = True,
628
939
  ):
629
940
  """Save variable records."""
630
941
  update_registry(
631
942
  values=list(self._adata.var.index),
632
943
  field=self.var_index,
633
944
  key="var_index",
634
- using_key=self._using_key,
635
945
  validated_only=validated_only,
636
- organism=organism,
946
+ organism=self._organism,
637
947
  source=self._sources.get("var_index"),
638
948
  exclude=self._exclude.get("var_index"),
639
949
  )
640
950
 
641
- def _update_registry_all(self, validated_only: bool = True, **kwargs):
642
- """Save labels for all features."""
643
- self._save_from_var_index(validated_only=validated_only, **self._kwargs)
644
- for name in self._obs_fields.keys():
645
- self._update_registry(name, validated_only=validated_only, **self._kwargs)
951
+ def add_new_from(self, key: str, **kwargs):
952
+ """Add validated & new categories.
646
953
 
647
- def add_new_from_var_index(self, organism: str | None = None, **kwargs):
954
+ Args:
955
+ key: The key referencing the slot in the DataFrame from which to draw terms.
956
+ organism: The organism name.
957
+ **kwargs: Additional keyword arguments to pass to create new records
958
+ """
959
+ self._obs_df_curator.add_new_from(key, **kwargs)
960
+
961
+ def add_new_from_var_index(self, **kwargs):
648
962
  """Update variable records.
649
963
 
650
964
  Args:
651
965
  organism: The organism name.
652
966
  **kwargs: Additional keyword arguments to pass to create new records.
653
967
  """
654
- self._kwargs.update({"organism": organism} if organism else {})
655
- self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
968
+ self._save_from_var_index(validated_only=False, **kwargs)
656
969
 
657
- def validate(self, organism: str | None = None) -> bool:
970
+ def validate(self) -> bool:
658
971
  """Validate categories.
659
972
 
660
973
  This method also registers the validated records in the current instance.
@@ -665,38 +978,25 @@ class AnnDataCurator(DataFrameCurator):
665
978
  Returns:
666
979
  Whether the AnnData object is validated.
667
980
  """
668
- self._kwargs.update({"organism": organism} if organism else {})
669
- if self._using_key is not None and self._using_key != "default":
670
- logger.important(
671
- f"validating metadata using registries of instance {colors.italic(self._using_key)}"
672
- )
981
+ self._validate_category_error_messages = "" # reset the error messages
673
982
 
674
983
  # add all validated records to the current instance
675
- self._update_registry_all()
676
-
984
+ self._save_from_var_index(validated_only=True)
677
985
  validated_var, non_validated_var = validate_categories(
678
986
  self._adata.var.index,
679
987
  field=self._var_field,
680
988
  key="var_index",
681
- using_key=self._using_key,
682
989
  source=self._sources.get("var_index"),
683
990
  hint_print=".add_new_from_var_index()",
684
991
  exclude=self._exclude.get("var_index"),
685
- **self._kwargs, # type: ignore
686
- )
687
- validated_obs, non_validated_obs = validate_categories_in_df(
688
- self._adata.obs,
689
- fields=self.categoricals,
690
- using_key=self._using_key,
691
- sources=self._sources,
692
- exclude=self._exclude,
693
- **self._kwargs,
992
+ organism=self._organism, # type: ignore
694
993
  )
695
- self._non_validated = non_validated_obs # type: ignore
994
+ validated_obs = self._obs_df_curator.validate()
995
+ self._non_validated = self._obs_df_curator._non_validated # type: ignore
696
996
  if len(non_validated_var) > 0:
697
997
  self._non_validated["var_index"] = non_validated_var # type: ignore
698
- self._validated = validated_var and validated_obs
699
- return self._validated
998
+ self._is_validated = validated_var and validated_obs
999
+ return self._is_validated
700
1000
 
701
1001
  def standardize(self, key: str):
702
1002
  """Replace synonyms with standardized values.
@@ -709,68 +1009,26 @@ class AnnDataCurator(DataFrameCurator):
709
1009
 
710
1010
  Inplace modification of the dataset.
711
1011
  """
1012
+ if self._artifact is not None:
1013
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
712
1014
  if key in self._adata.obs.columns or key == "all":
713
1015
  # standardize obs columns
714
- super().standardize(key)
1016
+ self._obs_df_curator.standardize(key)
715
1017
  # in addition to the obs columns, standardize the var.index
716
1018
  if key == "var_index" or key == "all":
717
1019
  syn_mapper = standardize_categories(
718
1020
  self._adata.var.index,
719
1021
  field=self.var_index,
720
- using_key=self._using_key,
721
1022
  source=self._sources.get("var_index"),
722
- **self._kwargs,
1023
+ organism=self._organism,
723
1024
  )
724
1025
  if "var_index" in self._non_validated: # type: ignore
725
1026
  self._adata.var.index = self._replace_synonyms(
726
1027
  "var_index", syn_mapper, self._adata.var.index
727
1028
  )
728
1029
 
729
- def save_artifact(
730
- self,
731
- description: str | None = None,
732
- key: str | None = None,
733
- revises: Artifact | None = None,
734
- run: Run | None = None,
735
- ) -> Artifact:
736
- """Save the validated ``AnnData`` and metadata.
737
-
738
- Args:
739
- description: A description of the ``AnnData`` object.
740
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
741
- Artifacts with the same key form a revision family.
742
- revises: Previous version of the artifact. Triggers a revision.
743
- run: The run that creates the artifact.
744
-
745
- Returns:
746
- A saved artifact record.
747
- """
748
- from lamindb.core._settings import settings
749
-
750
- if not self._validated:
751
- self.validate()
752
- if not self._validated:
753
- raise ValidationError("Dataset does not validate. Please curate.")
754
- verbosity = settings.verbosity
755
- try:
756
- settings.verbosity = "warning"
757
- self._artifact = save_artifact(
758
- self._data,
759
- adata=self._adata,
760
- description=description,
761
- columns_field=self.var_index,
762
- fields=self.categoricals,
763
- key=key,
764
- revises=revises,
765
- run=run,
766
- **self._kwargs,
767
- )
768
- finally:
769
- settings.verbosity = verbosity
770
- return self._artifact
771
-
772
1030
 
773
- class MuDataCurator:
1031
+ class MuDataCatManager(CatManager):
774
1032
  """Curation flow for a ``MuData`` object.
775
1033
 
776
1034
  See also :class:`~lamindb.Curator`.
@@ -782,10 +1040,9 @@ class MuDataCurator:
782
1040
  mdata: The MuData object to curate.
783
1041
  var_index: The registry field for mapping the ``.var`` index for each modality.
784
1042
  For example:
785
- ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": ln.CellMarker.name}``
1043
+ ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": CellMarker.name}``
786
1044
  categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
787
1045
  Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
788
- using_key: A reference LaminDB instance.
789
1046
  verbosity: The verbosity level.
790
1047
  organism: The organism name.
791
1048
  sources: A dictionary mapping ``.obs.columns`` to Source records.
@@ -799,11 +1056,11 @@ class MuDataCurator:
799
1056
  ... mdata,
800
1057
  ... var_index={
801
1058
  ... "rna": bt.Gene.ensembl_gene_id,
802
- ... "adt": ln.CellMarker.name
1059
+ ... "adt": CellMarker.name
803
1060
  ... },
804
1061
  ... categoricals={
805
1062
  ... "cell_type_ontology_id": bt.CellType.ontology_id,
806
- ... "donor_id": ln.ULabel.name
1063
+ ... "donor_id": ULabel.name
807
1064
  ... },
808
1065
  ... organism="human",
809
1066
  ... )
@@ -811,52 +1068,47 @@ class MuDataCurator:
811
1068
 
812
1069
  def __init__(
813
1070
  self,
814
- mdata: MuData,
1071
+ mdata: MuData | Artifact,
815
1072
  var_index: dict[str, FieldAttr],
816
1073
  categoricals: dict[str, FieldAttr] | None = None,
817
- using_key: str | None = None,
818
1074
  verbosity: str = "hint",
819
1075
  organism: str | None = None,
820
1076
  sources: dict[str, Record] | None = None,
821
1077
  exclude: dict | None = None, # {modality: {field: [values]}}
822
1078
  ) -> None:
823
- if sources is None:
824
- sources = {}
825
- self._sources = sources
826
- if exclude is None:
827
- exclude = {}
828
- self._exclude = exclude
829
- self._mdata = mdata
830
- self._kwargs = {"organism": organism} if organism else {}
1079
+ super().__init__(
1080
+ dataset=mdata,
1081
+ categoricals={},
1082
+ sources=sources,
1083
+ organism=organism,
1084
+ exclude=exclude,
1085
+ )
1086
+ self._columns_field = var_index # this is for consistency with BaseCatManager
831
1087
  self._var_fields = var_index
832
1088
  self._verify_modality(self._var_fields.keys())
833
1089
  self._obs_fields = self._parse_categoricals(categoricals)
834
1090
  self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
835
- self._using_key = using_key
836
1091
  self._verbosity = verbosity
837
1092
  self._obs_df_curator = None
838
1093
  if "obs" in self._modalities:
839
- self._obs_df_curator = DataFrameCurator(
840
- df=mdata.obs,
1094
+ self._obs_df_curator = DataFrameCatManager(
1095
+ df=self._dataset.obs,
841
1096
  columns=Feature.name,
842
1097
  categoricals=self._obs_fields.get("obs", {}),
843
- using_key=using_key,
844
1098
  verbosity=verbosity,
845
1099
  sources=self._sources.get("obs"),
846
1100
  exclude=self._exclude.get("obs"),
847
- check_valid_keys=False,
848
- **self._kwargs,
1101
+ organism=organism,
849
1102
  )
850
1103
  self._mod_adata_curators = {
851
- modality: AnnDataCurator(
852
- data=mdata[modality],
1104
+ modality: AnnDataCatManager(
1105
+ data=self._dataset[modality],
853
1106
  var_index=var_index.get(modality),
854
1107
  categoricals=self._obs_fields.get(modality),
855
- using_key=using_key,
856
1108
  verbosity=verbosity,
857
1109
  sources=self._sources.get(modality),
858
1110
  exclude=self._exclude.get(modality),
859
- **self._kwargs,
1111
+ organism=organism,
860
1112
  )
861
1113
  for modality in self._modalities
862
1114
  if modality != "obs"
@@ -874,7 +1126,7 @@ class MuDataCurator:
874
1126
  return self._obs_fields
875
1127
 
876
1128
  @property
877
- def non_validated(self) -> dict[str, dict[str, list[str]]]:
1129
+ def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
878
1130
  """Return the non-validated features and labels."""
879
1131
  if self._non_validated is None:
880
1132
  raise ValidationError("Please run validate() first!")
@@ -883,15 +1135,15 @@ class MuDataCurator:
883
1135
  def _verify_modality(self, modalities: Iterable[str]):
884
1136
  """Verify the modality exists."""
885
1137
  for modality in modalities:
886
- if modality not in self._mdata.mod.keys():
1138
+ if modality not in self._dataset.mod.keys():
887
1139
  raise ValidationError(f"modality '{modality}' does not exist!")
888
1140
 
889
1141
  def _parse_categoricals(self, categoricals: dict[str, FieldAttr]) -> dict:
890
1142
  """Parse the categorical fields."""
891
- prefixes = {f"{k}:" for k in self._mdata.mod.keys()}
1143
+ prefixes = {f"{k}:" for k in self._dataset.mod.keys()}
892
1144
  obs_fields: dict[str, dict[str, FieldAttr]] = {}
893
1145
  for k, v in categoricals.items():
894
- if k not in self._mdata.obs.columns:
1146
+ if k not in self._dataset.obs.columns:
895
1147
  raise ValidationError(f"column '{k}' does not exist in mdata.obs!")
896
1148
  if any(k.startswith(prefix) for prefix in prefixes):
897
1149
  modality, col = k.split(":")[0], k.split(":")[1]
@@ -904,14 +1156,11 @@ class MuDataCurator:
904
1156
  obs_fields["obs"][k] = v
905
1157
  return obs_fields
906
1158
 
907
- def lookup(
908
- self, using_key: str | None = None, public: bool = False
909
- ) -> CurateLookup:
1159
+ def lookup(self, public: bool = False) -> CurateLookup:
910
1160
  """Lookup categories.
911
1161
 
912
1162
  Args:
913
- using_key: The instance where the lookup is performed.
914
- if "public", the lookup is performed on the public reference.
1163
+ public: Perform lookup on public source ontologies.
915
1164
  """
916
1165
  obs_fields = {}
917
1166
  for mod, fields in self._obs_fields.items():
@@ -925,27 +1174,19 @@ class MuDataCurator:
925
1174
  slots={
926
1175
  **{f"{k}_var_index": v for k, v in self._var_fields.items()},
927
1176
  },
928
- using_key=using_key or self._using_key,
929
1177
  public=public,
930
1178
  )
931
1179
 
1180
+ @deprecated(new_name="is run by default")
932
1181
  def add_new_from_columns(
933
1182
  self,
934
1183
  modality: str,
935
1184
  column_names: list[str] | None = None,
936
- organism: str | None = None,
937
1185
  **kwargs,
938
1186
  ):
939
- """Update columns records."""
940
- warnings.warn(
941
- "`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
942
- DeprecationWarning,
943
- stacklevel=2,
944
- )
1187
+ pass
945
1188
 
946
- def add_new_from_var_index(
947
- self, modality: str, organism: str | None = None, **kwargs
948
- ):
1189
+ def add_new_from_var_index(self, modality: str, **kwargs):
949
1190
  """Update variable records.
950
1191
 
951
1192
  Args:
@@ -953,25 +1194,19 @@ class MuDataCurator:
953
1194
  organism: The organism name.
954
1195
  **kwargs: Additional keyword arguments to pass to create new records.
955
1196
  """
956
- self._kwargs.update({"organism": organism} if organism else {})
957
- self._mod_adata_curators[modality].add_new_from_var_index(
958
- **self._kwargs, **kwargs
959
- )
1197
+ self._mod_adata_curators[modality].add_new_from_var_index(**kwargs)
960
1198
 
961
1199
  def _update_registry_all(self):
962
1200
  """Update all registries."""
963
1201
  if self._obs_df_curator is not None:
964
- self._obs_df_curator._update_registry_all(
965
- validated_only=True, **self._kwargs
966
- )
1202
+ self._obs_df_curator._update_registry_all(validated_only=True)
967
1203
  for _, adata_curator in self._mod_adata_curators.items():
968
- adata_curator._update_registry_all(validated_only=True, **self._kwargs)
1204
+ adata_curator._obs_df_curator._update_registry_all(validated_only=True)
969
1205
 
970
1206
  def add_new_from(
971
1207
  self,
972
1208
  key: str,
973
1209
  modality: str | None = None,
974
- organism: str | None = None,
975
1210
  **kwargs,
976
1211
  ):
977
1212
  """Add validated & new categories.
@@ -984,24 +1219,17 @@ class MuDataCurator:
984
1219
  """
985
1220
  if len(kwargs) > 0 and key == "all":
986
1221
  raise ValueError("Cannot pass additional arguments to 'all' key!")
987
- self._kwargs.update({"organism": organism} if organism else {})
988
1222
  modality = modality or "obs"
989
1223
  if modality in self._mod_adata_curators:
990
1224
  adata_curator = self._mod_adata_curators[modality]
991
- adata_curator.add_new_from(key=key, **self._kwargs, **kwargs)
1225
+ adata_curator.add_new_from(key=key, **kwargs)
992
1226
  if modality == "obs":
993
- self._obs_df_curator.add_new_from(key=key, **self._kwargs, **kwargs)
1227
+ self._obs_df_curator.add_new_from(key=key, **kwargs)
994
1228
 
995
- def validate(self, organism: str | None = None) -> bool:
1229
+ def validate(self) -> bool:
996
1230
  """Validate categories."""
997
1231
  from lamindb.core._settings import settings
998
1232
 
999
- self._kwargs.update({"organism": organism} if organism else {})
1000
- if self._using_key is not None and self._using_key != "default":
1001
- logger.important(
1002
- f"validating using registries of instance {colors.italic(self._using_key)}"
1003
- )
1004
-
1005
1233
  # add all validated records to the current instance
1006
1234
  verbosity = settings.verbosity
1007
1235
  try:
@@ -1015,20 +1243,20 @@ class MuDataCurator:
1015
1243
  obs_validated = True
1016
1244
  if "obs" in self._modalities:
1017
1245
  logger.info('validating categoricals in "obs"...')
1018
- obs_validated &= self._obs_df_curator.validate(**self._kwargs)
1246
+ obs_validated &= self._obs_df_curator.validate()
1019
1247
  self._non_validated["obs"] = self._obs_df_curator.non_validated # type: ignore
1020
1248
  logger.print("")
1021
1249
 
1022
1250
  mods_validated = True
1023
1251
  for modality, adata_curator in self._mod_adata_curators.items():
1024
1252
  logger.info(f'validating categoricals in modality "{modality}"...')
1025
- mods_validated &= adata_curator.validate(**self._kwargs)
1253
+ mods_validated &= adata_curator.validate()
1026
1254
  if len(adata_curator.non_validated) > 0:
1027
1255
  self._non_validated[modality] = adata_curator.non_validated # type: ignore
1028
1256
  logger.print("")
1029
1257
 
1030
- self._validated = obs_validated & mods_validated
1031
- return self._validated
1258
+ self._is_validated = obs_validated & mods_validated
1259
+ return self._is_validated
1032
1260
 
1033
1261
  def standardize(self, key: str, modality: str | None = None):
1034
1262
  """Replace synonyms with standardized values.
@@ -1039,6 +1267,8 @@ class MuDataCurator:
1039
1267
 
1040
1268
  Inplace modification of the dataset.
1041
1269
  """
1270
+ if self._artifact is not None:
1271
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
1042
1272
  modality = modality or "obs"
1043
1273
  if modality in self._mod_adata_curators:
1044
1274
  adata_curator = self._mod_adata_curators[modality]
@@ -1046,47 +1276,6 @@ class MuDataCurator:
1046
1276
  if modality == "obs":
1047
1277
  self._obs_df_curator.standardize(key=key)
1048
1278
 
1049
- def save_artifact(
1050
- self,
1051
- description: str | None = None,
1052
- key: str | None = None,
1053
- revises: Artifact | None = None,
1054
- run: Run | None = None,
1055
- ) -> Artifact:
1056
- """Save the validated ``MuData`` and metadata.
1057
-
1058
- Args:
1059
- description: A description of the ``MuData`` object.
1060
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
1061
- revises: Previous version of the artifact. Triggers a revision.
1062
- run: The run that creates the artifact.
1063
-
1064
- Returns:
1065
- A saved artifact record.
1066
- """
1067
- from lamindb.core._settings import settings
1068
-
1069
- if not self._validated:
1070
- self.validate()
1071
- if not self._validated:
1072
- raise ValidationError("Dataset does not validate. Please curate.")
1073
- verbosity = settings.verbosity
1074
- try:
1075
- settings.verbosity = "warning"
1076
- self._artifact = save_artifact(
1077
- self._mdata,
1078
- description=description,
1079
- columns_field=self.var_index,
1080
- fields=self.categoricals,
1081
- key=key,
1082
- revises=revises,
1083
- run=run,
1084
- **self._kwargs,
1085
- )
1086
- finally:
1087
- settings.verbosity = verbosity
1088
- return self._artifact
1089
-
1090
1279
 
1091
1280
  def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
1092
1281
  if (n := len(nonval_keys)) > 0:
@@ -1097,8 +1286,8 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
1097
1286
  )
1098
1287
 
1099
1288
 
1100
- class SOMACurator(BaseCurator):
1101
- """Curation flow for ``tiledbsoma``.
1289
+ class TiledbsomaCatManager(CatManager):
1290
+ """Curation flow for `tiledbsoma.Experiment`.
1102
1291
 
1103
1292
  See also :class:`~lamindb.Curator`.
1104
1293
 
@@ -1123,7 +1312,7 @@ class SOMACurator(BaseCurator):
1123
1312
  ... var_index={"RNA": ("var_id", bt.Gene.symbol)},
1124
1313
  ... categoricals={
1125
1314
  ... "cell_type_ontology_id": bt.CellType.ontology_id,
1126
- ... "donor_id": ln.ULabel.name
1315
+ ... "donor_id": ULabel.name
1127
1316
  ... },
1128
1317
  ... organism="human",
1129
1318
  ... )
@@ -1138,23 +1327,21 @@ class SOMACurator(BaseCurator):
1138
1327
  organism: str | None = None,
1139
1328
  sources: dict[str, Record] | None = None,
1140
1329
  exclude: dict[str, str | list[str]] | None = None,
1141
- using_key: str | None = None,
1142
1330
  ):
1143
1331
  self._obs_fields = categoricals or {}
1144
1332
  self._var_fields = var_index
1145
1333
  self._columns_field = obs_columns
1146
1334
  if isinstance(experiment_uri, Artifact):
1147
- self._experiment_uri = experiment_uri.path
1335
+ self._dataset = experiment_uri.path
1148
1336
  self._artifact = experiment_uri
1149
1337
  else:
1150
- self._experiment_uri = UPath(experiment_uri)
1338
+ self._dataset = UPath(experiment_uri)
1151
1339
  self._artifact = None
1152
1340
  self._organism = organism
1153
- self._using_key = using_key
1154
1341
  self._sources = sources or {}
1155
1342
  self._exclude = exclude or {}
1156
1343
 
1157
- self._validated: bool | None = False
1344
+ self._is_validated: bool | None = False
1158
1345
  self._non_validated_values: dict[str, list] | None = None
1159
1346
  self._validated_values: dict[str, list] = {}
1160
1347
  # filled by _check_save_keys
@@ -1172,7 +1359,7 @@ class SOMACurator(BaseCurator):
1172
1359
  def _check_save_keys(self):
1173
1360
  from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1174
1361
 
1175
- with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
1362
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1176
1363
  experiment_obs = experiment.obs
1177
1364
  self._n_obs = len(experiment_obs)
1178
1365
  self._obs_pa_schema = experiment_obs.schema
@@ -1228,7 +1415,6 @@ class SOMACurator(BaseCurator):
1228
1415
  values=register_columns,
1229
1416
  field=self._columns_field,
1230
1417
  key="columns",
1231
- using_key=self._using_key,
1232
1418
  validated_only=False,
1233
1419
  organism=organism,
1234
1420
  source=self._sources.get("columns"),
@@ -1244,7 +1430,6 @@ class SOMACurator(BaseCurator):
1244
1430
  values=additional_columns,
1245
1431
  field=self._columns_field,
1246
1432
  key="columns",
1247
- using_key=self._using_key,
1248
1433
  validated_only=True,
1249
1434
  organism=organism,
1250
1435
  source=self._sources.get("columns"),
@@ -1257,7 +1442,7 @@ class SOMACurator(BaseCurator):
1257
1442
 
1258
1443
  validated = True
1259
1444
  self._non_validated_values = {}
1260
- with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
1445
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1261
1446
  for ms, (key, field) in self._var_fields.items():
1262
1447
  var_ms = experiment.ms[ms].var
1263
1448
  var_ms_key = f"{ms}__{key}"
@@ -1274,7 +1459,6 @@ class SOMACurator(BaseCurator):
1274
1459
  values=var_ms_values,
1275
1460
  field=field,
1276
1461
  key=var_ms_key,
1277
- using_key=self._using_key,
1278
1462
  validated_only=True,
1279
1463
  organism=organism,
1280
1464
  source=self._sources.get(var_ms_key),
@@ -1284,7 +1468,6 @@ class SOMACurator(BaseCurator):
1284
1468
  values=var_ms_values,
1285
1469
  field=field,
1286
1470
  key=var_ms_key,
1287
- using_key=self._using_key,
1288
1471
  organism=organism,
1289
1472
  source=self._sources.get(var_ms_key),
1290
1473
  exclude=self._exclude.get(var_ms_key),
@@ -1310,7 +1493,6 @@ class SOMACurator(BaseCurator):
1310
1493
  values=values,
1311
1494
  field=field,
1312
1495
  key=key,
1313
- using_key=self._using_key,
1314
1496
  validated_only=True,
1315
1497
  organism=organism,
1316
1498
  source=self._sources.get(key),
@@ -1320,7 +1502,6 @@ class SOMACurator(BaseCurator):
1320
1502
  values=values,
1321
1503
  field=field,
1322
1504
  key=key,
1323
- using_key=self._using_key,
1324
1505
  organism=organism,
1325
1506
  source=self._sources.get(key),
1326
1507
  exclude=self._exclude.get(key),
@@ -1330,8 +1511,8 @@ class SOMACurator(BaseCurator):
1330
1511
  self._non_validated_values[key] = non_val
1331
1512
  else:
1332
1513
  self._validated_values[key] = values
1333
- self._validated = validated
1334
- return self._validated
1514
+ self._is_validated = validated
1515
+ return self._is_validated
1335
1516
 
1336
1517
  def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
1337
1518
  assert self._non_validated_values is not None # noqa: S101
@@ -1346,7 +1527,7 @@ class SOMACurator(BaseCurator):
1346
1527
  values = self._non_validated_values.get(key, [])
1347
1528
  return values, field
1348
1529
 
1349
- def add_new_from(self, key: str) -> None:
1530
+ def add_new_from(self, key: str, **kwargs) -> None:
1350
1531
  """Add validated & new categories.
1351
1532
 
1352
1533
  Args:
@@ -1378,11 +1559,11 @@ class SOMACurator(BaseCurator):
1378
1559
  values=values,
1379
1560
  field=field,
1380
1561
  key=k,
1381
- using_key=self._using_key,
1382
1562
  validated_only=False,
1383
1563
  organism=organism,
1384
1564
  source=self._sources.get(k),
1385
1565
  exclude=self._exclude.get(k),
1566
+ **kwargs,
1386
1567
  )
1387
1568
  # update non-validated values list but keep the key there
1388
1569
  # it will be removed by .validate()
@@ -1405,19 +1586,15 @@ class SOMACurator(BaseCurator):
1405
1586
  """Return the obs fields to validate against."""
1406
1587
  return self._obs_fields
1407
1588
 
1408
- def lookup(
1409
- self, using_key: str | None = None, public: bool = False
1410
- ) -> CurateLookup:
1589
+ def lookup(self, public: bool = False) -> CurateLookup:
1411
1590
  """Lookup categories.
1412
1591
 
1413
1592
  Args:
1414
- using_key: The instance where the lookup is performed.
1415
- if "public", the lookup is performed on the public reference.
1593
+ public: If "public", the lookup is performed on the public reference.
1416
1594
  """
1417
1595
  return CurateLookup(
1418
1596
  categoricals=self._obs_fields,
1419
1597
  slots={"columns": self._columns_field, **self._var_fields_flat},
1420
- using_key=using_key or self._using_key,
1421
1598
  public=public,
1422
1599
  )
1423
1600
 
@@ -1462,7 +1639,6 @@ class SOMACurator(BaseCurator):
1462
1639
  syn_mapper = standardize_categories(
1463
1640
  values=values,
1464
1641
  field=field,
1465
- using_key=self._using_key,
1466
1642
  source=self._sources.get(k),
1467
1643
  organism=organism,
1468
1644
  )
@@ -1471,7 +1647,7 @@ class SOMACurator(BaseCurator):
1471
1647
 
1472
1648
  from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1473
1649
 
1474
- with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
1650
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1475
1651
  value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
1476
1652
  table = slot(experiment).read(value_filter=value_filter).concat()
1477
1653
 
@@ -1484,7 +1660,7 @@ class SOMACurator(BaseCurator):
1484
1660
  lambda val: syn_mapper.get(val, val) # noqa
1485
1661
  )
1486
1662
  # write the mapped values
1487
- with _open_tiledbsoma(self._experiment_uri, mode="w") as experiment:
1663
+ with _open_tiledbsoma(self._dataset, mode="w") as experiment:
1488
1664
  slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
1489
1665
  # update non_validated dict
1490
1666
  non_val_k = [
@@ -1502,8 +1678,9 @@ class SOMACurator(BaseCurator):
1502
1678
 
1503
1679
  def save_artifact(
1504
1680
  self,
1505
- description: str | None = None,
1681
+ *,
1506
1682
  key: str | None = None,
1683
+ description: str | None = None,
1507
1684
  revises: Artifact | None = None,
1508
1685
  run: Run | None = None,
1509
1686
  ) -> Artifact:
@@ -1512,7 +1689,7 @@ class SOMACurator(BaseCurator):
1512
1689
  Args:
1513
1690
  description: A description of the ``tiledbsoma`` store.
1514
1691
  key: A path-like key to reference artifact in default storage,
1515
- e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a revision family.
1692
+ e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a version family.
1516
1693
  revises: Previous version of the artifact. Triggers a revision.
1517
1694
  run: The run that creates the artifact.
1518
1695
 
@@ -1521,14 +1698,14 @@ class SOMACurator(BaseCurator):
1521
1698
  """
1522
1699
  from lamindb.core._data import add_labels
1523
1700
 
1524
- if not self._validated:
1701
+ if not self._is_validated:
1525
1702
  self.validate()
1526
- if not self._validated:
1703
+ if not self._is_validated:
1527
1704
  raise ValidationError("Dataset does not validate. Please curate.")
1528
1705
 
1529
1706
  if self._artifact is None:
1530
1707
  artifact = Artifact(
1531
- self._experiment_uri,
1708
+ self._dataset,
1532
1709
  description=description,
1533
1710
  key=key,
1534
1711
  revises=revises,
@@ -1540,7 +1717,7 @@ class SOMACurator(BaseCurator):
1540
1717
  else:
1541
1718
  artifact = self._artifact
1542
1719
 
1543
- _schemas_m2m = {}
1720
+ feature_sets = {}
1544
1721
  if len(self._obs_fields) > 0:
1545
1722
  organism = check_registry_organism(
1546
1723
  self._columns_field.field.model, self._organism
@@ -1550,7 +1727,7 @@ class SOMACurator(BaseCurator):
1550
1727
  empty_dict, schema=self._obs_pa_schema
1551
1728
  ).to_pandas()
1552
1729
  # in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
1553
- _schemas_m2m["obs"] = Schema.from_df(
1730
+ feature_sets["obs"] = Schema.from_df(
1554
1731
  df=mock_df,
1555
1732
  field=self._columns_field,
1556
1733
  mute=True,
@@ -1561,238 +1738,1370 @@ class SOMACurator(BaseCurator):
1561
1738
  organism = check_registry_organism(
1562
1739
  var_field.field.model, self._organism
1563
1740
  ).get("organism")
1564
- _schemas_m2m[f"{ms}__var"] = Schema.from_values(
1741
+ feature_sets[f"{ms}__var"] = Schema.from_values(
1565
1742
  values=self._validated_values[f"{ms}__{var_key}"],
1566
1743
  field=var_field,
1567
1744
  organism=organism,
1568
1745
  raise_validation_error=False,
1569
1746
  )
1570
- artifact._staged__schemas_m2m = _schemas_m2m
1747
+ artifact._staged_feature_sets = feature_sets
1748
+
1749
+ feature_ref_is_name = _ref_is_name(self._columns_field)
1750
+ features = Feature.lookup().dict()
1751
+ for key, field in self._obs_fields.items():
1752
+ feature = features.get(key)
1753
+ registry = field.field.model
1754
+ organism = check_registry_organism(field.field.model, self._organism).get(
1755
+ "organism"
1756
+ )
1757
+ labels = registry.from_values(
1758
+ values=self._validated_values[key], field=field, organism=organism
1759
+ )
1760
+ if len(labels) == 0:
1761
+ continue
1762
+ if hasattr(registry, "_name_field"):
1763
+ label_ref_is_name = field.field.name == registry._name_field
1764
+ add_labels(
1765
+ artifact,
1766
+ records=labels,
1767
+ feature=feature,
1768
+ feature_ref_is_name=feature_ref_is_name,
1769
+ label_ref_is_name=label_ref_is_name,
1770
+ from_curator=True,
1771
+ )
1772
+
1773
+ return artifact.save()
1774
+
1775
+
1776
+ class SpatialDataCatManager(CatManager):
1777
+ """Curation flow for a ``Spatialdata`` object.
1778
+
1779
+ See also :class:`~lamindb.Curator`.
1780
+
1781
+ Note that if genes or other measurements are removed from the SpatialData object,
1782
+ the object should be recreated.
1783
+
1784
+ In the following docstring, an accessor refers to either a ``.table`` key or the ``sample_metadata_key``.
1785
+
1786
+ Args:
1787
+ sdata: The SpatialData object to curate.
1788
+ var_index: A dictionary mapping table keys to the ``.var`` indices.
1789
+ categoricals: A nested dictionary mapping an accessor to dictionaries that map columns to a registry field.
1790
+
1791
+ organism: The organism name.
1792
+ sources: A dictionary mapping an accessor to dictionaries that map columns to Source records.
1793
+ exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
1794
+ When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1795
+ using the exclude parameter ensures they are not validated.
1796
+ verbosity: The verbosity level of the logger.
1797
+ sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.
1798
+
1799
+ Examples:
1800
+ >>> import bionty as bt
1801
+ >>> curator = SpatialDataCatManager(
1802
+ ... sdata,
1803
+ ... var_index={
1804
+ ... "table_1": bt.Gene.ensembl_gene_id,
1805
+ ... },
1806
+ ... categoricals={
1807
+ ... "table1":
1808
+ ... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ULabel.name},
1809
+ ... "sample":
1810
+ ... {"experimental_factor": bt.ExperimentalFactor.name},
1811
+ ... },
1812
+ ... organism="human",
1813
+ ... )
1814
+ """
1815
+
1816
+ def __init__(
1817
+ self,
1818
+ sdata: Any,
1819
+ var_index: dict[str, FieldAttr],
1820
+ categoricals: dict[str, dict[str, FieldAttr]] | None = None,
1821
+ verbosity: str = "hint",
1822
+ organism: str | None = None,
1823
+ sources: dict[str, dict[str, Record]] | None = None,
1824
+ exclude: dict[str, dict] | None = None,
1825
+ *,
1826
+ sample_metadata_key: str | None = "sample",
1827
+ ) -> None:
1828
+ super().__init__(
1829
+ dataset=sdata,
1830
+ categoricals={},
1831
+ sources=sources,
1832
+ organism=organism,
1833
+ exclude=exclude,
1834
+ )
1835
+ if isinstance(sdata, Artifact):
1836
+ # TODO: load() doesn't yet work
1837
+ self._sdata = sdata.load()
1838
+ else:
1839
+ self._sdata = self._dataset
1840
+ self._sample_metadata_key = sample_metadata_key
1841
+ self._var_fields = var_index
1842
+ self._verify_accessor_exists(self._var_fields.keys())
1843
+ self._categoricals = categoricals
1844
+ self._table_keys = set(self._var_fields.keys()) | set(
1845
+ self._categoricals.keys() - {self._sample_metadata_key}
1846
+ )
1847
+ self._verbosity = verbosity
1848
+ self._sample_df_curator = None
1849
+ if self._sample_metadata_key is not None:
1850
+ self._sample_metadata = self._sdata.get_attrs(
1851
+ key=self._sample_metadata_key, return_as="df", flatten=True
1852
+ )
1853
+ self._is_validated = False
1854
+
1855
+ # Check validity of keys in categoricals
1856
+ nonval_keys = []
1857
+ for accessor, accessor_categoricals in self._categoricals.items():
1858
+ if (
1859
+ accessor == self._sample_metadata_key
1860
+ and self._sample_metadata is not None
1861
+ ):
1862
+ for key in accessor_categoricals.keys():
1863
+ if key not in self._sample_metadata.columns:
1864
+ nonval_keys.append(key)
1865
+ else:
1866
+ for key in accessor_categoricals.keys():
1867
+ if key not in self._sdata[accessor].obs.columns:
1868
+ nonval_keys.append(key)
1869
+
1870
+ _maybe_curation_keys_not_present(nonval_keys, "categoricals")
1871
+
1872
+ # check validity of keys in sources and exclude
1873
+ for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
1874
+ nonval_keys = []
1875
+ for accessor, accessor_sources in dct.items():
1876
+ if (
1877
+ accessor == self._sample_metadata_key
1878
+ and self._sample_metadata is not None
1879
+ ):
1880
+ columns = self._sample_metadata.columns
1881
+ elif accessor != self._sample_metadata_key:
1882
+ columns = self._sdata[accessor].obs.columns
1883
+ else:
1884
+ continue
1885
+ for key in accessor_sources:
1886
+ if key not in columns:
1887
+ nonval_keys.append(key)
1888
+ _maybe_curation_keys_not_present(nonval_keys, name)
1889
+
1890
+ # Set up sample level metadata and table Curator objects
1891
+ if (
1892
+ self._sample_metadata_key is not None
1893
+ and self._sample_metadata_key in self._categoricals
1894
+ ):
1895
+ self._sample_df_curator = DataFrameCatManager(
1896
+ df=self._sample_metadata,
1897
+ columns=Feature.name,
1898
+ categoricals=self._categoricals.get(self._sample_metadata_key, {}),
1899
+ verbosity=verbosity,
1900
+ sources=self._sources.get(self._sample_metadata_key),
1901
+ exclude=self._exclude.get(self._sample_metadata_key),
1902
+ organism=organism,
1903
+ )
1904
+ self._table_adata_curators = {
1905
+ table: AnnDataCatManager(
1906
+ data=self._sdata[table],
1907
+ var_index=var_index.get(table),
1908
+ categoricals=self._categoricals.get(table),
1909
+ verbosity=verbosity,
1910
+ sources=self._sources.get(table),
1911
+ exclude=self._exclude.get(table),
1912
+ organism=organism,
1913
+ )
1914
+ for table in self._table_keys
1915
+ }
1916
+
1917
+ self._non_validated = None
1918
+
1919
+ @property
1920
+ def var_index(self) -> FieldAttr:
1921
+ """Return the registry fields to validate variables indices against."""
1922
+ return self._var_fields
1923
+
1924
+ @property
1925
+ def categoricals(self) -> dict[str, dict[str, FieldAttr]]:
1926
+ """Return the categorical keys and fields to validate against."""
1927
+ return self._categoricals
1928
+
1929
+ @property
1930
+ def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
1931
+ """Return the non-validated features and labels."""
1932
+ if self._non_validated is None:
1933
+ raise ValidationError("Please run validate() first!")
1934
+ return self._non_validated
1935
+
1936
+ def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
1937
+ """Verify that the accessors exist (either a valid table or in attrs)."""
1938
+ for acc in accessors:
1939
+ is_present = False
1940
+ try:
1941
+ self._sdata.get_attrs(key=acc)
1942
+ is_present = True
1943
+ except KeyError:
1944
+ if acc in self._sdata.tables.keys():
1945
+ is_present = True
1946
+ if not is_present:
1947
+ raise ValidationError(f"Accessor '{acc}' does not exist!")
1948
+
1949
+ def lookup(self, public: bool = False) -> CurateLookup:
1950
+ """Look up categories.
1951
+
1952
+ Args:
1953
+ public: Whether the lookup is performed on the public reference.
1954
+ """
1955
+ cat_values_dict = list(self.categoricals.values())[0]
1956
+ return CurateLookup(
1957
+ categoricals=cat_values_dict,
1958
+ slots={"accessors": cat_values_dict.keys()},
1959
+ public=public,
1960
+ )
1961
+
1962
+ def _update_registry_all(self) -> None:
1963
+ """Saves labels of all features for sample and table metadata."""
1964
+ if self._sample_df_curator is not None:
1965
+ self._sample_df_curator._update_registry_all(
1966
+ validated_only=True,
1967
+ )
1968
+ for _, adata_curator in self._table_adata_curators.items():
1969
+ adata_curator._obs_df_curator._update_registry_all(
1970
+ validated_only=True,
1971
+ )
1972
+
1973
+ def add_new_from_var_index(self, table: str, **kwargs) -> None:
1974
+ """Save new values from ``.var.index`` of table.
1975
+
1976
+ Args:
1977
+ table: The table key.
1978
+ organism: The organism name.
1979
+ **kwargs: Additional keyword arguments to pass to create new records.
1980
+ """
1981
+ if self._non_validated is None:
1982
+ raise ValidationError("Run .validate() first.")
1983
+ self._table_adata_curators[table].add_new_from_var_index(**kwargs)
1984
+ if table in self.non_validated.keys():
1985
+ if "var_index" in self._non_validated[table]:
1986
+ self._non_validated[table].pop("var_index")
1987
+
1988
+ if len(self.non_validated[table].values()) == 0:
1989
+ self.non_validated.pop(table)
1990
+
1991
+ def add_new_from(
1992
+ self,
1993
+ key: str,
1994
+ accessor: str | None = None,
1995
+ **kwargs,
1996
+ ) -> None:
1997
+ """Save new values of categorical from sample level metadata or table.
1998
+
1999
+ Args:
2000
+ key: The key referencing the slot in the DataFrame.
2001
+ accessor: The accessor key such as 'sample' or 'table x'.
2002
+ organism: The organism name.
2003
+ **kwargs: Additional keyword arguments to pass to create new records.
2004
+ """
2005
+ if self._non_validated is None:
2006
+ raise ValidationError("Run .validate() first.")
2007
+
2008
+ if len(kwargs) > 0 and key == "all":
2009
+ raise ValueError("Cannot pass additional arguments to 'all' key!")
2010
+
2011
+ if accessor not in self.categoricals:
2012
+ raise ValueError(
2013
+ f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
2014
+ )
2015
+
2016
+ if accessor in self._table_adata_curators:
2017
+ adata_curator = self._table_adata_curators[accessor]
2018
+ adata_curator.add_new_from(key=key, **kwargs)
2019
+ if accessor == self._sample_metadata_key:
2020
+ self._sample_df_curator.add_new_from(key=key, **kwargs)
2021
+
2022
+ if accessor in self.non_validated.keys():
2023
+ if len(self.non_validated[accessor].values()) == 0:
2024
+ self.non_validated.pop(accessor)
2025
+
2026
+ def standardize(self, key: str, accessor: str | None = None) -> None:
2027
+ """Replace synonyms with canonical values.
2028
+
2029
+ Modifies the dataset inplace.
2030
+
2031
+ Args:
2032
+ key: The key referencing the slot in the table or sample metadata.
2033
+ accessor: The accessor key such as 'sample_key' or 'table_key'.
2034
+ """
2035
+ if len(self.non_validated) == 0:
2036
+ logger.warning("values are already standardized")
2037
+ return
2038
+ if self._artifact is not None:
2039
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
2040
+
2041
+ if accessor == self._sample_metadata_key:
2042
+ if key not in self._sample_metadata.columns:
2043
+ raise ValueError(f"key '{key}' not present in '{accessor}'!")
2044
+ else:
2045
+ if (
2046
+ key == "var_index" and self._sdata.tables[accessor].var.index is None
2047
+ ) or (
2048
+ key != "var_index"
2049
+ and key not in self._sdata.tables[accessor].obs.columns
2050
+ ):
2051
+ raise ValueError(f"key '{key}' not present in '{accessor}'!")
2052
+
2053
+ if accessor in self._table_adata_curators.keys():
2054
+ adata_curator = self._table_adata_curators[accessor]
2055
+ adata_curator.standardize(key)
2056
+ if accessor == self._sample_metadata_key:
2057
+ self._sample_df_curator.standardize(key)
2058
+
2059
+ if len(self.non_validated[accessor].values()) == 0:
2060
+ self.non_validated.pop(accessor)
2061
+
2062
+ def validate(self) -> bool:
2063
+ """Validate variables and categorical observations.
2064
+
2065
+ This method also registers the validated records in the current instance:
2066
+ - from public sources
2067
+
2068
+ Args:
2069
+ organism: The organism name.
2070
+
2071
+ Returns:
2072
+ Whether the SpatialData object is validated.
2073
+ """
2074
+ from lamindb.core._settings import settings
2075
+
2076
+ # add all validated records to the current instance
2077
+ verbosity = settings.verbosity
2078
+ try:
2079
+ settings.verbosity = "error"
2080
+ self._update_registry_all()
2081
+ finally:
2082
+ settings.verbosity = verbosity
2083
+
2084
+ self._non_validated = {} # type: ignore
2085
+
2086
+ sample_validated = True
2087
+ if self._sample_df_curator:
2088
+ logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
2089
+ sample_validated &= self._sample_df_curator.validate()
2090
+ if len(self._sample_df_curator.non_validated) > 0:
2091
+ self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
2092
+ logger.print("")
2093
+
2094
+ mods_validated = True
2095
+ for table, adata_curator in self._table_adata_curators.items():
2096
+ logger.info(f"validating categoricals of table '{table}' ...")
2097
+ mods_validated &= adata_curator.validate()
2098
+ if len(adata_curator.non_validated) > 0:
2099
+ self._non_validated[table] = adata_curator.non_validated # type: ignore
2100
+ logger.print("")
2101
+
2102
+ self._is_validated = sample_validated & mods_validated
2103
+ return self._is_validated
2104
+
2105
+ def save_artifact(
2106
+ self,
2107
+ *,
2108
+ key: str | None = None,
2109
+ description: str | None = None,
2110
+ revises: Artifact | None = None,
2111
+ run: Run | None = None,
2112
+ ) -> Artifact:
2113
+ if not self._is_validated:
2114
+ self.validate()
2115
+ if not self._is_validated:
2116
+ raise ValidationError("Dataset does not validate. Please curate.")
2117
+
2118
+ verbosity = settings.verbosity
2119
+ try:
2120
+ settings.verbosity = "warning"
2121
+
2122
+ if self._artifact is None:
2123
+ # Write the SpatialData object to a random path in tmp directory
2124
+ # The Artifact constructor will move it to the cache
2125
+ write_path = (
2126
+ f"{settings.cache_dir}/{random.randint(10**7, 10**8 - 1)}.zarr"
2127
+ )
2128
+ self._sdata.write(write_path)
2129
+
2130
+ # Create the Artifact and associate Artifact metadata
2131
+ self._artifact = Artifact(
2132
+ write_path,
2133
+ description=description,
2134
+ key=key,
2135
+ revises=revises,
2136
+ run=run,
2137
+ )
2138
+ # According to Tim it is not easy to calculate the number of observations.
2139
+ # We would have to write custom code to iterate over labels (which might not even exist at that point)
2140
+ self._artifact.otype = "spatialdata"
2141
+ self._artifact.save()
2142
+
2143
+ # Link schemas
2144
+ feature_kwargs = check_registry_organism(
2145
+ (list(self._var_fields.values())[0].field.model),
2146
+ self._organism,
2147
+ )
2148
+
2149
+ def _add_set_from_spatialdata(
2150
+ host: Artifact | Collection | Run,
2151
+ var_fields: dict[str, FieldAttr],
2152
+ obs_fields: dict[str, FieldAttr] = None,
2153
+ mute: bool = False,
2154
+ organism: str | Record | None = None,
2155
+ ):
2156
+ """Add Schemas from SpatialData."""
2157
+ if obs_fields is None:
2158
+ obs_fields = {}
2159
+ assert host.otype == "spatialdata" # noqa: S101
2160
+
2161
+ feature_sets = {}
2162
+
2163
+ # sample features
2164
+ sample_features = Feature.from_values(self._sample_metadata.columns) # type: ignore
2165
+ if len(sample_features) > 0:
2166
+ feature_sets[self._sample_metadata_key] = Schema(
2167
+ features=sample_features
2168
+ )
2169
+
2170
+ # table features
2171
+ for table, field in var_fields.items():
2172
+ table_fs = parse_staged_feature_sets_from_anndata(
2173
+ self._sdata[table],
2174
+ var_field=field,
2175
+ obs_field=obs_fields.get(table, Feature.name),
2176
+ mute=mute,
2177
+ organism=organism,
2178
+ )
2179
+ for k, v in table_fs.items():
2180
+ feature_sets[f"['{table}'].{k}"] = v
2181
+
2182
+ def _unify_staged_feature_sets_by_hash(
2183
+ feature_sets: MutableMapping[str, Schema],
2184
+ ):
2185
+ unique_values: dict[str, Any] = {}
2186
+
2187
+ for key, value in feature_sets.items():
2188
+ value_hash = (
2189
+ value.hash
2190
+ ) # Assuming each value has a .hash attribute
2191
+ if value_hash in unique_values:
2192
+ feature_sets[key] = unique_values[value_hash]
2193
+ else:
2194
+ unique_values[value_hash] = value
2195
+
2196
+ return feature_sets
2197
+
2198
+ # link feature sets
2199
+ host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
2200
+ feature_sets
2201
+ )
2202
+ host.save()
2203
+
2204
+ _add_set_from_spatialdata(
2205
+ self._artifact, var_fields=self._var_fields, **feature_kwargs
2206
+ )
2207
+
2208
+ # Link labels
2209
+ def _add_labels_from_spatialdata(
2210
+ data,
2211
+ artifact: Artifact,
2212
+ fields: dict[str, FieldAttr],
2213
+ feature_ref_is_name: bool | None = None,
2214
+ ):
2215
+ """Add Labels from SpatialData."""
2216
+ features = Feature.lookup().dict()
2217
+ for key, field in fields.items():
2218
+ feature = features.get(key)
2219
+ registry = field.field.model
2220
+ filter_kwargs = check_registry_organism(registry, self._organism)
2221
+ filter_kwargs_current = get_current_filter_kwargs(
2222
+ registry, filter_kwargs
2223
+ )
2224
+ df = data if isinstance(data, pd.DataFrame) else data.obs
2225
+ labels = registry.from_values(
2226
+ df[key],
2227
+ field=field,
2228
+ **filter_kwargs_current,
2229
+ )
2230
+ if len(labels) == 0:
2231
+ continue
2232
+
2233
+ label_ref_is_name = None
2234
+ if hasattr(registry, "_name_field"):
2235
+ label_ref_is_name = field.field.name == registry._name_field
2236
+ add_labels(
2237
+ artifact,
2238
+ records=labels,
2239
+ feature=feature,
2240
+ feature_ref_is_name=feature_ref_is_name,
2241
+ label_ref_is_name=label_ref_is_name,
2242
+ from_curator=True,
2243
+ )
2244
+
2245
+ for accessor, accessor_fields in self._categoricals.items():
2246
+ column_field = self._var_fields.get(accessor)
2247
+ if accessor == self._sample_metadata_key:
2248
+ _add_labels_from_spatialdata(
2249
+ self._sample_metadata,
2250
+ self._artifact,
2251
+ accessor_fields,
2252
+ feature_ref_is_name=(
2253
+ None if column_field is None else _ref_is_name(column_field)
2254
+ ),
2255
+ )
2256
+ else:
2257
+ _add_labels_from_spatialdata(
2258
+ self._sdata.tables[accessor],
2259
+ self._artifact,
2260
+ accessor_fields,
2261
+ feature_ref_is_name=(
2262
+ None if column_field is None else _ref_is_name(column_field)
2263
+ ),
2264
+ )
2265
+
2266
+ finally:
2267
+ settings.verbosity = verbosity
2268
+
2269
+ slug = ln_setup.settings.instance.slug
2270
+ if ln_setup.settings.instance.is_remote: # pragma: no cover
2271
+ logger.important(
2272
+ f"go to https://lamin.ai/{slug}/artifact/{self._artifact.uid}"
2273
+ )
2274
+
2275
+ return self._artifact
2276
+
2277
+
2278
+ def _restrict_obs_fields(
2279
+ obs: pd.DataFrame, obs_fields: dict[str, FieldAttr]
2280
+ ) -> dict[str, str]:
2281
+ """Restrict the obs fields to name return only available obs fields.
2282
+
2283
+ To simplify the curation, we only validate against either name or ontology_id.
2284
+ If both are available, we validate against ontology_id.
2285
+ If none are available, we validate against name.
2286
+ """
2287
+ obs_fields_unique = {k: v for k, v in obs_fields.items() if k in obs.columns}
2288
+ for name, field in obs_fields.items():
2289
+ if name.endswith("_ontology_term_id"):
2290
+ continue
2291
+ # if both the ontology id and the name are present, only validate on the ontology_id
2292
+ if name in obs.columns and f"{name}_ontology_term_id" in obs.columns:
2293
+ obs_fields_unique.pop(name)
2294
+ # if the neither name nor ontology id are present, validate on the name
2295
+ # this will raise error downstream, we just use name to be more readable
2296
+ if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
2297
+ obs_fields_unique[name] = field
2298
+
2299
+ # Only retain obs_fields_unique that have keys in adata.obs.columns
2300
+ available_obs_fields = {
2301
+ k: v for k, v in obs_fields_unique.items() if k in obs.columns
2302
+ }
2303
+
2304
+ return available_obs_fields
2305
+
2306
+
2307
+ def _add_defaults_to_obs(
2308
+ obs: pd.DataFrame,
2309
+ defaults: dict[str, str],
2310
+ ) -> None:
2311
+ """Add default columns and values to obs DataFrame."""
2312
+ added_defaults: dict = {}
2313
+ for name, default in defaults.items():
2314
+ if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
2315
+ obs[name] = default
2316
+ added_defaults[name] = default
2317
+ logger.important(
2318
+ f"added default value '{default}' to the adata.obs['{name}']"
2319
+ )
2320
+
2321
+
2322
+ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2323
+ """Annotation flow of AnnData based on CELLxGENE schema."""
2324
+
2325
+ _controls_were_created: bool | None = None
2326
+
2327
+ def __init__(
2328
+ self,
2329
+ adata: ad.AnnData | UPathStr,
2330
+ categoricals: dict[str, FieldAttr] | None = None,
2331
+ organism: Literal["human", "mouse"] = "human",
2332
+ *,
2333
+ defaults: dict[str, str] = None,
2334
+ extra_sources: dict[str, Record] = None,
2335
+ schema_version: Literal["4.0.0", "5.0.0", "5.1.0"] = "5.1.0",
2336
+ verbosity: str = "hint",
2337
+ ) -> None:
2338
+ """CELLxGENE schema curator.
2339
+
2340
+ Args:
2341
+ adata: Path to or AnnData object to curate against the CELLxGENE schema.
2342
+ categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
2343
+ The CELLxGENE Curator maps against the required CELLxGENE fields by default.
2344
+ organism: The organism name. CELLxGENE restricts it to 'human' and 'mouse'.
2345
+ defaults: Default values that are set if columns or column values are missing.
2346
+ extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
2347
+ These extra sources are joined with the CELLxGENE fixed sources.
2348
+ Use this parameter when subclassing.
2349
+ exclude: A dictionary mapping column names to values to exclude.
2350
+ schema_version: The CELLxGENE schema version to curate against.
2351
+ verbosity: The verbosity level.
2352
+
2353
+ """
2354
+ import bionty as bt
2355
+
2356
+ CellxGeneAnnDataCatManager._init_categoricals_additional_values()
2357
+
2358
+ var_index: FieldAttr = bt.Gene.ensembl_gene_id
2359
+
2360
+ if categoricals is None:
2361
+ categoricals = CellxGeneAnnDataCatManager._get_categoricals()
2362
+
2363
+ self.organism = organism
2364
+
2365
+ VALID_SCHEMA_VERSIONS = {"4.0.0", "5.0.0", "5.1.0"}
2366
+ if schema_version not in VALID_SCHEMA_VERSIONS:
2367
+ valid_versions = ", ".join(sorted(VALID_SCHEMA_VERSIONS))
2368
+ raise ValueError(
2369
+ f"Invalid schema_version: {schema_version}. "
2370
+ f"Valid versions are: {valid_versions}"
2371
+ )
2372
+ self.schema_version = schema_version
2373
+ self.schema_reference = f"https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/{schema_version}/schema.md"
2374
+ with resources.path(
2375
+ "lamindb.curators._cellxgene_schemas", "schema_versions.yml"
2376
+ ) as schema_versions_path:
2377
+ self._pinned_ontologies = _read_schema_versions(schema_versions_path)[
2378
+ self.schema_version
2379
+ ]
2380
+
2381
+ # Fetch AnnData obs to be able to set defaults and get sources
2382
+ if isinstance(adata, ad.AnnData):
2383
+ self._adata_obs = adata.obs
2384
+ else:
2385
+ self._adata_obs = backed_access(upath.create_path(adata)).obs # type: ignore
2386
+
2387
+ # Add defaults first to ensure that we fetch valid sources
2388
+ if defaults:
2389
+ _add_defaults_to_obs(self._adata_obs, defaults)
2390
+
2391
+ self.sources = self._create_sources(self._adata_obs)
2392
+ self.sources = {
2393
+ entity: source
2394
+ for entity, source in self.sources.items()
2395
+ if source is not None
2396
+ }
2397
+
2398
+ # These sources are not a part of the cellxgene schema but rather passed through.
2399
+ # This is useful when other Curators extend the CELLxGENE curator
2400
+ if extra_sources:
2401
+ self.sources = self.sources | extra_sources
2402
+
2403
+ # Exclude default values from validation because they are not available in the pinned sources
2404
+ exclude_keys = {
2405
+ entity: default
2406
+ for entity, default in CellxGeneAnnDataCatManager._get_categoricals_defaults().items()
2407
+ if entity in self._adata_obs.columns # type: ignore
2408
+ }
2409
+
2410
+ super().__init__(
2411
+ data=adata,
2412
+ var_index=var_index,
2413
+ categoricals=_restrict_obs_fields(self._adata_obs, categoricals),
2414
+ verbosity=verbosity,
2415
+ organism=organism,
2416
+ sources=self.sources,
2417
+ exclude=exclude_keys,
2418
+ )
2419
+
2420
+ @classmethod
2421
+ def _init_categoricals_additional_values(cls) -> None:
2422
+ import bionty as bt
2423
+
2424
+ import lamindb as ln
2425
+
2426
+ # Note: if you add another control below, be mindful to change the if condition that
2427
+ # triggers whether creating these records is re-considered
2428
+ if cls._controls_were_created is None:
2429
+ cls._controls_were_created = (
2430
+ ln.ULabel.filter(name="SuspensionType", is_type=True).one_or_none()
2431
+ is not None
2432
+ )
2433
+ if not cls._controls_were_created:
2434
+ logger.important("Creating control labels in the CellxGene schema.")
2435
+ bt.CellType(
2436
+ ontology_id="unknown",
2437
+ name="unknown",
2438
+ description="From CellxGene schema.",
2439
+ ).save()
2440
+ pato = bt.Source.filter(name="pato", version="2024-03-28").one()
2441
+ normal = bt.Phenotype.from_source(ontology_id="PATO:0000461", source=pato)
2442
+ bt.Disease(
2443
+ uid=normal.uid,
2444
+ name=normal.name,
2445
+ ontology_id=normal.ontology_id,
2446
+ description=normal.description,
2447
+ source=normal.source,
2448
+ ).save()
2449
+ bt.Ethnicity(
2450
+ ontology_id="na", name="na", description="From CellxGene schema."
2451
+ ).save()
2452
+ bt.Ethnicity(
2453
+ ontology_id="unknown",
2454
+ name="unknown",
2455
+ description="From CellxGene schema.",
2456
+ ).save()
2457
+ bt.DevelopmentalStage(
2458
+ ontology_id="unknown",
2459
+ name="unknown",
2460
+ description="From CellxGene schema.",
2461
+ ).save()
2462
+ bt.Phenotype(
2463
+ ontology_id="unknown",
2464
+ name="unknown",
2465
+ description="From CellxGene schema.",
2466
+ ).save()
2467
+
2468
+ tissue_type = ln.ULabel(
2469
+ name="TissueType",
2470
+ is_type=True,
2471
+ description='From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
2472
+ ).save()
2473
+ ln.ULabel(
2474
+ name="tissue", type=tissue_type, description="From CellxGene schema."
2475
+ ).save()
2476
+ ln.ULabel(
2477
+ name="organoid", type=tissue_type, description="From CellxGene schema."
2478
+ ).save()
2479
+ ln.ULabel(
2480
+ name="cell culture",
2481
+ type=tissue_type,
2482
+ description="From CellxGene schema.",
2483
+ ).save()
2484
+
2485
+ suspension_type = ln.ULabel(
2486
+ name="SuspensionType",
2487
+ is_type=True,
2488
+ description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
2489
+ ).save()
2490
+ ln.ULabel(
2491
+ name="cell", type=suspension_type, description="From CellxGene schema."
2492
+ ).save()
2493
+ ln.ULabel(
2494
+ name="nucleus",
2495
+ type=suspension_type,
2496
+ description="From CellxGene schema.",
2497
+ ).save()
2498
+ ln.ULabel(name="na", type=suspension_type).save()
2499
+
2500
+ @classmethod
2501
+ def _get_categoricals(cls) -> dict[str, FieldAttr]:
2502
+ import bionty as bt
2503
+
2504
+ return {
2505
+ "assay": bt.ExperimentalFactor.name,
2506
+ "assay_ontology_term_id": bt.ExperimentalFactor.ontology_id,
2507
+ "cell_type": bt.CellType.name,
2508
+ "cell_type_ontology_term_id": bt.CellType.ontology_id,
2509
+ "development_stage": bt.DevelopmentalStage.name,
2510
+ "development_stage_ontology_term_id": bt.DevelopmentalStage.ontology_id,
2511
+ "disease": bt.Disease.name,
2512
+ "disease_ontology_term_id": bt.Disease.ontology_id,
2513
+ # "donor_id": "str", via pandera
2514
+ "self_reported_ethnicity": bt.Ethnicity.name,
2515
+ "self_reported_ethnicity_ontology_term_id": bt.Ethnicity.ontology_id,
2516
+ "sex": bt.Phenotype.name,
2517
+ "sex_ontology_term_id": bt.Phenotype.ontology_id,
2518
+ "suspension_type": ULabel.name,
2519
+ "tissue": bt.Tissue.name,
2520
+ "tissue_ontology_term_id": bt.Tissue.ontology_id,
2521
+ "tissue_type": ULabel.name,
2522
+ "organism": bt.Organism.name,
2523
+ "organism_ontology_term_id": bt.Organism.ontology_id,
2524
+ }
2525
+
2526
+ @classmethod
2527
+ def _get_categoricals_defaults(cls) -> dict[str, str]:
2528
+ return {
2529
+ "cell_type": "unknown",
2530
+ "development_stage": "unknown",
2531
+ "disease": "normal",
2532
+ "donor_id": "unknown",
2533
+ "self_reported_ethnicity": "unknown",
2534
+ "sex": "unknown",
2535
+ "suspension_type": "cell",
2536
+ "tissue_type": "tissue",
2537
+ }
2538
+
2539
+ @property
2540
+ def pinned_ontologies(self) -> pd.DataFrame:
2541
+ return self._pinned_ontologies
2542
+
2543
+ @property
2544
+ def adata(self) -> AnnData:
2545
+ return self._adata
2546
+
2547
+ def _create_sources(self, obs: pd.DataFrame) -> dict[str, Record]:
2548
+ """Creates a sources dictionary that can be passed to AnnDataCatManager."""
2549
+ import bionty as bt
2550
+
2551
+ # fmt: off
2552
+ def _fetch_bionty_source(
2553
+ entity: str, organism: str, source: str
2554
+ ) -> bt.Source | None:
2555
+ """Fetch the Bionty source of the pinned ontology.
2556
+
2557
+ Returns None if the source does not exist.
2558
+ """
2559
+ version = self._pinned_ontologies.loc[(self._pinned_ontologies.index == entity) &
2560
+ (self._pinned_ontologies["organism"] == organism) &
2561
+ (self._pinned_ontologies["source"] == source), "version"].iloc[0]
2562
+ return bt.Source.filter(organism=organism, entity=f"bionty.{entity}", version=version).first()
2563
+
2564
+ entity_mapping = {
2565
+ "var_index": ("Gene", self.organism, "ensembl"),
2566
+ "cell_type": ("CellType", "all", "cl"),
2567
+ "assay": ("ExperimentalFactor", "all", "efo"),
2568
+ "self_reported_ethnicity": ("Ethnicity", self.organism, "hancestro"),
2569
+ "development_stage": ("DevelopmentalStage", self.organism, "hsapdv" if self.organism == "human" else "mmusdv"),
2570
+ "disease": ("Disease", "all", "mondo"),
2571
+ # "organism": ("Organism", "vertebrates", "ensembl"),
2572
+ "sex": ("Phenotype", "all", "pato"),
2573
+ "tissue": ("Tissue", "all", "uberon"),
2574
+ }
2575
+ # fmt: on
2576
+
2577
+ # Retain var_index and one of 'entity'/'entity_ontology_term_id' that is present in obs
2578
+ entity_to_sources = {
2579
+ entity: _fetch_bionty_source(*params)
2580
+ for entity, params in entity_mapping.items()
2581
+ if entity in obs.columns
2582
+ or (f"{entity}_ontology_term_id" in obs.columns and entity != "var_index")
2583
+ or entity == "var_index"
2584
+ }
2585
+
2586
+ return entity_to_sources
2587
+
2588
+ def _convert_name_to_ontology_id(self, values: pd.Series, field: FieldAttr):
2589
+ """Converts a column that stores a name into a column that stores the ontology id.
2590
+
2591
+ cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
2592
+ """
2593
+ field_name = field.field.name
2594
+ assert field_name == "name" # noqa: S101
2595
+ cols = ["name", "ontology_id"]
2596
+ registry = field.field.model
2597
+
2598
+ if hasattr(registry, "ontology_id"):
2599
+ validated_records = registry.filter(**{f"{field_name}__in": values})
2600
+ mapper = (
2601
+ pd.DataFrame(validated_records.values_list(*cols))
2602
+ .set_index(0)
2603
+ .to_dict()[1]
2604
+ )
2605
+ return values.map(mapper)
2606
+
2607
+ def validate(self) -> bool: # type: ignore
2608
+ """Validates the AnnData object against most cellxgene requirements."""
2609
+ # Verify that all required obs columns are present
2610
+ missing_obs_fields = [
2611
+ name
2612
+ for name in CellxGeneAnnDataCatManager._get_categoricals_defaults().keys()
2613
+ if name not in self._adata.obs.columns
2614
+ and f"{name}_ontology_term_id" not in self._adata.obs.columns
2615
+ ]
2616
+ if len(missing_obs_fields) > 0:
2617
+ missing_obs_fields_str = ", ".join(list(missing_obs_fields))
2618
+ logger.error(f"missing required obs columns {missing_obs_fields_str}")
2619
+ logger.info(
2620
+ "consider initializing a Curate object like 'Curate(adata, defaults=cxg.CellxGeneAnnDataCatManager._get_categoricals_defaults())'"
2621
+ "to automatically add these columns with default values."
2622
+ )
2623
+ return False
2624
+
2625
+ # Verify that no cellxgene reserved names are present
2626
+ reserved_names = {
2627
+ "ethnicity",
2628
+ "ethnicity_ontology_term_id",
2629
+ "X_normalization",
2630
+ "default_field",
2631
+ "layer_descriptions",
2632
+ "tags",
2633
+ "versions",
2634
+ "contributors",
2635
+ "preprint_doi",
2636
+ "project_description",
2637
+ "project_links",
2638
+ "project_name",
2639
+ "publication_doi",
2640
+ }
2641
+ matched_columns = [
2642
+ column for column in self._adata.obs.columns if column in reserved_names
2643
+ ]
2644
+ if len(matched_columns) > 0:
2645
+ raise ValueError(
2646
+ f"AnnData object must not contain obs columns {matched_columns} which are"
2647
+ " reserved from previous schema versions."
2648
+ )
1571
2649
 
1572
- feature_ref_is_name = _ref_is_name(self._columns_field)
1573
- features = Feature.lookup().dict()
1574
- for key, field in self._obs_fields.items():
1575
- feature = features.get(key)
1576
- registry = field.field.model
1577
- organism = check_registry_organism(field.field.model, self._organism).get(
1578
- "organism"
2650
+ return super().validate()
2651
+
2652
+ def to_cellxgene_anndata(
2653
+ self, is_primary_data: bool, title: str | None = None
2654
+ ) -> ad.AnnData:
2655
+ """Converts the AnnData object to the cellxgene-schema input format.
2656
+
2657
+ cellxgene expects the obs fields to be {entity}_ontology_id fields and has many further requirements which are
2658
+ documented here: https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema.
2659
+ This function checks for most but not all requirements of the CELLxGENE schema.
2660
+ If you want to ensure that it fully adheres to the CELLxGENE schema, run `cellxgene-schema` on the AnnData object.
2661
+
2662
+ Args:
2663
+ is_primary_data: Whether the measured data is primary data or not.
2664
+ title: Title of the AnnData object. Commonly the name of the publication.
2665
+
2666
+ Returns:
2667
+ An AnnData object which adheres to the cellxgene-schema.
2668
+ """
2669
+ # Create a copy since we modify the AnnData object extensively
2670
+ adata_cxg = self._adata.copy()
2671
+
2672
+ # cellxgene requires an embedding
2673
+ embedding_pattern = r"^[a-zA-Z][a-zA-Z0-9_.-]*$"
2674
+ exclude_key = "spatial"
2675
+ matching_keys = [
2676
+ key
2677
+ for key in adata_cxg.obsm.keys()
2678
+ if re.match(embedding_pattern, key) and key != exclude_key
2679
+ ]
2680
+ if len(matching_keys) == 0:
2681
+ raise ValueError(
2682
+ "Unable to find an embedding key. Please calculate an embedding."
1579
2683
  )
1580
- labels = registry.from_values(
1581
- values=self._validated_values[key], field=field, organism=organism
2684
+
2685
+ # convert name column to ontology_term_id column
2686
+ for column in adata_cxg.obs.columns:
2687
+ if column in self.categoricals and not column.endswith("_ontology_term_id"):
2688
+ mapped_column = self._convert_name_to_ontology_id(
2689
+ adata_cxg.obs[column], field=self.categoricals.get(column)
2690
+ )
2691
+ if mapped_column is not None:
2692
+ adata_cxg.obs[f"{column}_ontology_term_id"] = mapped_column
2693
+
2694
+ # drop the name columns for ontologies. cellxgene does not allow them.
2695
+ drop_columns = [
2696
+ i
2697
+ for i in adata_cxg.obs.columns
2698
+ if f"{i}_ontology_term_id" in adata_cxg.obs.columns
2699
+ ]
2700
+ adata_cxg.obs.drop(columns=drop_columns, inplace=True)
2701
+
2702
+ # Add cellxgene metadata to AnnData object
2703
+ if "is_primary_data" not in adata_cxg.obs.columns:
2704
+ adata_cxg.obs["is_primary_data"] = is_primary_data
2705
+ if "feature_is_filtered" not in adata_cxg.var.columns:
2706
+ logger.warn(
2707
+ "column 'feature_is_filtered' not present in var. Setting to default"
2708
+ " value of False."
1582
2709
  )
1583
- if len(labels) == 0:
1584
- continue
1585
- if hasattr(registry, "_name_field"):
1586
- label_ref_is_name = field.field.name == registry._name_field
1587
- add_labels(
1588
- artifact,
1589
- records=labels,
1590
- feature=feature,
1591
- feature_ref_is_name=feature_ref_is_name,
1592
- label_ref_is_name=label_ref_is_name,
1593
- from_curator=True,
2710
+ adata_cxg.var["feature_is_filtered"] = False
2711
+ if title is None:
2712
+ raise ValueError("please pass a title!")
2713
+ else:
2714
+ adata_cxg.uns["title"] = title
2715
+ adata_cxg.uns["cxg_lamin_schema_reference"] = self.schema_reference
2716
+ adata_cxg.uns["cxg_lamin_schema_version"] = self.schema_version
2717
+
2718
+ return adata_cxg
2719
+
2720
+
2721
+ class ValueUnit:
2722
+ """Base class for handling value-unit combinations."""
2723
+
2724
+ @staticmethod
2725
+ def parse_value_unit(value: str, is_dose: bool = True) -> tuple[str, str] | None:
2726
+ """Parse a string containing a value and unit into a tuple."""
2727
+ if not isinstance(value, str) or not value.strip():
2728
+ return None
2729
+
2730
+ value = str(value).strip()
2731
+ match = re.match(r"^(\d*\.?\d{0,1})\s*([a-zA-ZμµΜ]+)$", value)
2732
+
2733
+ if not match:
2734
+ raise ValueError(
2735
+ f"Invalid format: {value}. Expected format: number with max 1 decimal place + unit"
2736
+ )
2737
+
2738
+ number, unit = match.groups()
2739
+ formatted_number = f"{float(number):.1f}"
2740
+
2741
+ if is_dose:
2742
+ standardized_unit = DoseHandler.standardize_unit(unit)
2743
+ if not DoseHandler.validate_unit(standardized_unit):
2744
+ raise ValueError(
2745
+ f"Invalid dose unit: {unit}. Must be convertible to one of: nM, μM, mM, M"
2746
+ )
2747
+ else:
2748
+ standardized_unit = TimeHandler.standardize_unit(unit)
2749
+ if not TimeHandler.validate_unit(standardized_unit):
2750
+ raise ValueError(
2751
+ f"Invalid time unit: {unit}. Must be convertible to one of: h, m, s, d, y"
1594
2752
  )
1595
2753
 
1596
- return artifact.save()
2754
+ return formatted_number, standardized_unit
1597
2755
 
1598
2756
 
1599
- class Curator(BaseCurator):
1600
- """Dataset curator.
2757
+ class DoseHandler:
2758
+ """Handler for dose-related operations."""
1601
2759
 
1602
- A `Curator` object makes it easy to save validated & annotated artifacts.
2760
+ VALID_UNITS = {"nM", "μM", "µM", "mM", "M"}
2761
+ UNIT_MAP = {
2762
+ "nm": "nM",
2763
+ "NM": "nM",
2764
+ "um": "μM",
2765
+ "UM": "μM",
2766
+ "μm": "μM",
2767
+ "μM": "μM",
2768
+ "µm": "μM",
2769
+ "µM": "μM",
2770
+ "mm": "mM",
2771
+ "MM": "mM",
2772
+ "m": "M",
2773
+ "M": "M",
2774
+ }
1603
2775
 
1604
- Example:
2776
+ @classmethod
2777
+ def validate_unit(cls, unit: str) -> bool:
2778
+ """Validate if the dose unit is acceptable."""
2779
+ return unit in cls.VALID_UNITS
1605
2780
 
1606
- >>> curator = ln.Curator.from_df(
1607
- >>> df,
1608
- >>> # define validation criteria as mappings
1609
- >>> columns=ln.Feature.name, # map column names
1610
- >>> categoricals={"perturbation": ln.ULabel.name}, # map categories
1611
- >>> )
1612
- >>> curator.validate() # validate the data in df
1613
- >>> artifact = curator.save_artifact(description="my RNA-seq")
1614
- >>> artifact.describe() # see annotations
2781
+ @classmethod
2782
+ def standardize_unit(cls, unit: str) -> str:
2783
+ """Standardize dose unit to standard formats."""
2784
+ return cls.UNIT_MAP.get(unit, unit)
1615
2785
 
1616
- `curator.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
2786
+ @classmethod
2787
+ def validate_values(cls, values: pd.Series) -> list:
2788
+ """Validate pert_dose values with strict case checking."""
2789
+ errors = []
1617
2790
 
1618
- If you find non-validated values, you have several options:
2791
+ for idx, value in values.items():
2792
+ if pd.isna(value):
2793
+ continue
1619
2794
 
1620
- - new values found in the data can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`
1621
- - non-validated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and addressed manually
1622
- """
2795
+ if isinstance(value, (int, float)):
2796
+ errors.append(
2797
+ f"Row {idx} - Missing unit for dose: {value}. Must include a unit (nM, μM, mM, M)"
2798
+ )
2799
+ continue
2800
+
2801
+ try:
2802
+ ValueUnit.parse_value_unit(value, is_dose=True)
2803
+ except ValueError as e:
2804
+ errors.append(f"Row {idx} - {str(e)}")
2805
+
2806
+ return errors
2807
+
2808
+
2809
+ class TimeHandler:
2810
+ """Handler for time-related operations."""
2811
+
2812
+ VALID_UNITS = {"h", "m", "s", "d", "y"}
1623
2813
 
1624
2814
  @classmethod
1625
- @doc_args(DataFrameCurator.__doc__)
1626
- def from_df(
1627
- cls,
1628
- df: pd.DataFrame,
1629
- categoricals: dict[str, FieldAttr] | None = None,
1630
- columns: FieldAttr = Feature.name,
1631
- using_key: str | None = None,
1632
- verbosity: str = "hint",
1633
- organism: str | None = None,
1634
- ) -> DataFrameCurator:
1635
- """{}""" # noqa: D415
1636
- return DataFrameCurator(
1637
- df=df,
1638
- categoricals=categoricals,
1639
- columns=columns,
1640
- using_key=using_key,
1641
- verbosity=verbosity,
1642
- organism=organism,
1643
- )
2815
+ def validate_unit(cls, unit: str) -> bool:
2816
+ """Validate if the time unit is acceptable."""
2817
+ return unit == unit.lower() and unit in cls.VALID_UNITS
1644
2818
 
1645
2819
  @classmethod
1646
- @doc_args(AnnDataCurator.__doc__)
1647
- def from_anndata(
1648
- cls,
1649
- data: ad.AnnData | UPathStr,
1650
- var_index: FieldAttr,
1651
- categoricals: dict[str, FieldAttr] | None = None,
1652
- obs_columns: FieldAttr = Feature.name,
1653
- using_key: str | None = None,
1654
- verbosity: str = "hint",
1655
- organism: str | None = None,
1656
- sources: dict[str, Record] | None = None,
1657
- ) -> AnnDataCurator:
1658
- """{}""" # noqa: D415
1659
- return AnnDataCurator(
1660
- data=data,
1661
- var_index=var_index,
1662
- categoricals=categoricals,
1663
- obs_columns=obs_columns,
1664
- using_key=using_key,
1665
- verbosity=verbosity,
1666
- organism=organism,
1667
- sources=sources,
1668
- )
2820
+ def standardize_unit(cls, unit: str) -> str:
2821
+ """Standardize time unit to standard formats."""
2822
+ if unit.startswith("hr"):
2823
+ return "h"
2824
+ elif unit.startswith("min"):
2825
+ return "m"
2826
+ elif unit.startswith("sec"):
2827
+ return "s"
2828
+ return unit[0].lower()
1669
2829
 
1670
2830
  @classmethod
1671
- @doc_args(MuDataCurator.__doc__)
1672
- def from_mudata(
1673
- cls,
1674
- mdata: MuData,
1675
- var_index: dict[str, dict[str, FieldAttr]],
1676
- categoricals: dict[str, FieldAttr] | None = None,
1677
- using_key: str | None = None,
2831
+ def validate_values(cls, values: pd.Series) -> list:
2832
+ """Validate pert_time values."""
2833
+ errors = []
2834
+
2835
+ for idx, value in values.items():
2836
+ if pd.isna(value):
2837
+ continue
2838
+
2839
+ if isinstance(value, (int, float)):
2840
+ errors.append(
2841
+ f"Row {idx} - Missing unit for time: {value}. Must include a unit (h, m, s, d, y)"
2842
+ )
2843
+ continue
2844
+
2845
+ try:
2846
+ ValueUnit.parse_value_unit(value, is_dose=False)
2847
+ except ValueError as e:
2848
+ errors.append(f"Row {idx} - {str(e)}")
2849
+
2850
+ return errors
2851
+
2852
+
2853
+ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2854
+ """Curator flow for Perturbation data."""
2855
+
2856
+ PERT_COLUMNS = {"compound", "genetic", "biologic", "physical"}
2857
+
2858
+ def __init__(
2859
+ self,
2860
+ adata: ad.AnnData,
2861
+ organism: Literal["human", "mouse"] = "human",
2862
+ pert_dose: bool = True,
2863
+ pert_time: bool = True,
2864
+ *,
1678
2865
  verbosity: str = "hint",
1679
- organism: str | None = None,
1680
- ) -> MuDataCurator:
1681
- """{}""" # noqa: D415
1682
- return MuDataCurator(
1683
- mdata=mdata,
1684
- var_index=var_index,
1685
- categoricals=categoricals,
1686
- using_key=using_key,
2866
+ cxg_schema_version: Literal["5.0.0", "5.1.0"] = "5.1.0",
2867
+ ):
2868
+ """Initialize the curator with configuration and validation settings."""
2869
+ import bionty as bt
2870
+
2871
+ self._pert_time = pert_time
2872
+ self._pert_dose = pert_dose
2873
+
2874
+ self._validate_initial_data(adata)
2875
+ self._setup_configuration(adata)
2876
+
2877
+ self._setup_sources(adata)
2878
+ self._setup_compound_source()
2879
+
2880
+ super().__init__(
2881
+ adata=adata,
2882
+ categoricals=self.PT_CATEGORICALS,
2883
+ defaults=self.PT_DEFAULT_VALUES,
1687
2884
  verbosity=verbosity,
1688
2885
  organism=organism,
2886
+ extra_sources=self.PT_SOURCES,
2887
+ schema_version=cxg_schema_version,
1689
2888
  )
1690
2889
 
1691
- @classmethod
1692
- @doc_args(SOMACurator.__doc__)
1693
- def from_tiledbsoma(
1694
- cls,
1695
- experiment_uri: UPathStr,
1696
- var_index: dict[str, tuple[str, FieldAttr]],
1697
- categoricals: dict[str, FieldAttr] | None = None,
1698
- obs_columns: FieldAttr = Feature.name,
1699
- using_key: str | None = None,
1700
- organism: str | None = None,
1701
- sources: dict[str, Record] | None = None,
1702
- exclude: dict[str, str | list[str]] | None = None,
1703
- ) -> SOMACurator:
1704
- """{}""" # noqa: D415
1705
- return SOMACurator(
1706
- experiment_uri=experiment_uri,
1707
- var_index=var_index,
1708
- categoricals=categoricals,
1709
- obs_columns=obs_columns,
1710
- using_key=using_key,
1711
- organism=organism,
1712
- sources=sources,
1713
- exclude=exclude,
2890
+ def _setup_configuration(self, adata: ad.AnnData):
2891
+ """Set up default configuration values."""
2892
+ import bionty as bt
2893
+ import wetlab as wl
2894
+
2895
+ self.PT_DEFAULT_VALUES = (
2896
+ CellxGeneAnnDataCatManager._get_categoricals_defaults()
2897
+ | {
2898
+ "cell_line": "unknown",
2899
+ "pert_target": "unknown",
2900
+ }
1714
2901
  )
1715
2902
 
1716
- @classmethod
1717
- def from_spatialdata(
1718
- cls,
1719
- sdata,
1720
- var_index: dict[str, FieldAttr],
1721
- categoricals: dict[str, dict[str, FieldAttr]] | None = None,
1722
- using_key: str | None = None,
1723
- organism: str | None = None,
1724
- sources: dict[str, dict[str, Record]] | None = None,
1725
- exclude: dict[str, dict] | None = None,
1726
- verbosity: str = "hint",
1727
- *,
1728
- sample_metadata_key: str = "sample",
1729
- ):
1730
- """Curation flow for a ``Spatialdata`` object.
2903
+ self.PT_CATEGORICALS = CellxGeneAnnDataCatManager._get_categoricals() | {
2904
+ k: v
2905
+ for k, v in {
2906
+ "cell_line": bt.CellLine.name,
2907
+ "pert_target": wl.PerturbationTarget.name,
2908
+ "pert_genetic": wl.GeneticPerturbation.name,
2909
+ "pert_compound": wl.Compound.name,
2910
+ "pert_biologic": wl.Biologic.name,
2911
+ "pert_physical": wl.EnvironmentalPerturbation.name,
2912
+ }.items()
2913
+ if k in adata.obs.columns
2914
+ }
2915
+ # if "donor_id" in self.PT_CATEGORICALS:
2916
+ # self.PT_CATEGORICALS["donor_id"] = Donor.name
2917
+
2918
+ def _setup_sources(self, adata: ad.AnnData):
2919
+ """Set up data sources."""
2920
+ self.PT_SOURCES = {}
2921
+ # if "cell_line" in adata.obs.columns:
2922
+ # self.PT_SOURCES["cell_line"] = (
2923
+ # bt.Source.filter(name="depmap").first()
2924
+ # )
2925
+ if "pert_compound" in adata.obs.columns:
2926
+ import bionty as bt
2927
+
2928
+ self.PT_SOURCES["pert_compound"] = bt.Source.filter(
2929
+ entity="wetlab.Compound", name="chebi"
2930
+ ).first()
2931
+
2932
+ def _validate_initial_data(self, adata: ad.AnnData):
2933
+ """Validate the initial data structure."""
2934
+ self._validate_required_columns(adata)
2935
+ self._validate_perturbation_types(adata)
2936
+
2937
+ def _validate_required_columns(self, adata: ad.AnnData):
2938
+ """Validate required columns are present."""
2939
+ if "pert_target" not in adata.obs.columns:
2940
+ if (
2941
+ "pert_name" not in adata.obs.columns
2942
+ or "pert_type" not in adata.obs.columns
2943
+ ):
2944
+ raise ValidationError(
2945
+ "either 'pert_target' or both 'pert_name' and 'pert_type' must be present"
2946
+ )
2947
+ else:
2948
+ if "pert_name" not in adata.obs.columns:
2949
+ logger.warning(
2950
+ "no 'pert' column found in adata.obs, will only curate 'pert_target'"
2951
+ )
2952
+ elif "pert_type" not in adata.obs.columns:
2953
+ raise ValidationError("both 'pert' and 'pert_type' must be present")
2954
+
2955
+ def _validate_perturbation_types(self, adata: ad.AnnData):
2956
+ """Validate perturbation types."""
2957
+ if "pert_type" in adata.obs.columns:
2958
+ data_pert_types = set(adata.obs["pert_type"].unique())
2959
+ invalid_pert_types = data_pert_types - self.PERT_COLUMNS
2960
+ if invalid_pert_types:
2961
+ raise ValidationError(
2962
+ f"invalid pert_type found: {invalid_pert_types}!\n"
2963
+ f" → allowed values: {self.PERT_COLUMNS}"
2964
+ )
2965
+ self._process_perturbation_types(adata, data_pert_types)
2966
+
2967
+ def _process_perturbation_types(self, adata: ad.AnnData, pert_types: set):
2968
+ """Process and map perturbation types."""
2969
+ for pert_type in pert_types:
2970
+ col_name = "pert_" + pert_type
2971
+ adata.obs[col_name] = adata.obs["pert_name"].where(
2972
+ adata.obs["pert_type"] == pert_type, None
2973
+ )
2974
+ if adata.obs[col_name].dtype.name == "category":
2975
+ adata.obs[col_name].cat.remove_unused_categories()
2976
+ logger.important(f"mapped 'pert_name' to '{col_name}'")
1731
2977
 
1732
- See also :class:`~lamindb.Curator`.
2978
+ def _setup_compound_source(self):
2979
+ """Set up the compound source with muted logging."""
2980
+ import bionty as bt
2981
+ import wetlab as wl
2982
+
2983
+ with logger.mute():
2984
+ chebi_source = bt.Source.filter(
2985
+ entity="wetlab.Compound", name="chebi"
2986
+ ).first()
2987
+ if not chebi_source:
2988
+ wl.Compound.add_source(
2989
+ bt.Source.filter(entity="Drug", name="chebi").first()
2990
+ )
1733
2991
 
1734
- Note that if genes or other measurements are removed from the SpatialData object,
1735
- the object should be recreated.
2992
+ def validate(self) -> bool: # type: ignore
2993
+ """Validate the AnnData object."""
2994
+ validated = super().validate()
2995
+
2996
+ if self._pert_dose:
2997
+ validated &= self._validate_dose_column()
2998
+ if self._pert_time:
2999
+ validated &= self._validate_time_column()
3000
+
3001
+ self._is_validated = validated
3002
+
3003
+ # sort columns
3004
+ first_columns = [
3005
+ "pert_target",
3006
+ "pert_genetic",
3007
+ "pert_compound",
3008
+ "pert_biologic",
3009
+ "pert_physical",
3010
+ "pert_dose",
3011
+ "pert_time",
3012
+ "organism",
3013
+ "cell_line",
3014
+ "cell_type",
3015
+ "disease",
3016
+ "tissue_type",
3017
+ "tissue",
3018
+ "assay",
3019
+ "suspension_type",
3020
+ "donor_id",
3021
+ "sex",
3022
+ "self_reported_ethnicity",
3023
+ "development_stage",
3024
+ "pert_name",
3025
+ "pert_type",
3026
+ ]
3027
+ sorted_columns = [
3028
+ col for col in first_columns if col in self._adata.obs.columns
3029
+ ] + [col for col in self._adata.obs.columns if col not in first_columns]
3030
+ # must assign to self._df to ensure .standardize works correctly
3031
+ self._obs_df = self._adata.obs[sorted_columns]
3032
+ self._adata.obs = self._obs_df
3033
+ return validated
3034
+
3035
+ def standardize(self, key: str) -> pd.DataFrame:
3036
+ """Standardize the AnnData object."""
3037
+ super().standardize(key)
3038
+ self._adata.obs = self._obs_df
3039
+
3040
+ def _validate_dose_column(self) -> bool:
3041
+ """Validate the dose column."""
3042
+ if not Feature.filter(name="pert_dose").exists():
3043
+ Feature(name="pert_dose", dtype="str").save() # type: ignore
3044
+
3045
+ dose_errors = DoseHandler.validate_values(self._adata.obs["pert_dose"])
3046
+ if dose_errors:
3047
+ self._log_validation_errors("pert_dose", dose_errors)
3048
+ return False
3049
+ return True
3050
+
3051
+ def _validate_time_column(self) -> bool:
3052
+ """Validate the time column."""
3053
+ if not Feature.filter(name="pert_time").exists():
3054
+ Feature(name="pert_time", dtype="str").save() # type: ignore
3055
+
3056
+ time_errors = TimeHandler.validate_values(self._adata.obs["pert_time"])
3057
+ if time_errors:
3058
+ self._log_validation_errors("pert_time", time_errors)
3059
+ return False
3060
+ return True
3061
+
3062
+ def _log_validation_errors(self, column: str, errors: list):
3063
+ """Log validation errors with formatting."""
3064
+ errors_print = "\n ".join(errors)
3065
+ logger.warning(
3066
+ f"invalid {column} values found!\n {errors_print}\n"
3067
+ f" → run {colors.cyan('standardize_dose_time()')}"
3068
+ )
1736
3069
 
1737
- In the following docstring, an accessor refers to either a ``.table`` key or the ``sample_metadata_key``.
3070
+ def standardize_dose_time(self) -> pd.DataFrame:
3071
+ """Standardize dose and time values."""
3072
+ standardized_df = self._adata.obs.copy()
1738
3073
 
1739
- Args:
1740
- sdata: The SpatialData object to curate.
1741
- var_index: A dictionary mapping table keys to the ``.var`` indices.
1742
- categoricals: A nested dictionary mapping an accessor to dictionaries that map columns to a registry field.
1743
- using_key: A reference LaminDB instance.
1744
- organism: The organism name.
1745
- sources: A dictionary mapping an accessor to dictionaries that map columns to Source records.
1746
- exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
1747
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1748
- using the exclude parameter ensures they are not validated.
1749
- verbosity: The verbosity level of the logger.
1750
- sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.
1751
-
1752
- Examples:
1753
- >>> import lamindb as ln
1754
- >>> import bionty as bt
1755
- >>> curator = ln.Curator.from_spatialdata(
1756
- ... sdata,
1757
- ... var_index={
1758
- ... "table_1": bt.Gene.ensembl_gene_id,
1759
- ... },
1760
- ... categoricals={
1761
- ... "table1":
1762
- ... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ln.ULabel.name},
1763
- ... "sample":
1764
- ... {"experimental_factor": bt.ExperimentalFactor.name},
1765
- ... },
1766
- ... organism="human",
1767
- ... )
1768
- """
1769
- try:
1770
- import spatialdata
1771
- except ImportError as e:
1772
- raise ImportError(
1773
- "Please install spatialdata: pip install spatialdata"
1774
- ) from e
3074
+ if "pert_dose" in self._adata.obs.columns:
3075
+ standardized_df = self._standardize_column(
3076
+ standardized_df, "pert_dose", is_dose=True
3077
+ )
1775
3078
 
1776
- from ._spatial import SpatialDataCurator
3079
+ if "pert_time" in self._adata.obs.columns:
3080
+ standardized_df = self._standardize_column(
3081
+ standardized_df, "pert_time", is_dose=False
3082
+ )
1777
3083
 
1778
- return SpatialDataCurator(
1779
- sdata=sdata,
1780
- var_index=var_index,
1781
- categoricals=categoricals,
1782
- using_key=using_key,
1783
- verbosity=verbosity,
1784
- organism=organism,
1785
- sources=sources,
1786
- exclude=exclude,
1787
- sample_metadata_key=sample_metadata_key,
1788
- )
3084
+ self._adata.obs = standardized_df
3085
+ return standardized_df
3086
+
3087
+ def _standardize_column(
3088
+ self, df: pd.DataFrame, column: str, is_dose: bool
3089
+ ) -> pd.DataFrame:
3090
+ """Standardize values in a specific column."""
3091
+ for idx, value in self._adata.obs[column].items():
3092
+ if pd.isna(value) or (
3093
+ isinstance(value, str) and (not value.strip() or value.lower() == "nan")
3094
+ ):
3095
+ df.at[idx, column] = None
3096
+ continue
1789
3097
 
3098
+ try:
3099
+ num, unit = ValueUnit.parse_value_unit(value, is_dose=is_dose)
3100
+ df.at[idx, column] = f"{num}{unit}"
3101
+ except ValueError:
3102
+ continue
1790
3103
 
1791
- def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
1792
- """Get a registry instance using a specific instance."""
1793
- if using_key is not None and using_key != "default":
1794
- return registry.using(using_key)
1795
- return registry
3104
+ return df
1796
3105
 
1797
3106
 
1798
3107
  def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
@@ -1871,11 +3180,11 @@ def validate_categories(
1871
3180
  values: Iterable[str],
1872
3181
  field: FieldAttr,
1873
3182
  key: str,
1874
- using_key: str | None = None,
1875
3183
  organism: str | None = None,
1876
3184
  source: Record | None = None,
1877
3185
  exclude: str | list | None = None,
1878
3186
  hint_print: str | None = None,
3187
+ curator: CatManager | None = None,
1879
3188
  ) -> tuple[bool, list]:
1880
3189
  """Validate ontology terms in a pandas series using LaminDB registries.
1881
3190
 
@@ -1883,7 +3192,6 @@ def validate_categories(
1883
3192
  values: The values to validate.
1884
3193
  field: The field attribute.
1885
3194
  key: The key referencing the slot in the DataFrame.
1886
- using_key: A reference LaminDB instance.
1887
3195
  organism: The organism name.
1888
3196
  source: The source record.
1889
3197
  exclude: Exclude specific values from validation.
@@ -1918,22 +3226,8 @@ def validate_categories(
1918
3226
  non_validated = inspect_result.non_validated
1919
3227
  syn_mapper = inspect_result.synonyms_mapper
1920
3228
 
1921
- # inspect the non-validated values from the using_key instance
1922
- values_validated = []
1923
- if using_key is not None and using_key != "default" and non_validated:
1924
- registry_using = get_registry_instance(registry, using_key)
1925
- inspect_result = inspect_instance(
1926
- values=non_validated,
1927
- field=field,
1928
- registry=registry_using,
1929
- exclude=exclude,
1930
- **kwargs,
1931
- )
1932
- non_validated = inspect_result.non_validated
1933
- values_validated += inspect_result.validated
1934
- syn_mapper.update(inspect_result.synonyms_mapper)
1935
-
1936
3229
  # inspect the non-validated values from public (bionty only)
3230
+ values_validated = []
1937
3231
  if hasattr(registry, "public"):
1938
3232
  verbosity = settings.verbosity
1939
3233
  try:
@@ -1975,6 +3269,10 @@ def validate_categories(
1975
3269
  if logger.indent == "":
1976
3270
  _log_mapping_info()
1977
3271
  logger.warning(warning_message)
3272
+ if curator is not None:
3273
+ curator._validate_category_error_messages = strip_ansi_codes(
3274
+ warning_message
3275
+ )
1978
3276
  logger.indent = ""
1979
3277
  return False, non_validated
1980
3278
 
@@ -1982,7 +3280,6 @@ def validate_categories(
1982
3280
  def standardize_categories(
1983
3281
  values: Iterable[str],
1984
3282
  field: FieldAttr,
1985
- using_key: str | None = None,
1986
3283
  organism: str | None = None,
1987
3284
  source: Record | None = None,
1988
3285
  ) -> dict:
@@ -1999,30 +3296,15 @@ def standardize_categories(
1999
3296
  mute=True,
2000
3297
  return_mapper=True,
2001
3298
  )
2002
-
2003
- if len(values) > len(syn_mapper): # type: ignore
2004
- # standardize values using the using_key instance
2005
- if using_key is not None and using_key != "default":
2006
- registry_using = get_registry_instance(registry, using_key)
2007
- syn_mapper.update(
2008
- registry_using.standardize(
2009
- [v for v in values if v not in syn_mapper],
2010
- field=field.field.name,
2011
- organism=organism,
2012
- source=source,
2013
- mute=True,
2014
- return_mapper=True,
2015
- )
2016
- )
2017
3299
  return syn_mapper
2018
3300
 
2019
3301
 
2020
3302
  def validate_categories_in_df(
2021
3303
  df: pd.DataFrame,
2022
3304
  fields: dict[str, FieldAttr],
2023
- using_key: str | None = None,
2024
3305
  sources: dict[str, Record] = None,
2025
3306
  exclude: dict | None = None,
3307
+ curator: CatManager | None = None,
2026
3308
  **kwargs,
2027
3309
  ) -> tuple[bool, dict]:
2028
3310
  """Validate categories in DataFrame columns using LaminDB registries."""
@@ -2038,9 +3320,9 @@ def validate_categories_in_df(
2038
3320
  df[key],
2039
3321
  field=field,
2040
3322
  key=key,
2041
- using_key=using_key,
2042
3323
  source=sources.get(key),
2043
3324
  exclude=exclude.get(key) if exclude else None,
3325
+ curator=curator,
2044
3326
  **kwargs,
2045
3327
  )
2046
3328
  validated &= is_val
@@ -2055,80 +3337,72 @@ def save_artifact(
2055
3337
  columns_field: FieldAttr | dict[str, FieldAttr],
2056
3338
  description: str | None = None,
2057
3339
  organism: str | None = None,
2058
- adata: ad.AnnData | None = None,
2059
3340
  key: str | None = None,
3341
+ artifact: Artifact | None = None,
2060
3342
  revises: Artifact | None = None,
2061
3343
  run: Run | None = None,
3344
+ schema: Schema | None = None,
2062
3345
  ) -> Artifact:
2063
3346
  """Save all metadata with an Artifact.
2064
3347
 
2065
3348
  Args:
2066
- data: The DataFrame or AnnData object to save.
3349
+ data: The DataFrame/AnnData/MuData object to save.
2067
3350
  fields: A dictionary mapping obs_column to registry_field.
2068
3351
  columns_field: The registry field to validate variables index against.
2069
3352
  description: A description of the artifact.
2070
3353
  organism: The organism name.
2071
- adata: The AnnData object to save and get n_observations, must be provided if data is a path.
2072
3354
  type: The artifact type.
2073
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
3355
+ key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
3356
+ artifact: An already registered artifact. Passing this will not save a new artifact from data.
2074
3357
  revises: Previous version of the artifact. Triggers a revision.
2075
3358
  run: The run that creates the artifact.
2076
3359
 
2077
3360
  Returns:
2078
3361
  The saved Artifact.
2079
3362
  """
2080
- from .._artifact import data_is_anndata
3363
+ from .._artifact import data_is_anndata, data_is_mudata
2081
3364
  from ..core._data import add_labels
2082
3365
 
2083
- artifact = None
2084
- if data_is_anndata(data):
2085
- assert adata is not None # noqa: S101
2086
- artifact = Artifact.from_anndata(
2087
- data, description=description, key=key, revises=revises, run=run
2088
- )
2089
- artifact.n_observations = adata.shape[0]
2090
- data = adata
2091
-
2092
- elif isinstance(data, pd.DataFrame):
2093
- artifact = Artifact.from_df(
2094
- data, description=description, key=key, revises=revises, run=run
2095
- )
2096
- else:
2097
- try:
2098
- from mudata import MuData
2099
-
2100
- if isinstance(data, MuData):
2101
- artifact = Artifact.from_mudata(
2102
- data,
2103
- description=description,
2104
- key=key,
2105
- revises=revises,
2106
- run=run,
2107
- )
2108
- artifact.n_observations = data.n_obs
2109
- except ImportError:
2110
- pass
2111
3366
  if artifact is None:
2112
- raise ValueError("data must be a DataFrame, AnnData or MuData object.")
3367
+ if data_is_anndata(data):
3368
+ artifact = Artifact.from_anndata(
3369
+ data, description=description, key=key, revises=revises, run=run
3370
+ )
3371
+ elif isinstance(data, pd.DataFrame):
3372
+ artifact = Artifact.from_df(
3373
+ data, description=description, key=key, revises=revises, run=run
3374
+ )
3375
+ elif data_is_mudata(data):
3376
+ artifact = Artifact.from_mudata(
3377
+ data,
3378
+ description=description,
3379
+ key=key,
3380
+ revises=revises,
3381
+ run=run,
3382
+ )
3383
+ artifact.schema = schema
2113
3384
  artifact.save()
2114
3385
 
2115
- feature_kwargs = check_registry_organism(
2116
- (
2117
- list(columns_field.values())[0].field.model
2118
- if isinstance(columns_field, dict)
2119
- else columns_field.field.model
2120
- ),
2121
- organism,
2122
- )
3386
+ if organism is not None:
3387
+ feature_kwargs = check_registry_organism(
3388
+ (
3389
+ list(columns_field.values())[0].field.model
3390
+ if isinstance(columns_field, dict)
3391
+ else columns_field.field.model
3392
+ ),
3393
+ organism,
3394
+ )
3395
+ else:
3396
+ feature_kwargs = {}
2123
3397
 
2124
3398
  if artifact.otype == "DataFrame":
2125
- artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
3399
+ artifact.features._add_set_from_df(field=columns_field, **feature_kwargs) # type: ignore
2126
3400
  elif artifact.otype == "AnnData":
2127
- artifact.features._add_set_from_anndata(
3401
+ artifact.features._add_set_from_anndata( # type: ignore
2128
3402
  var_field=columns_field, **feature_kwargs
2129
3403
  )
2130
3404
  elif artifact.otype == "MuData":
2131
- artifact.features._add_set_from_mudata(
3405
+ artifact.features._add_set_from_mudata( # type: ignore
2132
3406
  var_fields=columns_field, **feature_kwargs
2133
3407
  )
2134
3408
  else:
@@ -2202,7 +3476,7 @@ def save_artifact(
2202
3476
  )
2203
3477
 
2204
3478
  slug = ln_setup.settings.instance.slug
2205
- if ln_setup.settings.instance.is_remote: # pragma: no cover
3479
+ if ln_setup.settings.instance.is_remote: # pragma: no cover
2206
3480
  logger.important(f"go to https://lamin.ai/{slug}/artifact/{artifact.uid}")
2207
3481
  return artifact
2208
3482
 
@@ -2224,7 +3498,6 @@ def update_registry(
2224
3498
  values: list[str],
2225
3499
  field: FieldAttr,
2226
3500
  key: str,
2227
- using_key: str | None = None,
2228
3501
  validated_only: bool = True,
2229
3502
  df: pd.DataFrame | None = None,
2230
3503
  organism: str | None = None,
@@ -2233,13 +3506,12 @@ def update_registry(
2233
3506
  exclude: str | list | None = None,
2234
3507
  **kwargs,
2235
3508
  ) -> None:
2236
- """Save features or labels records in the default instance from the using_key instance.
3509
+ """Save features or labels records in the default instance.
2237
3510
 
2238
3511
  Args:
2239
3512
  values: A list of values to be saved as labels.
2240
3513
  field: The FieldAttr object representing the field for which labels are being saved.
2241
3514
  key: The name of the feature to save.
2242
- using_key: The name of the instance from which to transfer labels (if applicable).
2243
3515
  validated_only: If True, only save validated labels.
2244
3516
  df: A DataFrame to save labels from.
2245
3517
  organism: The organism name.
@@ -2290,22 +3562,10 @@ def update_registry(
2290
3562
  i for i in values if i not in existing_and_public_labels
2291
3563
  ]
2292
3564
 
2293
- # inspect and save validated records the using_key instance
2294
- (
2295
- labels_saved[f"from {using_key}"],
2296
- non_validated_labels,
2297
- ) = update_registry_from_using_instance(
2298
- non_validated_labels,
2299
- field=field,
2300
- using_key=using_key,
2301
- exclude=exclude,
2302
- **filter_kwargs,
2303
- )
2304
-
2305
3565
  # save non-validated/new records
2306
3566
  labels_saved["new"] = non_validated_labels
2307
3567
  if not validated_only:
2308
- non_validated_records = []
3568
+ non_validated_records: RecordList[Any] = [] # type: ignore
2309
3569
  if df is not None and registry == Feature:
2310
3570
  nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
2311
3571
  non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
@@ -2379,48 +3639,6 @@ def save_ulabels_parent(values: list[str], field: FieldAttr, key: str) -> None:
2379
3639
  is_feature.children.add(*all_records)
2380
3640
 
2381
3641
 
2382
- def update_registry_from_using_instance(
2383
- values: list[str],
2384
- field: FieldAttr,
2385
- using_key: str | None = None,
2386
- exclude: str | list | None = None,
2387
- **kwargs,
2388
- ) -> tuple[list[str], list[str]]:
2389
- """Save features or labels records from the using_key instance.
2390
-
2391
- Args:
2392
- values: A list of values to be saved as labels.
2393
- field: The FieldAttr object representing the field for which labels are being saved.
2394
- using_key: The name of the instance from which to transfer labels (if applicable).
2395
- kwargs: Additional keyword arguments to pass to the registry model.
2396
-
2397
- Returns:
2398
- A tuple containing the list of saved labels and the list of non-saved labels.
2399
- """
2400
- labels_saved = []
2401
- not_saved = values
2402
-
2403
- if using_key is not None and using_key != "default":
2404
- registry_using = get_registry_instance(field.field.model, using_key)
2405
-
2406
- inspect_result_using = inspect_instance(
2407
- values=values,
2408
- field=field,
2409
- registry=registry_using,
2410
- exclude=exclude,
2411
- **kwargs,
2412
- )
2413
- labels_using = registry_using.filter(
2414
- **{f"{field.field.name}__in": inspect_result_using.validated}
2415
- ).all()
2416
- for label_using in labels_using:
2417
- label_using.save()
2418
- labels_saved.append(getattr(label_using, field.field.name))
2419
- not_saved = inspect_result_using.non_validated
2420
-
2421
- return labels_saved, not_saved
2422
-
2423
-
2424
3642
  def _save_organism(name: str):
2425
3643
  """Save an organism record."""
2426
3644
  import bionty as bt
@@ -2445,4 +3663,121 @@ def _ref_is_name(field: FieldAttr) -> bool | None:
2445
3663
  return field.field.name == name_field
2446
3664
 
2447
3665
 
2448
- Curate = Curator # backward compat
3666
+ # backward compat constructors ------------------
3667
+
3668
+
3669
+ @classmethod # type: ignore
3670
+ def from_df(
3671
+ cls,
3672
+ df: pd.DataFrame,
3673
+ categoricals: dict[str, FieldAttr] | None = None,
3674
+ columns: FieldAttr = Feature.name,
3675
+ verbosity: str = "hint",
3676
+ organism: str | None = None,
3677
+ ) -> DataFrameCatManager:
3678
+ return DataFrameCatManager(
3679
+ df=df,
3680
+ categoricals=categoricals,
3681
+ columns=columns,
3682
+ verbosity=verbosity,
3683
+ organism=organism,
3684
+ )
3685
+
3686
+
3687
+ @classmethod # type: ignore
3688
+ def from_anndata(
3689
+ cls,
3690
+ data: ad.AnnData | UPathStr,
3691
+ var_index: FieldAttr,
3692
+ categoricals: dict[str, FieldAttr] | None = None,
3693
+ obs_columns: FieldAttr = Feature.name,
3694
+ verbosity: str = "hint",
3695
+ organism: str | None = None,
3696
+ sources: dict[str, Record] | None = None,
3697
+ ) -> AnnDataCatManager:
3698
+ return AnnDataCatManager(
3699
+ data=data,
3700
+ var_index=var_index,
3701
+ categoricals=categoricals,
3702
+ obs_columns=obs_columns,
3703
+ verbosity=verbosity,
3704
+ organism=organism,
3705
+ sources=sources,
3706
+ )
3707
+
3708
+
3709
+ @classmethod # type: ignore
3710
+ def from_mudata(
3711
+ cls,
3712
+ mdata: MuData,
3713
+ var_index: dict[str, dict[str, FieldAttr]],
3714
+ categoricals: dict[str, FieldAttr] | None = None,
3715
+ verbosity: str = "hint",
3716
+ organism: str | None = None,
3717
+ ) -> MuDataCatManager:
3718
+ return MuDataCatManager(
3719
+ mdata=mdata,
3720
+ var_index=var_index,
3721
+ categoricals=categoricals,
3722
+ verbosity=verbosity,
3723
+ organism=organism,
3724
+ )
3725
+
3726
+
3727
+ @classmethod # type: ignore
3728
+ def from_tiledbsoma(
3729
+ cls,
3730
+ experiment_uri: UPathStr,
3731
+ var_index: dict[str, tuple[str, FieldAttr]],
3732
+ categoricals: dict[str, FieldAttr] | None = None,
3733
+ obs_columns: FieldAttr = Feature.name,
3734
+ organism: str | None = None,
3735
+ sources: dict[str, Record] | None = None,
3736
+ exclude: dict[str, str | list[str]] | None = None,
3737
+ ) -> TiledbsomaCatManager:
3738
+ return TiledbsomaCatManager(
3739
+ experiment_uri=experiment_uri,
3740
+ var_index=var_index,
3741
+ categoricals=categoricals,
3742
+ obs_columns=obs_columns,
3743
+ organism=organism,
3744
+ sources=sources,
3745
+ exclude=exclude,
3746
+ )
3747
+
3748
+
3749
+ @classmethod # type: ignore
3750
+ def from_spatialdata(
3751
+ cls,
3752
+ sdata,
3753
+ var_index: dict[str, FieldAttr],
3754
+ categoricals: dict[str, dict[str, FieldAttr]] | None = None,
3755
+ organism: str | None = None,
3756
+ sources: dict[str, dict[str, Record]] | None = None,
3757
+ exclude: dict[str, dict] | None = None,
3758
+ verbosity: str = "hint",
3759
+ *,
3760
+ sample_metadata_key: str = "sample",
3761
+ ):
3762
+ try:
3763
+ import spatialdata
3764
+ except ImportError as e:
3765
+ raise ImportError("Please install spatialdata: pip install spatialdata") from e
3766
+
3767
+ return SpatialDataCatManager(
3768
+ sdata=sdata,
3769
+ var_index=var_index,
3770
+ categoricals=categoricals,
3771
+ verbosity=verbosity,
3772
+ organism=organism,
3773
+ sources=sources,
3774
+ exclude=exclude,
3775
+ sample_metadata_key=sample_metadata_key,
3776
+ )
3777
+
3778
+
3779
+ CatManager.from_df = from_df # type: ignore
3780
+ CatManager.from_anndata = from_anndata # type: ignore
3781
+ CatManager.from_mudata = from_mudata # type: ignore
3782
+ CatManager.from_spatialdata = from_spatialdata # type: ignore
3783
+ CatManager.from_tiledbsoma = from_tiledbsoma # type: ignore