PyPI - lamindb - Versions diffs - 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl - Mend

lamindb-1.10.1-py3-none-any.whl → lamindb-1.11.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

lamindb/__init__.py +89 -49
lamindb/_finish.py +17 -15
lamindb/_tracked.py +2 -4
lamindb/_view.py +1 -1
lamindb/base/__init__.py +2 -1
lamindb/base/dtypes.py +76 -0
lamindb/core/_settings.py +45 -2
lamindb/core/storage/_anndata_accessor.py +118 -26
lamindb/core/storage/_backed_access.py +10 -7
lamindb/core/storage/_spatialdata_accessor.py +15 -4
lamindb/core/storage/_zarr.py +3 -0
lamindb/curators/_legacy.py +16 -3
lamindb/curators/core.py +449 -193
lamindb/errors.py +6 -0
lamindb/examples/cellxgene/__init__.py +8 -3
lamindb/examples/cellxgene/_cellxgene.py +127 -13
lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
lamindb/examples/croissant/__init__.py +32 -6
lamindb/examples/datasets/__init__.py +2 -2
lamindb/examples/datasets/_core.py +9 -2
lamindb/examples/datasets/_small.py +66 -22
lamindb/examples/fixtures/sheets.py +8 -2
lamindb/integrations/_croissant.py +34 -11
lamindb/migrations/0118_alter_recordproject_value_projectrecord.py +99 -0
lamindb/migrations/0119_rename_records_project_linked_in_records.py +26 -0
lamindb/migrations/{0117_squashed.py → 0119_squashed.py} +92 -5
lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
lamindb/migrations/0121_recorduser.py +60 -0
lamindb/models/__init__.py +4 -1
lamindb/models/_describe.py +2 -2
lamindb/models/_feature_manager.py +131 -71
lamindb/models/_from_values.py +2 -2
lamindb/models/_is_versioned.py +4 -4
lamindb/models/_label_manager.py +4 -4
lamindb/models/artifact.py +357 -192
lamindb/models/artifact_set.py +45 -1
lamindb/models/can_curate.py +1 -2
lamindb/models/collection.py +3 -34
lamindb/models/feature.py +111 -7
lamindb/models/has_parents.py +11 -11
lamindb/models/project.py +42 -2
lamindb/models/query_manager.py +16 -7
lamindb/models/query_set.py +191 -78
lamindb/models/record.py +30 -5
lamindb/models/run.py +10 -33
lamindb/models/save.py +6 -8
lamindb/models/schema.py +54 -26
lamindb/models/sqlrecord.py +152 -40
lamindb/models/storage.py +59 -14
lamindb/models/transform.py +17 -17
lamindb/models/ulabel.py +6 -1
{lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/METADATA +11 -16
{lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/RECORD +55 -50
{lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/LICENSE +0 -0
{lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/WHEEL +0 -0

lamindb/curators/core.py CHANGED Viewed

@@ -5,6 +5,7 @@
    Curator
    SlotsCurator
+   ComponentCurator
    CatVector
    CatLookup
    DataFrameCatManager
@@ -15,7 +16,6 @@ from __future__ import annotations
 import copy
 import re
-from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any, Callable
 import lamindb_setup as ln_setup
@@ -24,7 +24,9 @@ import pandas as pd
 import pandera.pandas as pandera
 from lamin_utils import colors, logger
 from lamindb_setup.core._docs import doc_args
+from lamindb_setup.core.upath import LocalPathClasses
+from lamindb.base.dtypes import check_dtype
 from lamindb.base.types import FieldAttr  # noqa
 from lamindb.models import (
     Artifact,
@@ -48,6 +50,7 @@ from lamindb.models.feature import (
 from ..errors import InvalidArgument, ValidationError
 if TYPE_CHECKING:
+    from collections.abc import Iterable
     from typing import Any
     from anndata import AnnData
@@ -145,6 +148,7 @@ CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries."""
 SLOTS_DOCSTRING = """Access sub curators by slot."""
+SLOTS_DETAILS_DOCSTRING = """Uses **slots** to specify which component contains which schema. Slots are keys that identify where features are stored within composite data structures."""
 VALIDATE_DOCSTRING = """Validate dataset against Schema.
@@ -197,7 +201,21 @@ class Curator:
                 "MuData",
                 "SpatialData",
             }:
-                self._dataset = self._dataset.load(is_run_input=False)
+                # Open remote AnnData Artifacts
+                if not isinstance(self._artifact.path, LocalPathClasses):
+                    if self._artifact.otype in {
+                        "AnnData",
+                    }:
+                        try:
+                            self._dataset = self._dataset.open(mode="r")
+                        # open can raise various errors. Fall back to loading into memory if open fails
+                        except Exception as e:
+                            logger.warning(
+                                f"Unable to open remote AnnData Artifact: {e}. Falling back to loading into memory."
+                            )
+                            self._dataset = self._dataset.load(is_run_input=False)
+                else:
+                    self._dataset = self._dataset.load(is_run_input=False)
         self._schema: Schema | None = schema
         self._is_validated: bool = False
@@ -284,9 +302,12 @@ class Curator:
         )
+@doc_args(SLOTS_DETAILS_DOCSTRING)
 class SlotsCurator(Curator):
     """Curator for a dataset with slots.
+    {}
     Args:
         dataset: The dataset to validate & annotate.
         schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
@@ -298,7 +319,7 @@ class SlotsCurator(Curator):
         schema: Schema,
     ) -> None:
         super().__init__(dataset=dataset, schema=schema)
-        self._slots: dict[str, DataFrameCurator] = {}
+        self._slots: dict[str, ComponentCurator] = {}
         # used for multimodal data structures (not AnnData)
         # in form of {table/modality_key: var_field}
@@ -308,7 +329,7 @@ class SlotsCurator(Curator):
     @property
     @doc_args(SLOTS_DOCSTRING)
-    def slots(self) -> dict[str, DataFrameCurator]:
+    def slots(self) -> dict[str, ComponentCurator]:
         """{}"""  # noqa: D415
         return self._slots
@@ -336,6 +357,10 @@ class SlotsCurator(Curator):
         if self._artifact is None:
             type_mapping = [
+                (
+                    lambda dataset: isinstance(dataset, pd.DataFrame),
+                    Artifact.from_dataframe,
+                ),
                 (
                     lambda dataset: data_is_scversedatastructure(dataset, "AnnData"),
                     Artifact.from_anndata,
@@ -364,12 +389,13 @@ class SlotsCurator(Curator):
                     )
                     break
-            self._artifact.schema = self._schema
-            self._artifact.save()
         cat_vectors = {}
         for curator in self._slots.values():
             for key, cat_vector in curator.cat._cat_vectors.items():
                 cat_vectors[key] = cat_vector
+        self._artifact.schema = self._schema
+        self._artifact.save()
         return annotate_artifact(  # type: ignore
             self._artifact,
             curator=self,
@@ -377,92 +403,21 @@ class SlotsCurator(Curator):
         )
-def is_list_of_type(value, expected_type):
-    """Helper function to check if a value is either of expected_type or a list of that type, or a mix of both in a nested structure."""
-    if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
-        # handle nested lists recursively
-        return all(is_list_of_type(item, expected_type) for item in value)
-    return isinstance(value, expected_type)
-def check_dtype(expected_type) -> Callable:
-    """Creates a check function for Pandera that validates a column's dtype.
-    Supports both standard dtype checking and mixed list/single values for the same type.
-    For example, a column with expected_type 'float' would also accept a mix of float values and lists of floats.
-    Args:
-        expected_type: String identifier for the expected type ('int', 'float', 'num', 'str')
-    Returns:
-        A function that checks if a series has the expected dtype or contains mixed types
-    """
-    def check_function(series):
-        # first check if the series is entirely of the expected dtype (fast path)
-        if expected_type == "int" and pd.api.types.is_integer_dtype(series.dtype):
-            return True
-        elif expected_type == "float" and pd.api.types.is_float_dtype(series.dtype):
-            return True
-        elif expected_type == "num" and pd.api.types.is_numeric_dtype(series.dtype):
-            return True
-        elif expected_type == "str" and pd.api.types.is_string_dtype(series.dtype):
-            return True
-        elif expected_type == "path" and pd.api.types.is_string_dtype(series.dtype):
-            return True
-        # if we're here, it might be a mixed column with object dtype
-        # need to check each value individually
-        if series.dtype == "object" and expected_type.startswith("list"):
-            expected_type_member = expected_type.replace("list[", "").removesuffix("]")
-            if expected_type_member == "int":
-                return series.apply(lambda x: is_list_of_type(x, int)).all()
-            elif expected_type_member == "float":
-                return series.apply(lambda x: is_list_of_type(x, float)).all()
-            elif expected_type_member == "num":
-                # for numeric, accept either int or float
-                return series.apply(lambda x: is_list_of_type(x, (int, float))).all()
-            elif (
-                expected_type_member == "str"
-                or expected_type_member == "path"
-                or expected_type_member.startswith("cat[")
-            ):
-                return series.apply(lambda x: is_list_of_type(x, str)).all()
-        # if we get here, the validation failed
-        return False
-    return check_function
-# this is also currently used as DictCurator
-class DataFrameCurator(Curator):
-    # the example in the docstring is tested in test_curators_quickstart_example
+# This is also currently used as DictCurator by flattening dictionaries into wide DataFrames.
+# Such an approach was never intended and there is room for a DictCurator in the future.
+# For more context, read https://laminlabs.slack.com/archives/C07DB677JF6/p1753994077716099 and
+# https://www.notion.so/laminlabs/Add-a-DictCurator-2422aeaa55e180b9a513f91d13970836
+class ComponentCurator(Curator):
     """Curator for `DataFrame`.
+    Provides all key functionality to validate Pandas DataFrames.
+    This class is not user facing unlike :class:`~lamindb.curators.DataFrameCurator` which extends this
+    class with functionality to validate the `attrs` slot.
     Args:
         dataset: The DataFrame-like object to validate & annotate.
         schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
         slot: Indicate the slot in a composite curator for a composite data structure.
-    Example:
-        For simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_df`.
-        Here is an example that enforces a minimal set of columns in the dataframe.
-        .. literalinclude:: scripts/curate_dataframe_minimal_errors.py
-            :language: python
-        Under-the-hood, this used the following schema.
-        .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
-            :language: python
-        Valid features & labels were defined as:
-        .. literalinclude:: scripts/define_mini_immuno_features_labels.py
-            :language: python
     """
     def __init__(
@@ -478,18 +433,18 @@ class DataFrameCurator(Curator):
         feature_ids: set[int] = set()
         if schema.flexible:
-            features += Feature.filter(name__in=self._dataset.keys()).list()
+            features += Feature.filter(name__in=self._dataset.keys()).to_list()
             feature_ids = {feature.id for feature in features}
         if schema.n > 0:
             if schema._index_feature_uid is not None:
                 schema_features = [
                     feature
-                    for feature in schema.members.list()
+                    for feature in schema.members.to_list()
                     if feature.uid != schema._index_feature_uid  # type: ignore
                 ]
             else:
-                schema_features = schema.members.list()  # type: ignore
+                schema_features = schema.members.to_list()  # type: ignore
             if feature_ids:
                 features.extend(
                     feature
@@ -580,9 +535,13 @@ class DataFrameCurator(Curator):
         # in the DataFrameCatManager, we use the
         # actual columns of the dataset, not the pandera columns
         # the pandera columns might have additional optional columns
+        if schema.itype == "Composite":
+            columns_field = Feature.name
+        else:
+            columns_field = parse_cat_dtype(schema.itype, is_itype=True)["field"]
         self._cat_manager = DataFrameCatManager(
             self._dataset,
-            columns_field=parse_cat_dtype(schema.itype, is_itype=True)["field"],
+            columns_field=columns_field,
             categoricals=categoricals,
             index=schema.index,
             slot=slot,
@@ -601,6 +560,11 @@ class DataFrameCurator(Curator):
         - Adds missing columns for features
         - Fills missing values for features with default values
         """
+        if self._artifact is not None:
+            raise RuntimeError(
+                "Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator."
+            )
         for feature in self._schema.members:
             if feature.name not in self._dataset.columns:
                 if feature.default_value is not None or feature.nullable:
@@ -679,35 +643,262 @@ class DataFrameCurator(Curator):
         if not self._is_validated:
             self.validate()  # raises ValidationError if doesn't validate
         if self._artifact is None:
-            self._artifact = Artifact.from_df(
+            self._artifact = Artifact.from_dataframe(
                 self._dataset,
                 key=key,
                 description=description,
                 revises=revises,
                 run=run,
-                format=".csv" if key.endswith(".csv") else None,
+                format=".csv" if key is not None and key.endswith(".csv") else None,
             )
-            self._artifact.schema = self._schema
-            self._artifact.save()
+        self._artifact.schema = self._schema
+        self._artifact.save()
         return annotate_artifact(  # type: ignore
             self._artifact,
             cat_vectors=self.cat._cat_vectors,
         )
+class DataFrameCurator(SlotsCurator):
+    # the example in the docstring is tested in test_curators_quickstart_example
+    """Curator for `DataFrame`.
+    Validates the columns of the dataframe through an internal :class:`ComponentCurator`
+    (only created if the schema has features) and, if the schema defines `attrs` slots,
+    validates the (nested) `attrs` dictionary through one :class:`ComponentCurator` per slot.
+    Args:
+        dataset: The DataFrame-like object to validate & annotate.
+        schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
+        slot: Indicate the slot in a composite curator for a composite data structure.
+            `attrs` slots of the schema are only processed if `slot` is `None`.
+    Raises:
+        ValueError: If the schema defines a slot whose name doesn't start with `attrs`.
+    Examples:
+        For a simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_dataframe`.
+        Here is an example that enforces a minimal set of columns in the dataframe.
+        .. literalinclude:: scripts/curate_dataframe_minimal_errors.py
+            :language: python
+        Under-the-hood, this used the following schema.
+        .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
+            :language: python
+        Valid features & labels were defined as:
+        .. literalinclude:: scripts/define_mini_immuno_features_labels.py
+            :language: python
+        It is also possible to curate the `attrs` slot.
+        .. literalinclude:: scripts/curate_dataframe_attrs.py
+            :language: python
+    """
+    def __init__(
+        self,
+        dataset: pd.DataFrame | Artifact,
+        schema: Schema,
+        slot: str | None = None,
+    ) -> None:
+        super().__init__(dataset=dataset, schema=schema)
+        # Create atomic curator for features only; schemas without features
+        # (e.g. attrs-only schemas) never get an `_atomic_curator` attribute
+        if len(self._schema.features.all()) > 0:
+            self._atomic_curator = ComponentCurator(
+                dataset=dataset,
+                schema=schema,
+                slot=slot,
+            )
+        # Handle (nested) attrs
+        if slot is None and schema.slots:
+            for slot_name, slot_schema in schema.slots.items():
+                if not slot_name.startswith("attrs"):
+                    raise ValueError(
+                        f"Slot '{slot_name}' is not supported for DataFrameCurator. Must be 'attrs'."
+                    )
+                attrs_dict = getattr(self._dataset, "attrs", None)
+                if attrs_dict is None:
+                    # nothing to curate for this slot
+                    continue
+                # "attrs" targets the whole dict, "attrs:key1:key2" a nested dict
+                path_parts = slot_name.split(":")
+                if len(path_parts) == 1:
+                    data = attrs_dict
+                else:
+                    data = _resolve_schema_slot_path(
+                        attrs_dict, path_parts[1:], slot_name, "attrs"
+                    )
+                # wrap the dict into a single-row DataFrame so that it can be
+                # validated like any other dataframe
+                self._slots[slot_name] = ComponentCurator(
+                    pd.DataFrame([data]), slot_schema, slot=slot_name
+                )
+    @property
+    def cat(self) -> DataFrameCatManager:
+        """Manage categoricals by updating registries."""
+        if hasattr(self, "_atomic_curator"):
+            return self._atomic_curator.cat
+        raise AttributeError("cat is only available for slots DataFrameCurator")
+    def standardize(self) -> None:
+        """Standardize the dataset.
+        - Adds missing columns for features
+        - Fills missing values for features with default values
+        If the schema has features, only the columns are standardized; otherwise
+        every slot curator is standardized.
+        """
+        if hasattr(self, "_atomic_curator"):
+            self._atomic_curator.standardize()
+        else:
+            for slot_curator in self._slots.values():
+                slot_curator.standardize()
+    @doc_args(VALIDATE_DOCSTRING)
+    def validate(self) -> None:
+        """{}."""
+        if hasattr(self, "_atomic_curator"):
+            self._atomic_curator.validate()
+            self._is_validated = self._atomic_curator._is_validated
+        # slots are only validated for composite schemas
+        if self._schema.itype == "Composite":
+            super().validate()
+    @doc_args(SAVE_ARTIFACT_DOCSTRING)
+    def save_artifact(
+        self, *, key=None, description=None, revises=None, run=None
+    ) -> Artifact:
+        """{}."""
+        if not self._is_validated:
+            self.validate()
+        # `_atomic_curator` only exists if the schema has features, see `__init__`
+        atomic_curator = getattr(self, "_atomic_curator", None)
+        if not self._slots:
+            if atomic_curator is None:
+                # same exception type as the bare attribute access raised before
+                raise AttributeError(
+                    "Cannot save an artifact: the schema defines neither features nor `attrs` slots."
+                )
+            return atomic_curator.save_artifact(
+                key=key, description=description, revises=revises, run=run
+            )
+        if atomic_curator is None:
+            # attrs-only schema: there is no column-level curator to register
+            return super().save_artifact(
+                key=key, description=description, revises=revises, run=run
+            )
+        # temporarily expose the column-level curator as a slot so that
+        # SlotsCurator.save_artifact also collects its categorical vectors
+        self._slots["columns"] = atomic_curator
+        try:
+            return super().save_artifact(
+                key=key, description=description, revises=revises, run=run
+            )
+        finally:
+            del self._slots["columns"]
+def _resolve_schema_slot_path(
+    target_dict: dict[str, Any], slot_keys: Iterable[str], slot: str, base_path: str
+) -> Any:
+    """Resolve a schema slot path by traversing nested dictionary keys.
+    Args:
+        target_dict: Root dictionary to traverse.
+        slot_keys: Keys to follow, in order, starting at `target_dict`.
+        slot: Schema slot identifier, only used in the error message.
+        base_path: Label of the root (e.g. `"attrs"`), only used in the error message.
+    Returns:
+        The value at the resolved path; `target_dict` itself if `slot_keys` is empty.
+    Raises:
+        InvalidArgument: If a key is missing or an intermediate value can't be indexed by key.
+    """
+    current = target_dict
+    for key in slot_keys:
+        base_path += f"['{key}']"
+        try:
+            current = current[key]
+        # TypeError covers intermediate values that aren't mappings (e.g. a str, list
+        # or None); those are the values the "not a dict" hint below is meant for
+        except (KeyError, TypeError):
+            available = (
+                list(current.keys()) if isinstance(current, dict) else "not a dict"
+            )
+            raise InvalidArgument(
+                f"Schema slot '{slot}' requires keys {base_path} but key '{key}' "
+                f"not found. Available keys at this level: {available}"
+            ) from None
+    return current
+def _handle_dict_slots(
+    dataset: ScverseDataStructures, slot: str
+) -> tuple[pd.DataFrame | None, str | None, str | None]:
+    """Handle dict-based slot paths (uns/attrs standalone or of modalities) for all ScverseCurators.
+    Supports two patterns:
+        - Direct dict access: "uns", "attrs", "uns:key1:key2", "attrs:key"
+        - Modality dict access: "modality:uns", "modality:uns:key1:key2"
+    Args:
+        dataset: The scverse datastructure object
+        slot: The slot path string to parse like 'uns:path:to'.
+    Returns:
+        tuple: (dataframe, modality_key, remaining_slot_path)
+            - dataframe: Single-row DataFrame wrapping the resolved data
+            - modality_key: Modality identifier if slot targets modality dict, else None
+            - remaining_slot_path: The dict attribute name for a bare dict slot, the nested
+              keys joined by ":" for a nested direct slot, and the dict attribute plus
+              nested keys joined by ":" for a nested modality slot
+        `(None, None, None)` if the dict attribute is missing (or `None`) or if the
+        modality lookup fails.
+    Raises:
+        InvalidArgument: If `slot` matches neither pattern.
+    """
+    dict_names = ("uns", "attrs")
+    path_parts = slot.split(":")
+    # Handle direct dict slots: "uns", "attrs", "uns:key1:key2:..."
+    # (`str.split` always yields at least one element, so indexing [0] is safe)
+    if path_parts[0] in dict_names:
+        dict_attr = getattr(dataset, path_parts[0], None)
+        if dict_attr is not None:
+            if len(path_parts) == 1:
+                return pd.DataFrame([dict_attr]), None, path_parts[0]
+            deeper_keys = path_parts[1:]
+            data = _resolve_schema_slot_path(
+                dict_attr, deeper_keys, slot, path_parts[0]
+            )
+            return pd.DataFrame([data]), None, ":".join(deeper_keys)
+    # Handle modality dict slots: "modality:uns", "modality:uns:key1:key2"
+    elif len(path_parts) >= 2 and path_parts[1] in dict_names:
+        modality, dict_name = path_parts[0], path_parts[1]
+        # keep the try minimal: only a failing modality lookup should be turned into
+        # the empty result, errors from the steps below must not be swallowed
+        try:
+            modality_dataset = dataset[modality]
+        except (KeyError, AttributeError):
+            return None, None, None
+        dict_attr = getattr(modality_dataset, dict_name, None)
+        if dict_attr is not None:
+            if len(path_parts) == 2:
+                return pd.DataFrame([dict_attr]), modality, dict_name
+            deeper_keys = path_parts[2:]
+            data = _resolve_schema_slot_path(
+                dict_attr, deeper_keys, slot, f"{modality}.{dict_name}"
+            )
+            return pd.DataFrame([data]), modality, ":".join(path_parts[1:])
+    else:
+        raise InvalidArgument(
+            f"Invalid dict slot pattern '{slot}'. Expected formats: "
+            f"'uns', 'attrs', 'uns:key', 'attrs:key', 'modality:uns'"
+        )
+    # the dict attribute doesn't exist on the dataset/modality or is None
+    return None, None, None
+@doc_args(SLOTS_DETAILS_DOCSTRING)
 class AnnDataCurator(SlotsCurator):
     """Curator for `AnnData`.
+    {}
     Args:
         dataset: The AnnData-like object to validate & annotate.
         schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
-    Example:
+    Examples:
+        Curate Ensembl gene IDs and valid features in obs:
         .. literalinclude:: scripts/curate_anndata_flexible.py
             :language: python
             :caption: curate_anndata_flexible.py
+        Curate `uns` dictionary:
+        .. literalinclude:: scripts/curate_anndata_uns.py
+            :language: python
+            :caption: curate_anndata_uns.py
     See Also:
         :meth:`~lamindb.Artifact.from_anndata`.
     """
@@ -720,34 +911,37 @@ class AnnDataCurator(SlotsCurator):
         super().__init__(dataset=dataset, schema=schema)
         if not data_is_scversedatastructure(self._dataset, "AnnData"):
             raise InvalidArgument("dataset must be AnnData-like.")
-        if schema.otype != "AnnData":
+        if schema.otype and schema.otype != "AnnData":
             raise InvalidArgument("Schema otype must be 'AnnData'.")
-        self._slots = {
-            slot: DataFrameCurator(
-                (
+        for slot, slot_schema in schema.slots.items():
+            if slot not in {"var", "var.T", "obs"} and not slot.startswith("uns"):
+                raise ValueError(
+                    f"AnnDataCurator currently only supports the slots 'var', 'var.T', 'obs', and 'uns', not {slot}"
+                )
+            if slot.startswith("uns"):
+                df, _, _ = _handle_dict_slots(self._dataset, slot)
+            elif slot in {"obs", "var", "var.T"}:
+                df = (
                     getattr(self._dataset, slot.strip(".T")).T
                     if slot == "var.T"
                     or (
-                        # backward compat
                         slot == "var"
                         and schema.slots["var"].itype not in {None, "Feature"}
                     )
                     else getattr(self._dataset, slot)
-                ),
-                slot_schema,
-                slot=slot,
-            )
-            for slot, slot_schema in schema.slots.items()
-            if slot in {"obs", "var", "var.T", "uns"}
-        }
-        if "var" in self._slots and schema.slots["var"].itype not in {None, "Feature"}:
-            logger.warning(
-                "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
-            )
-            self._slots["var"].cat._cat_vectors["var_index"] = self._slots[
-                "var"
-            ].cat._cat_vectors.pop("columns")
-            self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index"
+                )
+            self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)
+            # Handle var index naming for backward compat
+            if slot == "var" and schema.slots["var"].itype not in {None, "Feature"}:
+                logger.warning(
+                    "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
+                )
+                self._slots["var"].cat._cat_vectors["var_index"] = self._slots[
+                    "var"
+                ].cat._cat_vectors.pop("columns")
+                self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index"
 def _assign_var_fields_categoricals_multimodal(
@@ -757,11 +951,10 @@ def _assign_var_fields_categoricals_multimodal(
     slot_schema: Schema,
     var_fields: dict[str, FieldAttr],
     cat_vectors: dict[str, dict[str, CatVector]],
-    slots: dict[str, DataFrameCurator],
+    slots: dict[str, ComponentCurator],
 ) -> None:
     """Assigns var_fields and categoricals for multimodal data curators."""
     if modality is not None:
-        # Makes sure that all tables are present
         var_fields[modality] = None
         cat_vectors[modality] = {}
@@ -782,15 +975,17 @@ def _assign_var_fields_categoricals_multimodal(
             cat_vectors[modality] = obs_fields
+@doc_args(SLOTS_DETAILS_DOCSTRING)
 class MuDataCurator(SlotsCurator):
     """Curator for `MuData`.
+    {}
     Args:
         dataset: The MuData-like object to validate & annotate.
         schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
     Example:
         .. literalinclude:: scripts/curate_mudata.py
             :language: python
             :caption: curate_mudata.py
@@ -811,12 +1006,32 @@ class MuDataCurator(SlotsCurator):
             raise InvalidArgument("Schema otype must be 'MuData'.")
         for slot, slot_schema in schema.slots.items():
-            if ":" in slot:
-                modality, modality_slot = slot.split(":")
-                schema_dataset = self._dataset.__getitem__(modality)
+            # Handle slots: "mdata.uns", "modality:uns"
+            if "uns" in slot:
+                df, modality, modality_slot = _handle_dict_slots(self._dataset, slot)
             else:
-                modality, modality_slot = None, slot
-                schema_dataset = self._dataset
+                # Handle slots: "modality:obs", "modality:var"
+                parts = slot.split(":")
+                if len(parts) == 2:
+                    modality, modality_slot = parts
+                    try:
+                        schema_dataset = self._dataset[modality]
+                        df = getattr(schema_dataset, modality_slot.rstrip(".T"))
+                    except KeyError:
+                        raise InvalidArgument(
+                            f"Modality '{modality}' not found in MuData"
+                        ) from None
+                    except AttributeError:
+                        raise InvalidArgument(
+                            f"Attribute '{modality_slot}' not found on modality '{modality}'"
+                        ) from None
+                else:
+                    # Handle slots: "mdata:obs", "mdata:var" (uns is a dictionary and gets handled above)
+                    modality, modality_slot = None, slot
+                    schema_dataset = self._dataset
+                    df = getattr(schema_dataset, modality_slot.rstrip(".T"))
+            # Transpose var if necessary
             if modality_slot == "var" and schema.slots[slot].itype not in {
                 None,
                 "Feature",
@@ -824,19 +1039,12 @@ class MuDataCurator(SlotsCurator):
                 logger.warning(
                     "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
                 )
-            self._slots[slot] = DataFrameCurator(
-                (
-                    getattr(schema_dataset, modality_slot.rstrip(".T")).T
-                    if modality_slot == "var.T"
-                    or (
-                        # backward compat
-                        modality_slot == "var"
-                        and schema.slots[slot].itype not in {None, "Feature"}
-                    )
-                    else getattr(schema_dataset, modality_slot)
-                ),
-                slot_schema,
-            )
+                df = df.T
+            elif modality_slot == "var.T":
+                df = df.T
+            self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)
             _assign_var_fields_categoricals_multimodal(
                 modality=modality,
                 slot_type=modality_slot,
@@ -846,18 +1054,21 @@ class MuDataCurator(SlotsCurator):
                 cat_vectors=self._cat_vectors,
                 slots=self._slots,
             )
         self._columns_field = self._var_fields
+@doc_args(SLOTS_DETAILS_DOCSTRING)
 class SpatialDataCurator(SlotsCurator):
     """Curator for `SpatialData`.
+    {}
     Args:
         dataset: The SpatialData-like object to validate & annotate.
         schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
     Example:
         .. literalinclude:: scripts/curate_spatialdata.py
             :language: python
             :caption: curate_spatialdata.py
@@ -878,69 +1089,75 @@ class SpatialDataCurator(SlotsCurator):
             raise InvalidArgument("Schema otype must be 'SpatialData'.")
         for slot, slot_schema in schema.slots.items():
-            split_result = slot.split(":")
-            if (len(split_result) == 2 and split_result[0] == "table") or (
-                len(split_result) == 3 and split_result[0] == "tables"
-            ):
-                if len(split_result) == 2:
-                    table_key, sub_slot = split_result
-                    logger.warning(
-                        f"please prefix slot {slot} with 'tables:' going forward"
-                    )
+            # Handle slots: "sdata:attrs"
+            if slot.startswith("attrs"):
+                df, table_key, table_slot = _handle_dict_slots(self._dataset, slot)
+            else:
+                parts = slot.split(":")
+                # Handle slots: "tables:table_key:obs", "tables:table_key:var"
+                if len(parts) == 3 and parts[0] == "tables":
+                    table_key, table_slot = parts[1], parts[2]
+                    try:
+                        slot_object = self._dataset.tables[table_key]
+                        df = getattr(slot_object, table_slot.rstrip(".T"))
+                    except KeyError:
+                        raise InvalidArgument(
+                            f"Table '{table_key}' not found in sdata.tables"
+                        ) from None
+                    except AttributeError:
+                        raise InvalidArgument(
+                            f"Attribute '{table_slot}' not found on table '{table_key}'"
+                        ) from None
                 else:
-                    table_key, sub_slot = split_result[1], split_result[2]
-                slot_object = self._dataset.tables.__getitem__(table_key)
-                if sub_slot == "var" and schema.slots[slot].itype not in {
-                    None,
-                    "Feature",
-                }:
-                    logger.warning(
-                        "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
-                    )
-                data_object = (
-                    getattr(slot_object, sub_slot.rstrip(".T")).T
-                    if sub_slot == "var.T"
-                    or (
-                        # backward compat
-                        sub_slot == "var"
-                        and schema.slots[slot].itype not in {None, "Feature"}
-                    )
-                    else getattr(slot_object, sub_slot)
-                )
-            elif len(split_result) == 1 or (
-                len(split_result) > 1 and split_result[0] == "attrs"
-            ):
-                table_key = None
-                if len(split_result) == 1:
-                    if split_result[0] != "attrs":
+                    # Handle legacy single keys for backward compatibility
+                    if len(parts) == 1 and parts[0] != "attrs":
                         logger.warning(
                             f"please prefix slot {slot} with 'attrs:' going forward"
                         )
-                        sub_slot = slot
-                        data_object = self._dataset.attrs[slot]
+                        try:
+                            df = pd.DataFrame([self._dataset.attrs[slot]])
+                            table_key = None
+                            table_slot = slot
+                        except KeyError:
+                            raise InvalidArgument(
+                                f"Slot '{slot}' not found in sdata.attrs"
+                            ) from None
                     else:
-                        sub_slot = "attrs"
-                        data_object = self._dataset.attrs
-                elif len(split_result) == 2:
-                    sub_slot = split_result[1]
-                    data_object = self._dataset.attrs[split_result[1]]
-                data_object = pd.DataFrame([data_object])
-            self._slots[slot] = DataFrameCurator(data_object, slot_schema, slot)
+                        raise InvalidArgument(f"Unrecognized slot format: {slot}")
+            # Handle var transposition logic
+            if table_slot == "var" and schema.slots[slot].itype not in {
+                None,
+                "Feature",
+            }:
+                logger.warning(
+                    "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
+                )
+                df = df.T
+            elif table_slot == "var.T":
+                df = df.T
+            self._slots[slot] = ComponentCurator(df, slot_schema, slot)
             _assign_var_fields_categoricals_multimodal(
                 modality=table_key,
-                slot_type=sub_slot,
+                slot_type=table_slot,
                 slot=slot,
                 slot_schema=slot_schema,
                 var_fields=self._var_fields,
                 cat_vectors=self._cat_vectors,
                 slots=self._slots,
             )
         self._columns_field = self._var_fields
+@doc_args(SLOTS_DETAILS_DOCSTRING)
 class TiledbsomaExperimentCurator(SlotsCurator):
     """Curator for `tiledbsoma.Experiment`.
+    {}
     Args:
         dataset: The `tiledbsoma.Experiment` object.
         schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
@@ -977,7 +1194,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
                     .drop("soma_joinid", axis=1, errors="ignore")
                 )
-                self._slots[slot] = DataFrameCurator(
+                self._slots[slot] = ComponentCurator(
                     (schema_dataset.T if modality_slot == "var.T" else schema_dataset),
                     slot_schema,
                 )
@@ -990,7 +1207,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
                     .to_pandas()
                     .drop(["soma_joinid", "obs_id"], axis=1, errors="ignore")
                 )
-                self._slots[slot] = DataFrameCurator(
+                self._slots[slot] = ComponentCurator(
                     schema_dataset,
                     slot_schema,
                 )
@@ -1040,9 +1257,12 @@ class CatVector:
         self._maximal_set = maximal_set
         self._all_filters = {"source": self._source, "organism": self._organism}
         if self._subtype_str and "=" in self._subtype_str:
             self._all_filters.update(
-                resolve_relation_filters(parse_filter_string(self._subtype_str), self)  # type: ignore
+                resolve_relation_filters(
+                    parse_filter_string(self._subtype_str), self._field.field.model
+                )  # type: ignore
             )
         if hasattr(field.field.model, "_name_field"):
@@ -1241,7 +1461,7 @@ class CatVector:
             type_record = registry.get(name=self._subtype_str)
         if df is not None and registry == Feature:
             nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
-            non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
+            non_validated_records = Feature.from_dataframe(df.loc[:, nonval_columns])
         else:
             if (
                 self._organism
@@ -1343,7 +1563,7 @@ class CatVector:
                     warning_message += "\n    for remaining terms:\n"
                 warning_message += f"    → fix typos, remove non-existent values, or save terms via: {colors.cyan(non_validated_hint_print)}"
                 if self._subtype_query_set is not None:
-                    warning_message += f"\n    → a valid label for subtype '{self._subtype_str}' has to be one of {self._subtype_query_set.list('name')}"
+                    warning_message += f"\n    → a valid label for subtype '{self._subtype_str}' has to be one of {self._subtype_query_set.to_list('name')}"
             logger.info(f'mapping "{self._key}" on {colors.italic(model_field)}')
             logger.warning(warning_message)
             if self._cat_manager is not None:
@@ -1493,6 +1713,30 @@ class DataFrameCatManager:
         """The categorical features."""
         return self._categoricals
+    def __repr__(self) -> str:
+        cls_name = colors.green(self.__class__.__name__)
+        status_str = (
+            f"{colors.green('validated')}"
+            if self._is_validated
+            else f"{colors.yellow('unvalidated')}"
+        )
+        info_parts = []
+        cat_count = len(self._categoricals)
+        if cat_count > 0:
+            info_parts.append(f"categorical_features={cat_count}")
+        if self._slot:
+            info_parts.append(f"slot: {colors.italic(self._slot)}")
+        info_str = ", ".join(info_parts)
+        if info_str:
+            return f"{cls_name}({info_str}, {status_str})"
+        else:
+            return f"{cls_name}({status_str})"
     def lookup(self, public: bool = False) -> CatLookup:
         """Lookup categories.
@@ -1537,7 +1781,9 @@ class DataFrameCatManager:
             key: The key referencing the column in the DataFrame to standardize.
         """
         if self._artifact is not None:
-            raise RuntimeError("can't mutate the dataset when an artifact is passed!")
+            raise RuntimeError(
+                "Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator."
+            )
         if key == "all":
             logger.warning(
@@ -1610,7 +1856,7 @@ def get_organism_kwargs(
 def annotate_artifact(
     artifact: Artifact,
     *,
-    curator: AnnDataCurator | SlotsCurator | None = None,
+    curator: SlotsCurator | None = None,
     cat_vectors: dict[str, CatVector] | None = None,
 ) -> Artifact:
     from .. import settings
@@ -1643,7 +1889,9 @@ def annotate_artifact(
         )
     # annotate with inferred schemas aka feature sets
-    if artifact.otype == "DataFrame":
+    if (
+        artifact.otype == "DataFrame" and getattr(curator, "_schema", None) is None
+    ):  # Prevent overwriting user-defined schemas that contain slots
         features = cat_vectors["columns"].records
         if features is not None:
             index_feature = artifact.schema.index
@@ -1663,7 +1911,11 @@ def annotate_artifact(
                 logger.important(
                     f"not annotating with {len(features)} features as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)"
                 )
-                itype = parse_cat_dtype(artifact.schema.itype, is_itype=True)["field"]
+                itype = (
+                    Feature.name
+                    if artifact.schema.itype == "Composite"
+                    else parse_cat_dtype(artifact.schema.itype, is_itype=True)["field"]
+                )
                 feature_set = Schema(itype=itype, n=len(features))
             artifact.feature_sets.add(
                 feature_set.save(), through_defaults={"slot": "columns"}
@@ -1698,9 +1950,13 @@ def annotate_artifact(
                 logger.important(
                     f"not annotating with {len(features)} features for slot {slot} as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)"
                 )
-                itype = parse_cat_dtype(
-                    artifact.schema.slots[slot].itype, is_itype=True
-                )["field"]
+                itype = (
+                    Feature.name
+                    if artifact.schema.slots[slot].itype == "Composite"
+                    else parse_cat_dtype(
+                        artifact.schema.slots[slot].itype, is_itype=True
+                    )["field"]
+                )
                 feature_set = Schema(itype=itype, n=len(features))
             artifact.feature_sets.add(
                 feature_set.save(), through_defaults={"slot": slot}

lamindb 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl

lamindb 1.10.1py3-none-any.whl → 1.11.0py3-none-any.whl