PyPI - lamindb - Versions diffs - 0.74.3__py3-none-any.whl → 0.75.1__py3-none-any.whl - Mend

lamindb: 0.74.3 (py3-none-any.whl) → 0.75.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

lamindb/__init__.py +1 -1
lamindb/_artifact.py +85 -43
lamindb/_can_validate.py +100 -35
lamindb/_collection.py +36 -28
lamindb/_curate.py +432 -181
lamindb/_feature_set.py +5 -5
lamindb/_filter.py +3 -3
lamindb/_finish.py +29 -23
lamindb/_from_values.py +47 -66
lamindb/_is_versioned.py +1 -1
lamindb/_parents.py +38 -13
lamindb/_record.py +41 -42
lamindb/_save.py +7 -7
lamindb/_transform.py +27 -16
lamindb/_view.py +13 -11
lamindb/core/__init__.py +2 -0
lamindb/core/_data.py +18 -20
lamindb/core/_feature_manager.py +50 -50
lamindb/core/_label_manager.py +17 -19
lamindb/core/_mapped_collection.py +1 -1
lamindb/core/_run_context.py +6 -8
lamindb/core/datasets/_core.py +7 -7
lamindb/core/exceptions.py +11 -0
lamindb/core/schema.py +5 -5
lamindb/core/storage/__init__.py +12 -2
lamindb/core/storage/_anndata_accessor.py +735 -0
lamindb/core/storage/_backed_access.py +77 -747
lamindb/core/storage/_valid_suffixes.py +16 -2
lamindb/core/storage/paths.py +9 -14
lamindb/core/types.py +3 -0
lamindb/core/versioning.py +1 -1
lamindb/integrations/__init__.py +1 -0
lamindb/integrations/_vitessce.py +68 -31
{lamindb-0.74.3.dist-info → lamindb-0.75.1.dist-info}/METADATA +5 -5
lamindb-0.75.1.dist-info/RECORD +58 -0
lamindb-0.74.3.dist-info/RECORD +0 -57
{lamindb-0.74.3.dist-info → lamindb-0.75.1.dist-info}/LICENSE +0 -0
{lamindb-0.74.3.dist-info → lamindb-0.75.1.dist-info}/WHEEL +0 -0

lamindb/_curate.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
-from typing import TYPE_CHECKING, Iterable
+import copy
+from typing import TYPE_CHECKING, Iterable, Type
 import anndata as ad
 import lamindb_setup as ln_setup
@@ -9,7 +10,6 @@ from lamin_utils import colors, logger
 from lamindb_setup.core._docs import doc_args
 from lnschema_core import (
     Artifact,
-    Collection,
     Feature,
     Record,
     Run,
@@ -31,23 +31,25 @@ class CurateLookup:
         self,
         categoricals: dict[str, FieldAttr],
         slots: dict[str, FieldAttr] = None,
-        using: str | None = None,
+        using_key: str | None = None,
     ) -> None:
         if slots is None:
             slots = {}
         self._fields = {**categoricals, **slots}
-        self._using = None if using == "default" else using
-        self._using_name = self._using or ln_setup.settings.instance.slug
-        debug_message = f"Lookup objects from the " f"{colors.italic(self._using_name)}"
+        self._using_key = None if using_key == "default" else using_key
+        self._using_key_name = self._using_key or ln_setup.settings.instance.slug
+        debug_message = (
+            f"Lookup objects from the " f"{colors.italic(self._using_key_name)}"
+        )
         logger.debug(debug_message)
     def __getattr__(self, name):
         if name in self._fields:
             registry = self._fields[name].field.model
-            if self._using == "public":
+            if self._using_key == "public":
                 return registry.public().lookup()
             else:
-                return get_registry_instance(registry, self._using).lookup()
+                return get_registry_instance(registry, self._using_key).lookup()
         raise AttributeError(
             f"'{self.__class__.__name__}' object has no attribute '{name}'"
         )
@@ -55,10 +57,10 @@ class CurateLookup:
     def __getitem__(self, name):
         if name in self._fields:
             registry = self._fields[name].field.model
-            if self._using == "public":
+            if self._using_key == "public":
                 return registry.public().lookup()
             else:
-                return get_registry_instance(registry, self._using).lookup()
+                return get_registry_instance(registry, self._using_key).lookup()
         raise AttributeError(
             f"'{self.__class__.__name__}' object has no attribute '{name}'"
         )
@@ -72,7 +74,7 @@ class CurateLookup:
                 [str([key]) for key in self._fields if not key.isidentifier()]
             )
             return (
-                f"Lookup objects from the {colors.italic(self._using_name)}:\n "
+                f"Lookup objects from the {colors.italic(self._using_key_name)}:\n "
                 f"{colors.green(getattr_keys)}\n "
                 f"{colors.green(getitem_keys)}\n\n"
                 "Example:\n    → categories = validator.lookup().cell_type\n"
@@ -83,15 +85,19 @@ class CurateLookup:
 class DataFrameCurator:
-    """Annotation flow for a DataFrame object.
+    """Curation flow for a DataFrame object.
+    See also :class:`~lamindb.Curate`.
     Args:
         df: The DataFrame object to curate.
         columns: The field attribute for the feature column.
         categoricals: A dictionary mapping column names to registry_field.
-        using: The reference instance containing registries to validate against.
+        using_key: The reference instance containing registries to validate against.
         verbosity: The verbosity level.
         organism: The organism name.
+        sources: A dictionary mapping column names to Source records.
+        exclude: A dictionary mapping column names to values to exclude.
     Examples:
         >>> import bionty as bt
@@ -106,40 +112,56 @@ class DataFrameCurator:
         df: pd.DataFrame,
         columns: FieldAttr = Feature.name,
         categoricals: dict[str, FieldAttr] | None = None,
-        using: str | None = None,
+        using_key: str | None = None,
         verbosity: str = "hint",
         organism: str | None = None,
+        sources: dict[str, Record] | None = None,
+        exclude: dict | None = None,
     ) -> None:
         from lamindb.core._settings import settings
         self._df = df
         self._fields = categoricals or {}
         self._columns_field = columns
-        self._using = using
+        self._using_key = using_key
         settings.verbosity = verbosity
         self._artifact = None
         self._collection = None
         self._validated = False
         self._kwargs = {"organism": organism} if organism else {}
+        if sources is None:
+            sources = {}
+        self._sources = sources
+        if exclude is None:
+            exclude = {}
+        self._exclude = exclude
+        self._non_validated = None
         self._save_columns()
+    @property
+    def non_validated(self) -> list:
+        """Return the non-validated features and labels."""
+        if self._non_validated is None:
+            raise ValueError("Please run validate() first!")
+        return self._non_validated
     @property
     def fields(self) -> dict:
         """Return the columns fields to validate against."""
         return self._fields
-    def lookup(self, using: str | None = None) -> CurateLookup:
+    def lookup(self, using_key: str | None = None) -> CurateLookup:
         """Lookup categories.
         Args:
-            using: The instance where the lookup is performed.
-                if None (default), the lookup is performed on the instance specified in "using" parameter of the validator.
+            using_key: The instance where the lookup is performed.
+                if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
                 if "public", the lookup is performed on the public reference.
         """
         return CurateLookup(
             categoricals=self._fields,
             slots={"columns": self._columns_field},
-            using=using or self._using,
+            using_key=using_key or self._using_key,
         )
     def _save_columns(self, validated_only: bool = True, **kwargs) -> None:
@@ -156,8 +178,9 @@ class DataFrameCurator:
             field=self._columns_field,
             key="columns",
             save_function="add_new_from_columns",
-            using=self._using,
+            using_key=self._using_key,
             validated_only=False,
+            source=self._sources.get("columns"),
             **kwargs,
         )
@@ -169,9 +192,11 @@ class DataFrameCurator:
                 field=self._columns_field,
                 key="columns",
                 save_function="add_new_from_columns",
-                using=self._using,
+                using_key=self._using_key,
                 validated_only=validated_only,
                 df=self._df,  # Get the Feature type from df
+                source=self._sources.get("columns"),
+                warning=False,  # Do not warn about missing columns, just an info message
                 **kwargs,
             )
@@ -220,8 +245,9 @@ class DataFrameCurator:
                 values=self._df[categorical].unique().tolist(),
                 field=self.fields[categorical],
                 key=categorical,
-                using=self._using,
+                using_key=self._using_key,
                 validated_only=validated_only,
+                sources=self._sources.get(categorical),
                 **kwargs,
             )
@@ -238,10 +264,12 @@ class DataFrameCurator:
             Whether the DataFrame is validated.
         """
         self._kwargs.update({"organism": organism} if organism else {})
-        self._validated = validate_categories_in_df(
+        self._validated, self._non_validated = validate_categories_in_df(  # type: ignore
             self._df,
             fields=self.fields,
-            using=self._using,
+            using_key=self._using_key,
+            sources=self._sources,
+            exclude=self._exclude,
             **self._kwargs,
         )
         return self._validated
@@ -283,41 +311,6 @@ class DataFrameCurator:
         return self._artifact
-    def save_collection(
-        self,
-        artifact: Artifact | Iterable[Artifact],
-        name: str,
-        description: str | None = None,
-        reference: str | None = None,
-        reference_type: str | None = None,
-    ) -> Collection:
-        """Save a collection from artifact/artifacts.
-        Args:
-            artifact: One or several saved Artifacts.
-            name: Title of the publication.
-            description: Description of the publication.
-            reference: Accession number (e.g. GSE#, E-MTAB#, etc.).
-            reference_type: Source type (e.g. GEO, ArrayExpress, SRA, etc.).
-        """
-        collection = Collection(
-            artifact,
-            name=name,
-            description=description,
-            reference=reference,
-            reference_type=reference_type,
-        )
-        slug = ln_setup.settings.instance.slug
-        if collection._state.adding:
-            collection.save()
-        else:  # pragma: no cover
-            collection.save()
-            logger.warning(f"collection already exists in {colors.italic(slug)}!")
-        if ln_setup.settings.instance.is_remote:  # pragma: no cover
-            logger.print(f"go to https://lamin.ai/{slug}/collection/{collection.uid}")
-        self._collection = collection
-        return collection
     def clean_up_failed_runs(self):
         """Clean up previous failed runs that don't save any outputs."""
         from lamindb.core._run_context import run_context
@@ -329,15 +322,21 @@ class DataFrameCurator:
 class AnnDataCurator(DataFrameCurator):
-    """Annotation flow for ``AnnData``.
+    """Curation flow for ``AnnData``.
+    See also :class:`~lamindb.Curate`.
+    Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curate.from_anndata`.
     Args:
         data: The AnnData object or an AnnData-like path.
         var_index: The registry field for mapping the ``.var`` index.
         categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
-        using: A reference LaminDB instance.
+        using_key: A reference LaminDB instance.
         verbosity: The verbosity level.
         organism: The organism name.
+        sources: A dictionary mapping ``.obs.columns`` to Source records.
+        exclude: A dictionary mapping column names to values to exclude.
     Examples:
         >>> import bionty as bt
@@ -354,14 +353,19 @@ class AnnDataCurator(DataFrameCurator):
         data: ad.AnnData | UPathStr,
         var_index: FieldAttr,
         categoricals: dict[str, FieldAttr] | None = None,
-        using: str = "default",
+        obs_columns: FieldAttr = Feature.name,
+        using_key: str = "default",
         verbosity: str = "hint",
         organism: str | None = None,
+        sources: dict[str, Record] | None = None,
+        exclude: dict | None = None,
     ) -> None:
         from lamindb_setup.core import upath
         from ._artifact import data_is_anndata
+        if sources is None:
+            sources = {}
         if not data_is_anndata(data):
             raise ValueError(
                 "data has to be an AnnData object or a path to AnnData-like"
@@ -378,12 +382,14 @@ class AnnDataCurator(DataFrameCurator):
         super().__init__(
             df=self._adata.obs,
             categoricals=categoricals,
-            using=using,
+            columns=obs_columns,
+            using_key=using_key,
             verbosity=verbosity,
             organism=organism,
+            sources=sources,
+            exclude=exclude,
         )
         self._obs_fields = categoricals
-        self._save_from_var_index(validated_only=True, **self._kwargs)
     @property
     def var_index(self) -> FieldAttr:
@@ -395,18 +401,18 @@ class AnnDataCurator(DataFrameCurator):
         """Return the obs fields to validate against."""
         return self._obs_fields
-    def lookup(self, using: str | None = None) -> CurateLookup:
+    def lookup(self, using_key: str | None = None) -> CurateLookup:
         """Lookup categories.
         Args:
-            using: The instance where the lookup is performed.
+            using_key: The instance where the lookup is performed.
                 if None (default), the lookup is performed on the instance specified in "using" parameter of the validator.
                 if "public", the lookup is performed on the public reference.
         """
         return CurateLookup(
             categoricals=self._obs_fields,
             slots={"columns": self._columns_field, "var_index": self._var_field},
-            using=using or self._using,
+            using_key=using_key or self._using_key,
         )
     def _save_from_var_index(
@@ -414,15 +420,25 @@ class AnnDataCurator(DataFrameCurator):
     ):
         """Save variable records."""
         update_registry(
-            values=self._adata.var.index,
+            values=list(self._adata.var.index),
             field=self.var_index,
             key="var_index",
             save_function="add_new_from_var_index",
-            using=self._using,
+            using_key=self._using_key,
             validated_only=validated_only,
             organism=organism,
+            source=self._sources.get("var_index"),
         )
+    def _update_registry_all(self, validated_only: bool = True, **kwargs):
+        """Save labels for all features."""
+        for name in self.fields.keys():
+            logger.info(f"saving labels for '{name}'")
+            if name == "var_index":
+                self._save_from_var_index(validated_only=validated_only, **kwargs)
+            else:
+                self._update_registry(name, validated_only=validated_only, **kwargs)
     def add_new_from_var_index(self, organism: str | None = None, **kwargs):
         """Update variable records.
@@ -433,6 +449,15 @@ class AnnDataCurator(DataFrameCurator):
         self._kwargs.update({"organism": organism} if organism else {})
         self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
+    def add_validated_from_var_index(self, organism: str | None = None):
+        """Add validated variable records.
+        Args:
+            organism: The organism name.
+        """
+        self._kwargs.update({"organism": organism} if organism else {})
+        self._save_from_var_index(validated_only=True, **self._kwargs)
     def validate(self, organism: str | None = None) -> bool:
         """Validate categories.
@@ -443,20 +468,32 @@ class AnnDataCurator(DataFrameCurator):
             Whether the AnnData object is validated.
         """
         self._kwargs.update({"organism": organism} if organism else {})
-        if self._using is not None and self._using != "default":
+        if self._using_key is not None and self._using_key != "default":
             logger.important(
-                f"validating metadata using registries of instance {colors.italic(self._using)}"
+                f"validating metadata using registries of instance {colors.italic(self._using_key)}"
             )
-        validated_var = validate_categories(
+        validated_var, non_validated_var = validate_categories(
             self._adata.var.index,
             field=self._var_field,
             key="var_index",
-            using=self._using,
-            **self._kwargs,
+            using_key=self._using_key,
+            source=self._sources.get("var_index"),
+            validated_hint_print=".add_validated_from_var_index()",
+            exclude=self._exclude.get("var_index"),
+            **self._kwargs,  # type: ignore
         )
-        validated_obs = validate_categories_in_df(
-            self._adata.obs, fields=self.categoricals, using=self._using, **self._kwargs
+        validated_obs, non_validated_obs = validate_categories_in_df(
+            self._adata.obs,
+            fields=self.categoricals,
+            using_key=self._using_key,
+            sources=self._sources,
+            exclude=self._exclude,
+            **self._kwargs,
         )
+        self._non_validated = non_validated_obs  # type: ignore
+        if len(non_validated_var) > 0:
+            self._non_validated["var_index"] = non_validated_var  # type: ignore
         self._validated = validated_var and validated_obs
         return self._validated
@@ -488,7 +525,12 @@ class AnnDataCurator(DataFrameCurator):
 class MuDataCurator:
-    """Annotation flow for a ``MuData`` object.
+    """Curation flow for a ``MuData`` object.
+    See also :class:`~lamindb.Curate`.
+    Note that if genes or other measurements are removed from the MuData object,
+    the object should be recreated using :meth:`~lamindb.Curate.from_mudata`.
     Args:
         mdata: The MuData object to curate.
@@ -497,9 +539,11 @@ class MuDataCurator:
             ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": ln.CellMarker.name}``
         categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
             Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
-        using: A reference LaminDB instance.
+        using_key: A reference LaminDB instance.
         verbosity: The verbosity level.
         organism: The organism name.
+        sources: A dictionary mapping ``.obs.columns`` to Source records.
+        exclude: A dictionary mapping column names to values to exclude.
     Examples:
         >>> import bionty as bt
@@ -516,24 +560,34 @@ class MuDataCurator:
         mdata: MuData,
         var_index: dict[str, dict[str, FieldAttr]],
         categoricals: dict[str, FieldAttr] | None = None,
-        using: str = "default",
+        using_key: str = "default",
         verbosity: str = "hint",
         organism: str | None = None,
+        sources: dict[str, Record] | None = None,
+        exclude: dict | None = None,
     ) -> None:
+        if sources is None:
+            sources = {}
+        self._sources = sources
+        if exclude is None:
+            exclude = {}
+        self._exclude = exclude
         self._mdata = mdata
         self._kwargs = {"organism": organism} if organism else {}
         self._var_fields = var_index
         self._verify_modality(self._var_fields.keys())
         self._obs_fields = self._parse_categoricals(categoricals)
         self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
-        self._using = using
+        self._using_key = using_key
         self._verbosity = verbosity
         self._df_annotators = {
             modality: DataFrameCurator(
                 df=mdata[modality].obs if modality != "obs" else mdata.obs,
                 categoricals=self._obs_fields.get(modality, {}),
-                using=using,
+                using_key=using_key,
                 verbosity=verbosity,
+                sources=self._sources.get(modality),
+                exclude=self._exclude.get(modality),
                 **self._kwargs,
             )
             for modality in self._modalities
@@ -564,11 +618,11 @@ class MuDataCurator:
     ):
         """Save variable records."""
         update_registry(
-            values=self._mdata[modality].var.index,
+            values=list(self._mdata[modality].var.index),
             field=self._var_fields[modality],
             key="var_index",
             save_function="add_new_from_var_index",
-            using=self._using,
+            using_key=self._using_key,
             validated_only=validated_only,
             dtype="number",
             **kwargs,
@@ -592,12 +646,12 @@ class MuDataCurator:
                 obs_fields["obs"][k] = v
         return obs_fields
-    def lookup(self, using: str | None = None) -> CurateLookup:
+    def lookup(self, using_key: str | None = None) -> CurateLookup:
         """Lookup categories.
         Args:
-            using: The instance where the lookup is performed.
-                if None (default), the lookup is performed on the instance specified in "using" parameter of the validator.
+            using_key: The instance where the lookup is performed.
+                if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
                 if "public", the lookup is performed on the public reference.
         """
         return CurateLookup(
@@ -606,7 +660,7 @@ class MuDataCurator:
                 **self._obs_fields,
                 **{f"{k}_var_index": v for k, v in self._var_fields.items()},
             },
-            using=using or self._using,
+            using_key=using_key or self._using_key,
         )
     def add_new_from_columns(
@@ -625,14 +679,15 @@ class MuDataCurator:
             **kwargs: Additional keyword arguments to pass to the registry model.
         """
         self._kwargs.update({"organism": organism} if organism else {})
+        values = column_names or self._mdata[modality].obs.columns
         update_registry(
-            values=column_names or self._mdata[modality].obs.columns,
+            values=list(values),
             field=Feature.name,
             key=f"{modality} obs columns",
-            using=self._using,
+            using_key=self._using_key,
             validated_only=False,
             df=self._mdata[modality].obs,
-            **self._kwargs,
+            **self._kwargs,  # type: ignore
             **kwargs,
         )
@@ -651,6 +706,18 @@ class MuDataCurator:
             modality=modality, validated_only=False, **self._kwargs, **kwargs
         )
+    def add_validated_from_var_index(self, modality: str, organism: str | None = None):
+        """Add validated variable records.
+        Args:
+            modality: The modality name.
+            organism: The organism name.
+        """
+        self._kwargs.update({"organism": organism} if organism else {})
+        self._save_from_var_index_modality(
+            modality=modality, validated_only=True, **self._kwargs
+        )
     def add_validated_from(
         self, key: str, modality: str | None = None, organism: str | None = None
     ):
@@ -693,28 +760,48 @@ class MuDataCurator:
     def validate(self, organism: str | None = None) -> bool:
         """Validate categories."""
         self._kwargs.update({"organism": organism} if organism else {})
-        if self._using is not None and self._using != "default":
+        if self._using_key is not None and self._using_key != "default":
             logger.important(
-                f"validating metadata using registries of instance {colors.italic(self._using)}"
+                f"validating metadata using registries of instance {colors.italic(self._using_key)}"
             )
         validated_var = True
+        non_validated_var_modality = {}
         for modality, var_field in self._var_fields.items():
-            validated_var &= validate_categories(
+            is_validated_var, non_validated_var = validate_categories(
                 self._mdata[modality].var.index,
                 field=var_field,
                 key=f"{modality}_var_index",
-                using=self._using,
-                **self._kwargs,
+                using_key=self._using_key,
+                exclude=self._exclude.get(f"{modality}_var_index"),
+                **self._kwargs,  # type: ignore
             )
+            validated_var &= is_validated_var
+            if len(non_validated_var) > 0:
+                non_validated_var_modality[modality] = non_validated_var
         validated_obs = True
+        non_validated_obs_modality = {}
         for modality, fields in self._obs_fields.items():
             if modality == "obs":
                 obs = self._mdata.obs
             else:
                 obs = self._mdata[modality].obs
-            validated_obs &= validate_categories_in_df(
-                obs, fields=fields, using=self._using, **self._kwargs
+            is_validated_obs, non_validated_obs = validate_categories_in_df(
+                obs,
+                fields=fields,
+                using_key=self._using_key,
+                sources=self._sources.get(modality),
+                exclude=self._exclude.get(modality),
+                **self._kwargs,
             )
+            validated_obs &= is_validated_obs
+            non_validated_obs_modality[modality] = non_validated_obs
+            if modality in non_validated_var_modality:
+                non_validated_obs_modality[modality]["var_index"] = (
+                    non_validated_var_modality[modality]
+                )
+            if len(non_validated_obs_modality[modality]) > 0:
+                self._non_validated = non_validated_obs_modality[modality]
         self._validated = validated_var and validated_obs
         return self._validated
@@ -743,7 +830,32 @@ class MuDataCurator:
 class Curate:
-    """Annotation flow."""
+    """Curation flow.
+    Data curation entails accurately labeling datasets with standardized metadata
+    to facilitate data integration, interpretation and analysis.
+    The curation flow has several steps:
+    1. Create a :class:`Curate` object corresponding to the object type that you want to curate:
+    - :meth:`~lamindb.Curate.from_df`
+    - :meth:`~lamindb.Curate.from_anndata`
+    - :meth:`~lamindb.Curate.from_mudata`
+    During object creation, any passed categoricals found in the object will be saved.
+    2. Run :meth:`~lamindb.core.DataFrameCurator.validate` to check the data against the defined criteria. This method identifies:
+    - Values that can be successfully validated and already exist in the registry.
+    - Values which are new and not yet validated or potentially problematic values.
+    3. Determine how to handle validated and unvalidated values:
+    - Validated values not yet in the registry can be automatically registered using :meth:`~lamindb.core.DataFrameCurator.add_validated_from`.
+    - Valid and new values can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`.
+    - All unvalidated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and subsequently removed from the object at hand.
+    """
     @classmethod
     @doc_args(DataFrameCurator.__doc__)
@@ -752,7 +864,7 @@ class Curate:
         df: pd.DataFrame,
         categoricals: dict[str, FieldAttr] | None = None,
         columns: FieldAttr = Feature.name,
-        using: str | None = None,
+        using_key: str | None = None,
         verbosity: str = "hint",
         organism: str | None = None,
     ) -> DataFrameCurator:
@@ -761,7 +873,7 @@ class Curate:
             df=df,
             categoricals=categoricals,
             columns=columns,
-            using=using,
+            using_key=using_key,
             verbosity=verbosity,
             organism=organism,
         )
@@ -773,18 +885,22 @@ class Curate:
         data: ad.AnnData | UPathStr,
         var_index: FieldAttr,
         categoricals: dict[str, FieldAttr] | None = None,
-        using: str = "default",
+        obs_columns: FieldAttr = Feature.name,
+        using_key: str = "default",
         verbosity: str = "hint",
         organism: str | None = None,
+        sources: dict[str, Record] | None = None,
     ) -> AnnDataCurator:
         """{}"""  # noqa: D415
         return AnnDataCurator(
             data=data,
             var_index=var_index,
             categoricals=categoricals,
-            using=using,
+            obs_columns=obs_columns,
+            using_key=using_key,
             verbosity=verbosity,
             organism=organism,
+            sources=sources,
         )
     @classmethod
@@ -794,7 +910,7 @@ class Curate:
         mdata: MuData,
         var_index: dict[str, dict[str, FieldAttr]],
         categoricals: dict[str, FieldAttr] | None = None,
-        using: str = "default",
+        using_key: str = "default",
         verbosity: str = "hint",
         organism: str | None = None,
     ) -> MuDataCurator:
@@ -803,29 +919,68 @@ class Curate:
             mdata=mdata,
             var_index=var_index,
             categoricals=categoricals,
-            using=using,
+            using_key=using_key,
             verbosity=verbosity,
             organism=organism,
         )
-def get_registry_instance(registry: Record, using: str | None = None) -> Record:
+def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
     """Get a registry instance using a specific instance."""
-    if using is not None and using != "default":
-        return registry.using(using)
+    if using_key is not None and using_key != "default":
+        return registry.using(using_key)
     return registry
+def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
+    """Make sure the source and organism are saved in the same database as the registry."""
+    from lamindb.core._settings import settings
+    db = registry.filter().db
+    source = kwargs.get("source")
+    organism = kwargs.get("organism")
+    filter_kwargs = kwargs.copy()
+    try:
+        verbosity = settings.verbosity
+        settings.verbosity = "error"
+        if isinstance(organism, Record) and organism._state.db != "default":
+            if db is None or db == "default":
+                organism_default = copy.copy(organism)
+                # save the organism record in the default database
+                organism_default.save()
+                filter_kwargs["organism"] = organism_default
+        if isinstance(source, Record) and source._state.db != "default":
+            if db is None or db == "default":
+                source_default = copy.copy(source)
+                # save the source record in the default database
+                source_default.save()
+                filter_kwargs["source"] = source_default
+    finally:
+        settings.verbosity = verbosity
+    return filter_kwargs
 def standardize_and_inspect(
-    values: Iterable[str], field: FieldAttr, registry: Record, **kwargs
+    values: Iterable[str],
+    field: FieldAttr,
+    registry: type[Record],
+    standardize: bool = False,
+    **kwargs,
 ):
     """Standardize and inspect values using a registry."""
-    if hasattr(registry, "standardize") and hasattr(
-        registry,
-        "synonyms",  # https://github.com/laminlabs/lamindb/issues/1685
-    ):
-        values = registry.standardize(values, field=field, mute=True, **kwargs)
-    return registry.inspect(values, field=field, mute=True, **kwargs)
+    filter_kwargs = get_current_filter_kwargs(registry, kwargs)
+    if standardize:
+        if hasattr(registry, "standardize") and hasattr(
+            registry,
+            "synonyms",  # https://github.com/laminlabs/lamindb/issues/1685
+        ):
+            standardized_values = registry.standardize(
+                values, field=field, mute=True, **filter_kwargs
+            )
+            values = standardized_values
+    return registry.inspect(values, field=field, mute=True, **filter_kwargs)
 def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
@@ -846,10 +1001,26 @@ def validate_categories(
     values: Iterable[str],
     field: FieldAttr,
     key: str,
-    using: str | None = None,
+    using_key: str | None = None,
     organism: str | None = None,
-) -> bool:
-    """Validate ontology terms in a pandas series using LaminDB registries."""
+    source: Record | None = None,
+    exclude: str | list | None = None,
+    standardize: bool = True,
+    validated_hint_print: str | None = None,
+) -> tuple[bool, list]:
+    """Validate ontology terms in a pandas series using LaminDB registries.
+    Args:
+        values: The values to validate.
+        field: The field attribute.
+        key: The key referencing the slot in the DataFrame.
+        using_key: A reference LaminDB instance.
+        organism: The organism name.
+        source: The source record.
+        exclude: Exclude specific values.
+        standardize: Standardize the values.
+        validated_hint_print: The hint to print for validated values.
+    """
     from lamindb._from_values import _print_values
     from lamindb.core._settings import settings
@@ -861,42 +1032,60 @@ def validate_categories(
         logger.indent = "   "
     registry = field.field.model
-    filter_kwargs = check_registry_organism(registry, organism)
+    kwargs = check_registry_organism(registry, organism)
+    kwargs.update({"source": source} if source else {})
+    # inspect the default instance
+    if exclude is not None:
+        exclude = [exclude] if isinstance(exclude, str) else exclude
+        # exclude values are validated without source and organism
+        inspect_result = registry.inspect(exclude, field=field, mute=True)
+        # if exclude values are validated, remove them from the values
+        values = [i for i in values if i not in inspect_result.validated]
-    # Inspect the default instance
     inspect_result = standardize_and_inspect(
-        values=values, field=field, registry=registry, **filter_kwargs
+        values=values,
+        field=field,
+        registry=registry,
+        standardize=standardize,
+        **kwargs,
     )
     non_validated = inspect_result.non_validated
     values_validated = []
-    if using is not None and using != "default" and non_validated:
-        registry = get_registry_instance(registry, using)
-        # Inspect the using instance
+    if using_key is not None and using_key != "default" and non_validated:
+        registry_using = get_registry_instance(registry, using_key)
+        # inspect the using instance
         inspect_result = standardize_and_inspect(
-            values=non_validated, field=field, registry=registry, **filter_kwargs
+            values=non_validated,
+            field=field,
+            registry=registry_using,
+            standardize=standardize,
+            **kwargs,
         )
         non_validated = inspect_result.non_validated
         values_validated += inspect_result.validated
-    # Inspect from public (bionty only)
+    # inspect from public (bionty only)
     if hasattr(registry, "public"):
         verbosity = settings.verbosity
         try:
             settings.verbosity = "error"
             public_records = registry.from_values(
-                non_validated, field=field, **filter_kwargs
+                non_validated,
+                field=field,
+                **get_current_filter_kwargs(registry, kwargs),
             )
             values_validated += [getattr(r, field.field.name) for r in public_records]
         finally:
             settings.verbosity = verbosity
-    validated_hint_print = f".add_validated_from('{key}')"
+    validated_hint_print = validated_hint_print or f".add_validated_from('{key}')"
     n_validated = len(values_validated)
     if n_validated > 0:
         _log_mapping_info()
         logger.warning(
-            f"found {colors.yellow(f'{n_validated} terms')} validated terms: "
+            f"found {colors.yellow(n_validated)} validated terms: "
             f"{colors.yellow(values_validated)}\n      → save terms via "
             f"{colors.yellow(validated_hint_print)}"
         )
@@ -907,39 +1096,49 @@ def validate_categories(
     if n_non_validated == 0:
         logger.indent = ""
         logger.success(f"{key} is validated against {colors.italic(model_field)}")
-        return True
+        return True, []
     else:
         are = "are" if n_non_validated > 1 else "is"
         print_values = _print_values(non_validated)
         warning_message = (
-            f"{colors.yellow(f'{n_non_validated} terms')} {are} not validated: "
-            f"{colors.yellow(print_values)}\n      → save terms via "
-            f"{colors.yellow(non_validated_hint_print)}"
+            f"{colors.red(f'{n_non_validated} terms')} {are} not validated: "
+            f"{colors.red(print_values)}\n      → save terms via "
+            f"{colors.red(non_validated_hint_print)}"
         )
         if logger.indent == "":
             _log_mapping_info()
         logger.warning(warning_message)
         logger.indent = ""
-        return False
+        return False, non_validated
 def validate_categories_in_df(
     df: pd.DataFrame,
     fields: dict[str, FieldAttr],
-    using: str | None = None,
+    using_key: str | None = None,
+    sources: dict[str, Record] = None,
+    exclude: dict | None = None,
     **kwargs,
-) -> bool:
+) -> tuple[bool, dict]:
     """Validate categories in DataFrame columns using LaminDB registries."""
+    if sources is None:
+        sources = {}
     validated = True
+    non_validated = {}
     for key, field in fields.items():
-        validated &= validate_categories(
+        is_val, non_val = validate_categories(
             df[key],
             field=field,
             key=key,
-            using=using,
+            using_key=using_key,
+            source=sources.get(key),
+            exclude=exclude.get(key) if exclude else None,
             **kwargs,
         )
-    return validated
+        validated &= is_val
+        if len(non_val) > 0:
+            non_validated[key] = non_val
+    return validated, non_validated
 def save_artifact(
@@ -998,13 +1197,13 @@ def save_artifact(
         organism,
     )
-    if artifact.accessor == "DataFrame":
+    if artifact._accessor == "DataFrame":
         artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
-    elif artifact.accessor == "AnnData":
+    elif artifact._accessor == "AnnData":
         artifact.features._add_set_from_anndata(
             var_field=columns_field, **feature_kwargs
         )
-    elif artifact.accessor == "MuData":
+    elif artifact._accessor == "MuData":
         artifact.features._add_set_from_mudata(
             var_fields=columns_field, **feature_kwargs
         )
@@ -1017,11 +1216,16 @@ def save_artifact(
             feature = features.get(key)
             registry = field.field.model
             filter_kwargs = check_registry_organism(registry, organism)
+            filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
             df = data if isinstance(data, pd.DataFrame) else data.obs
-            labels = registry.from_values(df[key], field=field, **filter_kwargs)
+            labels = registry.from_values(
+                df[key],
+                field=field,
+                **filter_kwargs_current,
+            )
             artifact.labels.add(labels, feature)
-    if artifact.accessor == "MuData":
+    if artifact._accessor == "MuData":
         for modality, modality_fields in fields.items():
             if modality == "obs":
                 _add_labels(data, artifact, modality_fields)
@@ -1041,25 +1245,29 @@ def update_registry(
     field: FieldAttr,
     key: str,
     save_function: str = "add_new_from",
-    using: str | None = None,
+    using_key: str | None = None,
     validated_only: bool = True,
     df: pd.DataFrame | None = None,
     organism: str | None = None,
     dtype: str | None = None,
+    source: Record | None = None,
+    standardize: bool = True,
+    warning: bool = True,
     **kwargs,
-) -> list[Record]:
-    """Save features or labels records in the default instance from the using instance.
+) -> None:
+    """Save features or labels records in the default instance from the using_key instance.
     Args:
         values: A list of values to be saved as labels.
         field: The FieldAttr object representing the field for which labels are being saved.
         key: The name of the feature to save.
         save_function: The name of the function to save the labels.
-        using: The name of the instance from which to transfer labels (if applicable).
+        using_key: The name of the instance from which to transfer labels (if applicable).
         validated_only: If True, only save validated labels.
         df: A DataFrame to save labels from.
         organism: The organism name.
         dtype: The type of the feature.
+        source: The source record.
         kwargs: Additional keyword arguments to pass to the registry model to create new records.
     """
     from lamindb._save import save as ln_save
@@ -1067,51 +1275,79 @@ def update_registry(
     registry = field.field.model
     filter_kwargs = check_registry_organism(registry, organism)
+    filter_kwargs.update({"source": source} if source else {})
     verbosity = settings.verbosity
     try:
         settings.verbosity = "error"
+        # save from public
+        filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
+        existing_and_public_records = (
+            registry.from_values(
+                list(values),
+                field=field,
+                **filter_kwargs_current,
+            )
+            if values
+            else []
+        )
+        labels_saved: dict = {"from public": [], "without reference": []}
+        public_records = [r for r in existing_and_public_records if r._state.adding]
+        # here we check to only save the public records if they are from the specified source
+        # we check the uid because r.source and source can be from different instances
+        if source:
+            public_records = [r for r in public_records if r.source.uid == source.uid]
+        ln_save(public_records)
+        labels_saved["from public"] = [
+            getattr(r, field.field.name) for r in public_records
+        ]
+        non_public_labels = [i for i in values if i not in labels_saved["from public"]]
+        # inspect the default instance
         inspect_result_current = standardize_and_inspect(
-            values=values, field=field, registry=registry, **filter_kwargs
+            values=non_public_labels,
+            field=field,
+            registry=registry,
+            standardize=standardize,
+            **filter_kwargs,
         )
         if not inspect_result_current.non_validated:
             all_labels = registry.from_values(
-                inspect_result_current.validated, field=field, **filter_kwargs
+                inspect_result_current.validated,
+                field=field,
+                **filter_kwargs_current,
             )
             settings.verbosity = verbosity
             return all_labels
-        labels_saved: dict = {"from public": [], "without reference": []}
+        # inspect the using_key instance
         (
-            labels_saved[f"from {using}"],
+            labels_saved[f"from {using_key}"],
             non_validated_labels,
         ) = update_registry_from_using_instance(
             inspect_result_current.non_validated,
             field=field,
-            using=using,
+            using_key=using_key,
             **filter_kwargs,
         )
-        public_records = (
-            registry.from_values(non_validated_labels, field=field, **filter_kwargs)
-            if non_validated_labels
-            else []
-        )
-        ln_save(public_records)
-        labels_saved["from public"] = [
-            getattr(r, field.field.name) for r in public_records
-        ]
         labels_saved["without reference"] = [
-            i for i in non_validated_labels if i not in labels_saved["from public"]
+            i
+            for i in non_validated_labels
+            if i not in labels_saved[f"from {using_key}"]
         ]
+        # save non-validated records
         if not validated_only:
             non_validated_records = []
             if df is not None and registry == Feature:
                 non_validated_records = Feature.from_df(df)
             else:
                 if "organism" in filter_kwargs:
+                    # make sure organism record is saved to the current instance
                     filter_kwargs["organism"] = _save_organism(name=organism)
                 init_kwargs = {}
                 for value in labels_saved["without reference"]:
@@ -1119,19 +1355,24 @@ def update_registry(
                     if registry == Feature:
                         init_kwargs["dtype"] = "cat" if dtype is None else dtype
                     non_validated_records.append(
-                        registry(**init_kwargs, **filter_kwargs, **kwargs)
+                        registry(
+                            **init_kwargs,
+                            **{k: v for k, v in filter_kwargs.items() if k != "source"},
+                            **{k: v for k, v in kwargs.items() if k != "sources"},
+                        )
                     )
             ln_save(non_validated_records)
+        # save parent labels for ulabels
         if registry == ULabel and field.field.name == "name":
             save_ulabels_with_parent(values, field=field, key=key)
-        # get all records
-        all_labels = registry.from_values(
-            inspect_result_current.validated + inspect_result_current.non_validated,
-            field=field,
-            **filter_kwargs,
-        )
+        # # get all records that are now validated in the current instance
+        # all_labels = registry.from_values(
+        #     inspect_result_current.validated + inspect_result_current.non_validated,
+        #     field=field,
+        #     **get_current_filter_kwargs(registry, filter_kwargs),
+        # )
     finally:
         settings.verbosity = verbosity
@@ -1141,9 +1382,10 @@ def update_registry(
         save_function=save_function,
         model_field=f"{registry.__name__}.{field.field.name}",
         validated_only=validated_only,
+        warning=warning,
     )
-    return all_labels
+    # return all_labels
 def log_saved_labels(
@@ -1152,6 +1394,7 @@ def log_saved_labels(
     save_function: str,
     model_field: str,
     validated_only: bool = True,
+    warning: bool = True,
 ) -> None:
     """Log the saved labels."""
     from ._from_values import _print_values
@@ -1176,7 +1419,10 @@ def log_saved_labels(
                 if save_function == "add_new_from"
                 else f"\n      → to save, run {colors.yellow(save_function)}"
             )
-            logger.warning(msg)
+            if warning:
+                logger.warning(msg)
+            else:
+                logger.info(msg)
         else:
             k = "" if k == "without reference" else f"{colors.green(k)} "
             # the term "transferred" stresses that this is always in the context of transferring
@@ -1191,7 +1437,7 @@ def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> N
     """Save a parent label for the given labels."""
     registry = field.field.model
     assert registry == ULabel  # noqa: S101
-    all_records = registry.from_values(values, field=field)
+    all_records = registry.from_values(list(values), field=field)
     is_feature = registry.filter(name=f"is_{key}").one_or_none()
     if is_feature is None:
         is_feature = registry(name=f"is_{key}")
@@ -1202,15 +1448,16 @@ def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> N
 def update_registry_from_using_instance(
     values: list[str],
     field: FieldAttr,
-    using: str | None = None,
+    using_key: str | None = None,
+    standardize: bool = False,
     **kwargs,
 ) -> tuple[list[str], list[str]]:
-    """Save features or labels records from the using instance.
+    """Save features or labels records from the using_key instance.
     Args:
         values: A list of values to be saved as labels.
         field: The FieldAttr object representing the field for which labels are being saved.
-        using: The name of the instance from which to transfer labels (if applicable).
+        using_key: The name of the instance from which to transfer labels (if applicable).
         kwargs: Additional keyword arguments to pass to the registry model.
     Returns:
@@ -1219,11 +1466,15 @@ def update_registry_from_using_instance(
     labels_saved = []
     not_saved = values
-    if using is not None and using != "default":
-        registry = field.field.model
-        registry_using = get_registry_instance(registry, using)
+    if using_key is not None and using_key != "default":
+        registry_using = get_registry_instance(field.field.model, using_key)
         inspect_result_using = standardize_and_inspect(
-            values=values, field=field, registry=registry_using, **kwargs
+            values=values,
+            field=field,
+            registry=registry_using,
+            standardize=standardize,
+            **kwargs,
         )
         labels_using = registry_using.filter(
             **{f"{field.field.name}__in": inspect_result_using.validated}
@@ -1242,7 +1493,7 @@ def _save_organism(name: str):  # pragma: no cover
     organism = bt.Organism.filter(name=name).one_or_none()
     if organism is None:
-        organism = bt.Organism.from_public(name=name)
+        organism = bt.Organism.from_source(name=name)
         if organism is None:
             raise ValueError(
                 f"Organism '{name}' not found\n"

lamindb 0.74.3__py3-none-any.whl → 0.75.1__py3-none-any.whl

lamindb 0.74.3__py3-none-any.whl → 0.75.1__py3-none-any.whl