lamindb 1.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. lamindb/__init__.py +52 -36
  2. lamindb/_finish.py +17 -10
  3. lamindb/_tracked.py +1 -1
  4. lamindb/base/__init__.py +3 -1
  5. lamindb/base/fields.py +40 -22
  6. lamindb/base/ids.py +1 -94
  7. lamindb/base/types.py +2 -0
  8. lamindb/base/uids.py +117 -0
  9. lamindb/core/_context.py +216 -133
  10. lamindb/core/_settings.py +38 -25
  11. lamindb/core/datasets/__init__.py +11 -4
  12. lamindb/core/datasets/_core.py +5 -5
  13. lamindb/core/datasets/_small.py +0 -93
  14. lamindb/core/datasets/mini_immuno.py +172 -0
  15. lamindb/core/loaders.py +1 -1
  16. lamindb/core/storage/_backed_access.py +100 -6
  17. lamindb/core/storage/_polars_lazy_df.py +51 -0
  18. lamindb/core/storage/_pyarrow_dataset.py +15 -30
  19. lamindb/core/storage/objects.py +6 -0
  20. lamindb/core/subsettings/__init__.py +2 -0
  21. lamindb/core/subsettings/_annotation_settings.py +11 -0
  22. lamindb/curators/__init__.py +7 -3559
  23. lamindb/curators/_legacy.py +2056 -0
  24. lamindb/curators/core.py +1546 -0
  25. lamindb/errors.py +11 -0
  26. lamindb/examples/__init__.py +27 -0
  27. lamindb/examples/schemas/__init__.py +12 -0
  28. lamindb/examples/schemas/_anndata.py +25 -0
  29. lamindb/examples/schemas/_simple.py +19 -0
  30. lamindb/integrations/_vitessce.py +8 -5
  31. lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
  32. lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
  33. lamindb/models/__init__.py +12 -2
  34. lamindb/models/_describe.py +21 -4
  35. lamindb/models/_feature_manager.py +384 -301
  36. lamindb/models/_from_values.py +1 -1
  37. lamindb/models/_is_versioned.py +5 -15
  38. lamindb/models/_label_manager.py +8 -2
  39. lamindb/models/artifact.py +354 -177
  40. lamindb/models/artifact_set.py +122 -0
  41. lamindb/models/can_curate.py +4 -1
  42. lamindb/models/collection.py +79 -56
  43. lamindb/models/core.py +1 -1
  44. lamindb/models/feature.py +78 -47
  45. lamindb/models/has_parents.py +24 -9
  46. lamindb/models/project.py +3 -3
  47. lamindb/models/query_manager.py +221 -22
  48. lamindb/models/query_set.py +251 -206
  49. lamindb/models/record.py +211 -344
  50. lamindb/models/run.py +59 -5
  51. lamindb/models/save.py +9 -5
  52. lamindb/models/schema.py +673 -196
  53. lamindb/models/transform.py +5 -14
  54. lamindb/models/ulabel.py +8 -5
  55. {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/METADATA +8 -7
  56. lamindb-1.5.0.dist-info/RECORD +108 -0
  57. lamindb-1.3.2.dist-info/RECORD +0 -95
  58. {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
  59. {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0
lamindb/models/artifact_set.py ADDED
@@ -0,0 +1,122 @@
+ from __future__ import annotations
+
+ from collections.abc import Iterable, Iterator
+ from typing import TYPE_CHECKING, Literal
+
+ from lamin_utils import logger
+ from lamindb_setup.core._docs import doc_args
+
+ from ..core._mapped_collection import MappedCollection
+ from ..core.storage._backed_access import _open_dataframe
+ from .artifact import Artifact, _track_run_input
+ from .collection import Collection, _load_concat_artifacts
+
+ if TYPE_CHECKING:
+     from anndata import AnnData
+     from pandas import DataFrame
+     from polars import LazyFrame as PolarsLazyFrame
+     from pyarrow.dataset import Dataset as PyArrowDataset
+     from upath import UPath
+
+
+ UNORDERED_WARNING = (
+     "this query set is unordered, consider using `.order_by()` first "
+     "to avoid opening the artifacts in an arbitrary order"
+ )
+
+
+ class ArtifactSet(Iterable):
+     """Abstract class representing sets of artifacts returned by queries.
+
+     This class automatically extends :class:`~lamindb.models.BasicQuerySet`
+     and :class:`~lamindb.models.QuerySet` when the base model is :class:`~lamindb.Artifact`.
+
+     Examples:
+
+         >>> artifacts = ln.Artifact.filter(otype="AnnData")
+         >>> artifacts  # an instance of ArtifactQuerySet inheriting from ArtifactSet
+     """
+
+     @doc_args(Collection.load.__doc__)
+     def load(
+         self,
+         join: Literal["inner", "outer"] = "outer",
+         is_run_input: bool | None = None,
+         **kwargs,
+     ) -> DataFrame | AnnData:
+         """{}"""  # noqa: D415
+         if not self.ordered:  # type: ignore
+             logger.warning(UNORDERED_WARNING)
+
+         artifacts: list[Artifact] = list(self)
+         concat_object = _load_concat_artifacts(artifacts, join, **kwargs)
+         # track only if successful
+         _track_run_input(artifacts, is_run_input)
+         return concat_object
+
+     @doc_args(Collection.open.__doc__)
+     def open(
+         self,
+         engine: Literal["pyarrow", "polars"] = "pyarrow",
+         is_run_input: bool | None = None,
+         **kwargs,
+     ) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+         """{}"""  # noqa: D415
+         if not self.ordered:  # type: ignore
+             logger.warning(UNORDERED_WARNING)
+
+         artifacts: list[Artifact] = list(self)
+         paths: list[UPath] = [artifact.path for artifact in artifacts]
+
+         dataframe = _open_dataframe(paths, engine=engine, **kwargs)
+         # track only if successful
+         _track_run_input(artifacts, is_run_input)
+         return dataframe
+
+     @doc_args(Collection.mapped.__doc__)
+     def mapped(
+         self,
+         layers_keys: str | list[str] | None = None,
+         obs_keys: str | list[str] | None = None,
+         obsm_keys: str | list[str] | None = None,
+         obs_filter: dict[str, str | list[str]] | None = None,
+         join: Literal["inner", "outer"] | None = "inner",
+         encode_labels: bool | list[str] = True,
+         unknown_label: str | dict[str, str] | None = None,
+         cache_categories: bool = True,
+         parallel: bool = False,
+         dtype: str | None = None,
+         stream: bool = False,
+         is_run_input: bool | None = None,
+     ) -> MappedCollection:
+         """{}"""  # noqa: D415
+         if not self.ordered:  # type: ignore
+             logger.warning(UNORDERED_WARNING)
+
+         artifacts: list[Artifact] = []
+         paths: list[UPath] = []
+         for artifact in self:
+             if ".h5ad" not in artifact.suffix and ".zarr" not in artifact.suffix:
+                 logger.warning(f"ignoring artifact with suffix {artifact.suffix}")
+                 continue
+             elif not stream:
+                 paths.append(artifact.cache())
+             else:
+                 paths.append(artifact.path)
+             artifacts.append(artifact)
+         ds = MappedCollection(
+             paths,
+             layers_keys,
+             obs_keys,
+             obsm_keys,
+             obs_filter,
+             join,
+             encode_labels,
+             unknown_label,
+             cache_categories,
+             parallel,
+             dtype,
+         )
+         # track only if successful
+         _track_run_input(artifacts, is_run_input)
+         return ds
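
The new ArtifactSet methods make a query set of artifacts behave like an ad-hoc collection. A minimal usage sketch, assuming a lamindb instance that already holds AnnData and parquet artifacts (the filter values are illustrative):

    import lamindb as ln

    # order explicitly to avoid the UNORDERED_WARNING defined above
    adata_artifacts = ln.Artifact.filter(otype="AnnData").order_by("-created_at")
    adata = adata_artifacts.load()  # concatenated AnnData, keyed by artifact uid
    mapped = adata_artifacts.mapped(obs_keys=["cell_type"])  # map-style dataset, e.g. for a PyTorch DataLoader

    # .open() targets pyarrow/polars-compatible artifacts such as parquet files
    tables = ln.Artifact.filter(suffix=".parquet").order_by("key").open()
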
lamindb/models/can_curate.py CHANGED
@@ -57,6 +57,7 @@ def _inspect(
      mute: bool = False,
      organism: str | Record | None = None,
      source: Record | None = None,
+     from_source: bool = True,
      strict_source: bool = False,
  ) -> pd.DataFrame | dict[str, list[str]]:
      """{}"""  # noqa: D415
@@ -94,7 +95,7 @@ def _inspect(
      )
      nonval = set(result_db.non_validated).difference(result_db.synonyms_mapper.keys())
 
-     if len(nonval) > 0 and hasattr(registry, "source_id"):
+     if from_source and len(nonval) > 0 and hasattr(registry, "source_id"):
          try:
              public_result = registry.public(
                  organism=organism_record, source=source
@@ -463,6 +464,7 @@ class CanCurate:
          mute: bool = False,
          organism: Union[str, Record, None] = None,
          source: Record | None = None,
+         from_source: bool = True,
          strict_source: bool = False,
      ) -> InspectResult:
          """Inspect if values are mappable to a field.
@@ -506,6 +508,7 @@ class CanCurate:
              strict_source=strict_source,
              organism=organism,
              source=source,
+             from_source=from_source,
          )
 
      @classmethod
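
The new from_source flag lets inspect skip the fallback lookup against public ontology sources and only check records already registered in the instance. A sketch of the call, assuming the bionty plugin is installed and the values are illustrative:

    import bionty as bt

    # validate only against the instance's own CellType records, skipping public sources
    result = bt.CellType.inspect(["T cell", "my in-house cell"], from_source=False)
    print(result.validated, result.non_validated)
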
lamindb/models/collection.py CHANGED
@@ -24,7 +24,7 @@ from lamindb.base.fields import (
 
  from ..base.ids import base62_20
  from ..core._mapped_collection import MappedCollection
- from ..core.storage._pyarrow_dataset import _is_pyarrow_dataset, _open_pyarrow_dataset
+ from ..core.storage._backed_access import _open_dataframe
  from ..errors import FieldValidationError
  from ..models._is_versioned import process_revises
  from ._is_versioned import IsVersioned
@@ -48,8 +48,9 @@ from .record import (
  from .run import Run, TracksRun, TracksUpdates
 
  if TYPE_CHECKING:
-     from collections.abc import Iterable
+     from collections.abc import Iterable, Iterator
 
+     from polars import LazyFrame as PolarsLazyFrame
      from pyarrow.dataset import Dataset as PyArrowDataset
 
      from ..core.storage import UPath
@@ -94,6 +95,39 @@ if TYPE_CHECKING:
      # return feature_sets_union
 
 
+ def _load_concat_artifacts(
+     artifacts: list[Artifact], join: Literal["inner", "outer"] = "outer", **kwargs
+ ) -> pd.DataFrame | ad.AnnData:
+     suffixes = {artifact.suffix for artifact in artifacts}
+     # Why is that? - Sergei
+     if len(suffixes) != 1:
+         raise ValueError(
+             "Can only load collections where all artifacts have the same suffix"
+         )
+
+     # because we're tracking data flow on the collection-level, here, we don't
+     # want to track it on the artifact-level
+     first_object = artifacts[0].load(is_run_input=False)
+     is_dataframe = isinstance(first_object, pd.DataFrame)
+     is_anndata = isinstance(first_object, ad.AnnData)
+     if not is_dataframe and not is_anndata:
+         raise ValueError(f"Unable to concatenate {suffixes.pop()} objects.")
+
+     objects = [first_object]
+     artifact_uids = [artifacts[0].uid]
+     for artifact in artifacts[1:]:
+         objects.append(artifact.load(is_run_input=False))
+         artifact_uids.append(artifact.uid)
+
+     if is_dataframe:
+         concat_object = pd.concat(objects, join=join, **kwargs)
+     elif is_anndata:
+         label = kwargs.pop("label", "artifact_uid")
+         keys = kwargs.pop("keys", artifact_uids)
+         concat_object = ad.concat(objects, join=join, label=label, keys=keys, **kwargs)
+     return concat_object
+
+
  class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
      """Collections of artifacts.
 
@@ -325,11 +359,13 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
              artifact: An artifact to add to the collection.
              run: The run that creates the new version of the collection.
 
-         Examples::
+         Examples:
+
+             ::
 
-             collection_v1 = ln.Collection(artifact, key="My collection").save()
-             collection_v2 = collection.append(another_artifact)  # returns a new version of the collection
-             collection_v2.save()  # save the new version
+                 collection_v1 = ln.Collection(artifact, key="My collection").save()
+                 collection_v2 = collection.append(another_artifact)  # returns a new version of the collection
+                 collection_v2.save()  # save the new version
 
          """
          return Collection(  # type: ignore
@@ -340,13 +376,25 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
              run=run,
          )
 
-     def open(self, is_run_input: bool | None = None) -> PyArrowDataset:
-         """Return a cloud-backed pyarrow Dataset.
+     def open(
+         self,
+         engine: Literal["pyarrow", "polars"] = "pyarrow",
+         is_run_input: bool | None = None,
+         **kwargs,
+     ) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+         """Open a dataset for streaming.
+
+         Works for `pyarrow` and `polars` compatible formats
+         (`.parquet`, `.csv`, `.ipc` etc. files or directories with such files).
 
-         Works for `pyarrow` compatible formats.
+         Args:
+             engine: Which module to use for lazy loading of a dataframe
+                 from `pyarrow` or `polars` compatible formats.
+             is_run_input: Whether to track this artifact as run input.
+             **kwargs: Keyword arguments for `pyarrow.dataset.dataset` or `polars.scan_*` functions.
 
          Notes:
-             For more info, see tutorial: :doc:`/arrays`.
+             For more info, see guide: :doc:`/arrays`.
         """
         if self._state.adding:
             artifacts = self._artifacts
@@ -354,31 +402,12 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         else:
             artifacts = self.ordered_artifacts.all()
         paths = [artifact.path for artifact in artifacts]
-         # this checks that the filesystem is the same for all paths
-         # this is a requirement of pyarrow.dataset.dataset
-         fs = paths[0].fs
-         for path in paths[1:]:
-             # this assumes that the filesystems are cached by fsspec
-             if path.fs is not fs:
-                 raise ValueError(
-                     "The collection has artifacts with different filesystems, this is not supported."
-                 )
-         if not _is_pyarrow_dataset(paths):
-             suffixes = {path.suffix for path in paths}
-             suffixes_str = ", ".join(suffixes)
-             err_msg = (
-                 "This collection is not compatible with pyarrow.dataset.dataset(), "
-             )
-             err_msg += (
-                 f"the artifacts have incompatible file types: {suffixes_str}"
-                 if len(suffixes) > 1
-                 else f"the file type {suffixes_str} is not supported by pyarrow."
-             )
-             raise ValueError(err_msg)
-         dataset = _open_pyarrow_dataset(paths)
+
+         dataframe = _open_dataframe(paths, engine=engine, **kwargs)
         # track only if successful
+         # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
-         return dataset
+         return dataframe
 
     def mapped(
         self,
@@ -401,8 +430,8 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         <https://pytorch.org/docs/stable/data.html#map-style-datasets>`__ by
         virtually concatenating `AnnData` arrays.
 
-         If your `AnnData` collection is in the cloud, move them into a local
-         cache first via :meth:`~lamindb.Collection.cache`.
+         By default (`stream=False`) `AnnData` arrays are moved into a local
+         cache first.
 
         `__getitem__` of the `MappedCollection` object takes a single integer index
         and returns a dictionary with the observation data sample for this index from
@@ -414,7 +443,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
 
         For a guide, see :doc:`docs:scrna-mappedcollection`.
 
-         This method currently only works for collections of `AnnData` artifacts.
+         This method currently only works for collections or query sets of `AnnData` artifacts.
 
         Args:
             layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list
@@ -443,6 +472,11 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
             >>> ds = ln.Collection.get(description="my collection")
             >>> mapped = collection.mapped(obs_keys=["cell_type", "batch"])
             >>> dl = DataLoader(mapped, batch_size=128, shuffle=True)
+             >>> # also works for query sets of artifacts, '...' represents some filtering condition
+             >>> # additional filtering on artifacts of the collection
+             >>> mapped = collection.artifacts.all().filter(...).order_by("-created_at").mapped()
+             >>> # or directly from a query set of artifacts
+             >>> mapped = ln.Artifact.filter(..., otype="AnnData").order_by("-created_at").mapped()
         """
         path_list = []
         if self._state.adding:
@@ -472,6 +506,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
             dtype,
         )
         # track only if successful
+         # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
         return ds
 
@@ -488,6 +523,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         path_list = []
         for artifact in self.ordered_artifacts.all():
             path_list.append(artifact.cache())
+         # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
         return path_list
 
@@ -496,29 +532,16 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         join: Literal["inner", "outer"] = "outer",
         is_run_input: bool | None = None,
         **kwargs,
-     ) -> Any:
-         """Stage and load to memory.
+     ) -> pd.DataFrame | ad.AnnData:
+         """Cache and load to memory.
 
-         Returns in-memory representation if possible such as a concatenated `DataFrame` or `AnnData` object.
+         Returns an in-memory concatenated `DataFrame` or `AnnData` object.
         """
         # cannot call _track_run_input here, see comment further down
-         all_artifacts = self.ordered_artifacts.all()
-         suffixes = [artifact.suffix for artifact in all_artifacts]
-         if len(set(suffixes)) != 1:
-             raise RuntimeError(
-                 "Can only load collections where all artifacts have the same suffix"
-             )
-         # because we're tracking data flow on the collection-level, here, we don't
-         # want to track it on the artifact-level
-         objects = [artifact.load(is_run_input=False) for artifact in all_artifacts]
-         artifact_uids = [artifact.uid for artifact in all_artifacts]
-         if isinstance(objects[0], pd.DataFrame):
-             concat_object = pd.concat(objects, join=join)
-         elif isinstance(objects[0], ad.AnnData):
-             concat_object = ad.concat(
-                 objects, join=join, label="artifact_uid", keys=artifact_uids
-             )
-         # only call it here because there might be errors during concat
+         artifacts = self.ordered_artifacts.all()
+         concat_object = _load_concat_artifacts(artifacts, join, **kwargs)
+         # only call it here because there might be errors during load or concat
+         # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
         return concat_object
 
lamindb/models/core.py CHANGED
@@ -24,7 +24,7 @@ if TYPE_CHECKING:
 
 
  class Storage(Record, TracksRun, TracksUpdates):
-     """Storage locations.
+     """Storage locations of artifacts such as S3 buckets or local directories.
 
      A storage location is either a directory/folder (local or in the cloud) or
      an entire S3/GCP bucket.
lamindb/models/feature.py CHANGED
@@ -143,40 +143,50 @@ def parse_cat_dtype(
 
 
  def serialize_dtype(
-     dtype: Record | FieldAttr | list[Record], is_itype: bool = False
+     dtype: Registry | Record | FieldAttr | list[Record] | list[Registry] | str,
+     is_itype: bool = False,
  ) -> str:
      """Converts a data type object into its string representation."""
+     from .ulabel import ULabel
+
      if (
          not isinstance(dtype, list)
          and hasattr(dtype, "__name__")
          and dtype.__name__ in FEATURE_DTYPES
      ):
          dtype_str = dtype.__name__
+     elif dtype is dict:
+         dtype_str = "dict"
+     elif is_itype and isinstance(dtype, str):
+         if dtype not in "Feature":
+             parse_cat_dtype(
+                 dtype_str=dtype, is_itype=True
+             )  # throws an error if invalid
+         dtype_str = dtype
      elif isinstance(dtype, (ExtensionDtype, np.dtype)):
          dtype_str = serialize_pandas_dtype(dtype)
      else:
-         error_message = (
-             "dtype has to be a record, a record field, or a list of records, not {}"
-         )
-         if isinstance(dtype, Registry):
-             dtype = [dtype]
-         elif isinstance(dtype, DeferredAttribute):
+         error_message = "dtype has to be a registry, a ulabel subtype, a registry field, or a list of registries or fields, not {}"
+         if isinstance(dtype, (Registry, DeferredAttribute, ULabel)):
             dtype = [dtype]
         elif not isinstance(dtype, list):
             raise ValueError(error_message.format(dtype))
         dtype_str = ""
-         for single_dtype in dtype:
-             if not isinstance(single_dtype, Registry) and not isinstance(
-                 single_dtype, DeferredAttribute
-             ):
-                 raise ValueError(error_message.format(single_dtype))
-             if isinstance(single_dtype, Registry):
-                 dtype_str += single_dtype.__get_name_with_module__() + "|"
+         for one_dtype in dtype:
+             if not isinstance(one_dtype, (Registry, DeferredAttribute, ULabel)):
+                 raise ValueError(error_message.format(one_dtype))
+             if isinstance(one_dtype, Registry):
+                 dtype_str += one_dtype.__get_name_with_module__() + "|"
+             elif isinstance(one_dtype, ULabel):
+                 assert one_dtype.is_type, (  # noqa: S101
+                     f"ulabel has to be a type if acting as dtype, {one_dtype} has `is_type` False"
+                 )
+                 dtype_str += f"ULabel[{one_dtype.name}]"
             else:
+                 name = one_dtype.field.name
+                 field_ext = f".{name}" if name != "name" else ""
                 dtype_str += (
-                     single_dtype.field.model.__get_name_with_module__()
-                     + f".{single_dtype.field.name}"
-                     + "|"
+                     one_dtype.field.model.__get_name_with_module__() + field_ext + "|"
                 )
     dtype_str = dtype_str.rstrip("|")
     if not is_itype:
@@ -332,7 +342,7 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
 
      _name_field: str = "name"
      _aux_fields: dict[str, tuple[str, type]] = {
-         "0": ("default_value", bool),
+         "0": ("default_value", Any),  # type: ignore
          "1": ("nullable", bool),
          "2": ("coerce_dtype", bool),
      }
@@ -499,24 +509,11 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
          super().save(*args, **kwargs)
          return self
 
-     @property
-     def coerce_dtype(self) -> bool:
-         """Whether dtypes should be coerced during validation.
-
-         For example, a `objects`-dtyped pandas column can be coerced to `categorical` and would pass validation if this is true.
-         """
-         if self._aux is not None and "af" in self._aux and "2" in self._aux["af"]:  # type: ignore
-             return self._aux["af"]["2"]  # type: ignore
-         else:
-             return False
-
-     @coerce_dtype.setter
-     def coerce_dtype(self, value: bool) -> None:
-         if self._aux is None:  # type: ignore
-             self._aux = {}  # type: ignore
-         if "af" not in self._aux:
-             self._aux["af"] = {}
-         self._aux["af"]["2"] = value
+     def with_config(self, optional: bool | None = None) -> tuple[Feature, dict]:
+         """Pass addtional configurations to the schema."""
+         if optional is not None:
+             return self, {"optional": optional}
+         return self, {}
 
      @property
      def default_value(self) -> Any:
@@ -532,12 +529,9 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
          return None
 
      @default_value.setter
-     def default_value(self, value: bool) -> None:
-         if self._aux is None:  # type: ignore
-             self._aux = {}  # type: ignore
-         if "af" not in self._aux:
-             self._aux["af"] = {}
-         self._aux["af"]["0"] = value
+     def default_value(self, value: str | None) -> None:
+         self._aux = self._aux or {}
+         self._aux.setdefault("af", {})["0"] = value
 
      @property
      def nullable(self) -> bool:
@@ -568,11 +562,48 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
      @nullable.setter
      def nullable(self, value: bool) -> None:
          assert isinstance(value, bool), value  # noqa: S101
-         if self._aux is None:
-             self._aux = {}
-         if "af" not in self._aux:
-             self._aux["af"] = {}
-         self._aux["af"]["1"] = value
+         self._aux = self._aux or {}
+         self._aux.setdefault("af", {})["1"] = value
+
+     @property
+     def coerce_dtype(self) -> bool:
+         """Whether dtypes should be coerced during validation.
+
+         For example, a `objects`-dtyped pandas column can be coerced to `categorical` and would pass validation if this is true.
+         """
+         if self._aux is not None and "af" in self._aux and "2" in self._aux["af"]:  # type: ignore
+             return self._aux["af"]["2"]  # type: ignore
+         else:
+             return False
+
+     @coerce_dtype.setter
+     def coerce_dtype(self, value: bool) -> None:
+         self._aux = self._aux or {}
+         self._aux.setdefault("af", {})["2"] = value
+
+     # we'll enable this later
+     # @property
+     # def observational_unit(self) -> Literal["Artifact", "Observation"]:
+     #     """Default observational unit on which the feature is measured.
+
+     #     Currently, we only make a distinction between artifact-level and observation-level features.
+
+     #     For example, a feature `"ml_split"` that stores `"test"` & `"train"` labels is typically defined on the artifact level.
+     #     When accessing `artifact.features.get_values(["ml_split"])`, you expect a single value, either `"test"` or `"train"`.
+
+     #     However, when accessing an artifact annotation with a feature that's defined on the observation-level, say `"cell_type"`, you expect a set of values. So,
+     #     `artifact.features.get_values(["cell_type_from_expert"])` should return a set: `{"T cell", "B cell"}`.
+
+     #     The value of `observational_unit` is currently auto-managed: if using `artifact.featueres.add_values()`,
+     #     it will be set to `Artifact`. In a curator, the value depends on whether it's an artifact- or observation-level slot
+     #     (e.g. `.uns` is artifact-level in `AnnData` whereas `.obs` is observation-level).
+
+     #     Note: This attribute might in the future be used to distinguish different types of observational units (e.g. single cells vs. physical samples vs. study subjects etc.).
+     #     """
+     #     if self._expect_many:
+     #         return "Observation"  # this here might be replaced with the specific observational unit
+     #     else:
+     #         return "Artifact"
 
 
  class FeatureValue(Record, TracksRun):
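
The simplified aux-field setters and the new with_config method suggest usage like this; a minimal sketch with illustrative feature names (the schema machinery that consumes with_config tuples is not shown in this hunk):

    import lamindb as ln

    feature = ln.Feature(name="perturbation", dtype="cat[ULabel]").save()
    feature.nullable = False        # stored under ._aux["af"]["1"]
    feature.coerce_dtype = True     # stored under ._aux["af"]["2"]
    feature.default_value = "DMSO"  # stored under ._aux["af"]["0"]
    feature.save()

    # bundle a per-schema option; returns (feature, {"optional": True})
    feature_and_config = feature.with_config(optional=True)
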
lamindb/models/has_parents.py CHANGED
@@ -4,12 +4,15 @@ from __future__ import annotations
  import builtins
  from typing import TYPE_CHECKING, Literal
 
+ import lamindb_setup as ln_setup
  from lamin_utils import logger
 
  from .record import format_field_value, get_name_field
  from .run import Run
 
  if TYPE_CHECKING:
+     from graphviz import Digraph
+
      from lamindb.base.types import StrField
 
      from .artifact import Artifact
@@ -78,7 +81,7 @@ class HasParents:
          if not isinstance(field, str):
              field = field.field.name
 
-         return _view_parents(
+         return view_parents(
              record=self,  # type: ignore
              field=field,
              with_children=with_children,
@@ -101,7 +104,7 @@ def _transform_emoji(transform: Transform):
          return TRANSFORM_EMOJIS["pipeline"]
 
 
- def _view(u):
+ def view_digraph(u: Digraph):
      from graphviz.backend import ExecutableNotFound
 
      try:
@@ -117,7 +120,7 @@ def _view(u):
              # call to display()
              display(u._repr_mimebundle_(), raw=True)
          else:
-             return u
+             return u.view()
      except (FileNotFoundError, RuntimeError, ExecutableNotFound):  # pragma: no cover
          logger.error(
              "please install the graphviz executable on your system:\n - Ubuntu: `sudo"
@@ -126,7 +129,9 @@ def _view(u):
      )
 
 
- def view_lineage(data: Artifact | Collection, with_children: bool = True) -> None:
+ def view_lineage(
+     data: Artifact | Collection, with_children: bool = True, return_graph: bool = False
+ ) -> Digraph | None:
      """Graph of data flow.
 
      Notes:
@@ -136,6 +141,13 @@ def view_lineage(data: Artifact | Collection, with_children: bool = True) -> Non
      >>> collection.view_lineage()
      >>> artifact.view_lineage()
      """
+     if ln_setup.settings.instance.is_on_hub:
+         instance_slug = ln_setup.settings.instance.slug
+         entity_slug = data.__class__.__name__.lower()
+         logger.important(
+             f"explore at: https://lamin.ai/{instance_slug}/{entity_slug}/{data.uid}"
+         )
+
      import graphviz
 
      df_values = _get_all_parent_runs(data)
@@ -189,10 +201,13 @@ def view_lineage(data: Artifact | Collection, with_children: bool = True) -> Non
          shape="box",
      )
 
-     _view(u)
+     if return_graph:
+         return u
+     else:
+         return view_digraph(u)
 
 
- def _view_parents(
+ def view_parents(
      record: Record,
      field: str,
      with_children: bool = False,
@@ -258,7 +273,7 @@ def _view_parents(
          u.node(row["target"], label=row["target_label"])
          u.edge(row["source"], row["target"], color="dimgrey")
 
-     _view(u)
+     view_digraph(u)
 
 
  def _get_parents(
@@ -505,14 +520,14 @@ def _get_all_child_runs(data: Artifact | Collection) -> list:
          run_inputs_outputs += [(r, outputs_run)]
 
          child_runs.update(
-             Run.filter(
+             Run.filter(  # type: ignore
                  **{f"input_{name}s__uid__in": [i.uid for i in outputs_run]}
              ).list()
          )
          # for artifacts, also include collections in the lineage
          if name == "artifact":
              child_runs.update(
-                 Run.filter(
+                 Run.filter(  # type: ignore
                      input_collections__uid__in=[i.uid for i in outputs_run]
                  ).list()
              )
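
With the new return_graph flag, lineage graphs can be post-processed instead of only displayed. A minimal sketch with a hypothetical artifact lookup:

    import lamindb as ln

    artifact = ln.Artifact.get(key="datasets/my_data.h5ad")  # hypothetical key
    artifact.view_lineage()  # renders inline; on hub instances also logs a lamin.ai URL
    graph = artifact.view_lineage(return_graph=True)  # graphviz.Digraph for further processing
    graph.render("lineage", format="svg")  # standard graphviz API
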