lamindb 1.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +52 -36
- lamindb/_finish.py +17 -10
- lamindb/_tracked.py +1 -1
- lamindb/base/__init__.py +3 -1
- lamindb/base/fields.py +40 -22
- lamindb/base/ids.py +1 -94
- lamindb/base/types.py +2 -0
- lamindb/base/uids.py +117 -0
- lamindb/core/_context.py +216 -133
- lamindb/core/_settings.py +38 -25
- lamindb/core/datasets/__init__.py +11 -4
- lamindb/core/datasets/_core.py +5 -5
- lamindb/core/datasets/_small.py +0 -93
- lamindb/core/datasets/mini_immuno.py +172 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_backed_access.py +100 -6
- lamindb/core/storage/_polars_lazy_df.py +51 -0
- lamindb/core/storage/_pyarrow_dataset.py +15 -30
- lamindb/core/storage/objects.py +6 -0
- lamindb/core/subsettings/__init__.py +2 -0
- lamindb/core/subsettings/_annotation_settings.py +11 -0
- lamindb/curators/__init__.py +7 -3559
- lamindb/curators/_legacy.py +2056 -0
- lamindb/curators/core.py +1546 -0
- lamindb/errors.py +11 -0
- lamindb/examples/__init__.py +27 -0
- lamindb/examples/schemas/__init__.py +12 -0
- lamindb/examples/schemas/_anndata.py +25 -0
- lamindb/examples/schemas/_simple.py +19 -0
- lamindb/integrations/_vitessce.py +8 -5
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
- lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
- lamindb/models/__init__.py +12 -2
- lamindb/models/_describe.py +21 -4
- lamindb/models/_feature_manager.py +384 -301
- lamindb/models/_from_values.py +1 -1
- lamindb/models/_is_versioned.py +5 -15
- lamindb/models/_label_manager.py +8 -2
- lamindb/models/artifact.py +354 -177
- lamindb/models/artifact_set.py +122 -0
- lamindb/models/can_curate.py +4 -1
- lamindb/models/collection.py +79 -56
- lamindb/models/core.py +1 -1
- lamindb/models/feature.py +78 -47
- lamindb/models/has_parents.py +24 -9
- lamindb/models/project.py +3 -3
- lamindb/models/query_manager.py +221 -22
- lamindb/models/query_set.py +251 -206
- lamindb/models/record.py +211 -344
- lamindb/models/run.py +59 -5
- lamindb/models/save.py +9 -5
- lamindb/models/schema.py +673 -196
- lamindb/models/transform.py +5 -14
- lamindb/models/ulabel.py +8 -5
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/METADATA +8 -7
- lamindb-1.5.0.dist-info/RECORD +108 -0
- lamindb-1.3.2.dist-info/RECORD +0 -95
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0
lamindb/models/artifact.py
CHANGED
@@ -5,10 +5,11 @@ import os
|
|
5
5
|
import shutil
|
6
6
|
from collections import defaultdict
|
7
7
|
from pathlib import Path, PurePath, PurePosixPath
|
8
|
-
from typing import TYPE_CHECKING, Any, Union, overload
|
8
|
+
from typing import TYPE_CHECKING, Any, Literal, Union, overload
|
9
9
|
|
10
10
|
import fsspec
|
11
11
|
import lamindb_setup as ln_setup
|
12
|
+
import numpy as np
|
12
13
|
import pandas as pd
|
13
14
|
from anndata import AnnData
|
14
15
|
from django.db import connections, models
|
@@ -38,7 +39,6 @@ from lamindb.errors import FieldValidationError
|
|
38
39
|
from lamindb.models.query_set import QuerySet
|
39
40
|
|
40
41
|
from ..base.users import current_user_id
|
41
|
-
from ..core._compat import is_package_installed
|
42
42
|
from ..core.loaders import load_to_memory
|
43
43
|
from ..core.storage import (
|
44
44
|
LocalPathClasses,
|
@@ -48,6 +48,11 @@ from ..core.storage import (
|
|
48
48
|
write_to_disk,
|
49
49
|
)
|
50
50
|
from ..core.storage._anndata_accessor import _anndata_n_observations
|
51
|
+
from ..core.storage._backed_access import (
|
52
|
+
_track_writes_factory,
|
53
|
+
backed_access,
|
54
|
+
)
|
55
|
+
from ..core.storage._polars_lazy_df import POLARS_SUFFIXES
|
51
56
|
from ..core.storage._pyarrow_dataset import PYARROW_SUFFIXES
|
52
57
|
from ..core.storage._tiledbsoma import _soma_n_observations
|
53
58
|
from ..core.storage.paths import (
|
@@ -61,7 +66,6 @@ from ..core.storage.paths import (
|
|
61
66
|
from ..errors import IntegrityError, InvalidArgument, ValidationError
|
62
67
|
from ..models._is_versioned import (
|
63
68
|
create_uid,
|
64
|
-
message_update_key_in_version_family,
|
65
69
|
)
|
66
70
|
from ._django import get_artifact_with_related
|
67
71
|
from ._feature_manager import (
|
@@ -69,6 +73,7 @@ from ._feature_manager import (
|
|
69
73
|
ParamManager,
|
70
74
|
ParamManagerArtifact,
|
71
75
|
add_label_feature_links,
|
76
|
+
filter_base,
|
72
77
|
get_label_links,
|
73
78
|
)
|
74
79
|
from ._is_versioned import IsVersioned
|
@@ -86,7 +91,7 @@ from .record import (
|
|
86
91
|
_get_record_kwargs,
|
87
92
|
record_repr,
|
88
93
|
)
|
89
|
-
from .run import ParamValue, Run, TracksRun, TracksUpdates, User
|
94
|
+
from .run import Param, ParamValue, Run, TracksRun, TracksUpdates, User
|
90
95
|
from .schema import Schema
|
91
96
|
from .ulabel import ULabel
|
92
97
|
|
@@ -105,9 +110,10 @@ except ImportError:
|
|
105
110
|
|
106
111
|
|
107
112
|
if TYPE_CHECKING:
|
108
|
-
from collections.abc import Iterable
|
113
|
+
from collections.abc import Iterable, Iterator
|
109
114
|
|
110
115
|
from mudata import MuData # noqa: TC004
|
116
|
+
from polars import LazyFrame as PolarsLazyFrame
|
111
117
|
from pyarrow.dataset import Dataset as PyArrowDataset
|
112
118
|
from spatialdata import SpatialData # noqa: TC004
|
113
119
|
from tiledbsoma import Collection as SOMACollection
|
@@ -210,17 +216,6 @@ def process_data(
|
|
210
216
|
|
211
217
|
if not overwritten, data gets stored in default storage
|
212
218
|
"""
|
213
|
-
supported_data_types = [pd.DataFrame, AnnData]
|
214
|
-
if is_package_installed("mudata"):
|
215
|
-
from mudata import MuData
|
216
|
-
|
217
|
-
supported_data_types.append(MuData)
|
218
|
-
if is_package_installed("spatialdata"):
|
219
|
-
from spatialdata import SpatialData
|
220
|
-
|
221
|
-
supported_data_types.append(SpatialData)
|
222
|
-
supported_data_types = tuple(supported_data_types) # type: ignore
|
223
|
-
|
224
219
|
if key is not None:
|
225
220
|
key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
|
226
221
|
# use suffix as the (adata) format if the format is not provided
|
@@ -228,7 +223,8 @@ def process_data(
|
|
228
223
|
format = key_suffix[1:]
|
229
224
|
else:
|
230
225
|
key_suffix = None
|
231
|
-
|
226
|
+
|
227
|
+
if isinstance(data, (str, Path, UPath)):
|
232
228
|
access_token = (
|
233
229
|
default_storage._access_token
|
234
230
|
if hasattr(default_storage, "_access_token")
|
@@ -239,6 +235,7 @@ def process_data(
|
|
239
235
|
# for example into a temporary url
|
240
236
|
if path.protocol not in {"http", "https"}:
|
241
237
|
path = path.resolve()
|
238
|
+
|
242
239
|
storage, use_existing_storage_key = process_pathlike(
|
243
240
|
path,
|
244
241
|
default_storage=default_storage,
|
@@ -247,28 +244,37 @@ def process_data(
|
|
247
244
|
)
|
248
245
|
suffix = extract_suffix_from_path(path)
|
249
246
|
memory_rep = None
|
250
|
-
elif
|
247
|
+
elif (
|
248
|
+
isinstance(data, pd.DataFrame)
|
249
|
+
or isinstance(data, AnnData)
|
250
|
+
or data_is_mudata(data)
|
251
|
+
or data_is_spatialdata(data)
|
252
|
+
):
|
251
253
|
storage = default_storage
|
252
254
|
memory_rep = data
|
253
255
|
suffix = infer_suffix(data, format)
|
254
256
|
else:
|
255
257
|
raise NotImplementedError(
|
256
|
-
f"Do not know how to create
|
258
|
+
f"Do not know how to create an Artifact from {data}, pass a path instead."
|
257
259
|
)
|
260
|
+
|
261
|
+
# Check for suffix consistency
|
258
262
|
if key_suffix is not None and key_suffix != suffix and not is_replace:
|
259
263
|
# consciously omitting a trailing period
|
260
|
-
if isinstance(data, (str, Path, UPath)):
|
264
|
+
if isinstance(data, (str, Path, UPath)): # UPathStr, spelled out
|
261
265
|
message = f"The suffix '{suffix}' of the provided path is inconsistent, it should be '{key_suffix}'"
|
262
266
|
else:
|
263
267
|
message = f"The suffix '{key_suffix}' of the provided key is inconsistent, it should be '{suffix}'"
|
264
268
|
raise InvalidArgument(message)
|
269
|
+
|
265
270
|
# in case we have an in-memory representation, we need to write it to disk
|
266
|
-
|
271
|
+
if memory_rep is not None:
|
272
|
+
from lamindb import settings
|
267
273
|
|
268
|
-
if isinstance(data, supported_data_types):
|
269
274
|
path = settings.cache_dir / f"{provisional_uid}{suffix}"
|
270
275
|
write_to_disk(data, path)
|
271
276
|
use_existing_storage_key = False
|
277
|
+
|
272
278
|
return memory_rep, path, suffix, storage, use_existing_storage_key
|
273
279
|
|
274
280
|
|
@@ -311,10 +317,9 @@ def get_stat_or_artifact(
|
|
311
317
|
result = Artifact.objects.using(instance).filter(hash=hash).all()
|
312
318
|
artifact_with_same_hash_exists = len(result) > 0
|
313
319
|
else:
|
314
|
-
storage_id = settings.storage.id
|
315
320
|
result = (
|
316
321
|
Artifact.objects.using(instance)
|
317
|
-
.filter(Q(hash=hash) | Q(key=key,
|
322
|
+
.filter(Q(hash=hash) | Q(key=key, storage=settings.storage.record))
|
318
323
|
.order_by("-created_at")
|
319
324
|
.all()
|
320
325
|
)
|
@@ -533,28 +538,24 @@ def data_is_anndata(data: AnnData | UPathStr) -> bool:
|
|
533
538
|
|
534
539
|
|
535
540
|
def data_is_mudata(data: MuData | UPathStr) -> bool:
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
if isinstance(data, MuData):
|
540
|
-
return True
|
541
|
+
# We are not importing MuData here to keep loaded modules minimal
|
542
|
+
if hasattr(data, "__class__") and data.__class__.__name__ == "MuData":
|
543
|
+
return True
|
541
544
|
if isinstance(data, (str, Path)):
|
542
545
|
return UPath(data).suffix == ".h5mu"
|
543
546
|
return False
|
544
547
|
|
545
548
|
|
546
549
|
def data_is_spatialdata(data: SpatialData | UPathStr) -> bool:
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
return identify_zarr_type(data, check=False) == "spatialdata"
|
557
|
-
return False
|
550
|
+
# We are not importing SpatialData here to keep loaded modules minimal
|
551
|
+
if hasattr(data, "__class__") and data.__class__.__name__ == "SpatialData":
|
552
|
+
return True
|
553
|
+
if isinstance(data, (str, Path)):
|
554
|
+
if UPath(data).suffix == ".zarr":
|
555
|
+
# TODO: inconsistent with anndata, where we run the storage
|
556
|
+
# check only for local, expensive for cloud
|
557
|
+
return identify_zarr_type(data, check=False) == "spatialdata"
|
558
|
+
return False
|
558
559
|
|
559
560
|
|
560
561
|
def _check_otype_artifact(
|
@@ -763,15 +764,15 @@ def _describe_sqlite(self, print_types: bool = False): # for artifact & collect
|
|
763
764
|
return tree
|
764
765
|
|
765
766
|
|
766
|
-
def describe_artifact_collection(self
|
767
|
-
from ._describe import
|
767
|
+
def describe_artifact_collection(self, return_str: bool = False) -> str | None:
|
768
|
+
from ._describe import format_rich_tree
|
768
769
|
|
769
770
|
if not self._state.adding and connections[self._state.db].vendor == "postgresql":
|
770
771
|
tree = _describe_postgres(self)
|
771
772
|
else:
|
772
773
|
tree = _describe_sqlite(self)
|
773
774
|
|
774
|
-
|
775
|
+
return format_rich_tree(tree, return_str=return_str)
|
775
776
|
|
776
777
|
|
777
778
|
def validate_feature(feature: Feature, records: list[Record]) -> None:
|
@@ -962,55 +963,66 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
962
963
|
revises: `Artifact | None = None` Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
|
963
964
|
run: `Run | None = None` The run that creates the artifact.
|
964
965
|
|
965
|
-
|
966
|
+
Examples:
|
966
967
|
|
967
|
-
|
968
|
+
Create an artifact **from a local file or folder**::
|
968
969
|
|
969
|
-
|
970
|
-
|
971
|
-
- Generic array: HDF5 group, zarr group, TileDB store ⟷ HDF5, zarr, TileDB loaders
|
970
|
+
artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
|
971
|
+
artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()
|
972
972
|
|
973
|
-
|
973
|
+
Calling `.save()` copies or uploads the file to the default storage location of your lamindb instance.
|
974
|
+
If you create an artifact **from a remote file or folder**, lamindb merely registers the S3 `key` and avoids copying the data::
|
974
975
|
|
975
|
-
|
976
|
-
- Fastq: `.fastq` ⟷ /
|
977
|
-
- VCF: `.vcf` ⟷ /
|
978
|
-
- QC: `.html` ⟷ /
|
976
|
+
artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()
|
979
977
|
|
980
|
-
|
978
|
+
If you want to **validate & annotate** an array, pass a `schema` to one of the `.from_df()`, `.from_anndata()`, ... constructors::
|
981
979
|
|
982
|
-
|
980
|
+
schema = ln.Schema(itype=ln.Feature) # a schema that merely enforces that feature names exist in the Feature registry
|
981
|
+
artifact = ln.Artifact.from_df("./my_file.parquet", key="my_dataset.parquet", schema=schema).save() # validated and annotated
|
983
982
|
|
984
|
-
|
985
|
-
:class:`~lamindb.Storage`
|
986
|
-
Storage locations for artifacts.
|
987
|
-
:class:`~lamindb.Collection`
|
988
|
-
Collections of artifacts.
|
989
|
-
:meth:`~lamindb.Artifact.from_df`
|
990
|
-
Create an artifact from a `DataFrame`.
|
991
|
-
:meth:`~lamindb.Artifact.from_anndata`
|
992
|
-
Create an artifact from an `AnnData`.
|
983
|
+
You can make a **new version** of an artifact by passing an existing `key`::
|
993
984
|
|
994
|
-
|
985
|
+
artifact_v2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
|
986
|
+
artifact_v2.versions.df() # see all versions
|
987
|
+
|
988
|
+
You can write artifacts to other storage locations by switching the current default storage location (:attr:`~lamindb.core.Settings.storage`)::
|
989
|
+
|
990
|
+
ln.settings.storage = "s3://some-bucket"
|
995
991
|
|
996
|
-
|
992
|
+
Sometimes you want to **avoid mapping the artifact into a path hierarchy**, and you only pass `description`::
|
997
993
|
|
998
|
-
|
999
|
-
|
994
|
+
artifact = ln.Artifact("./my_folder", description="My folder").save()
|
995
|
+
artifact_v2 = ln.Artifact("./my_folder", revises=old_artifact).save() # need to version based on `revises`, a shared description does not trigger a new version
|
1000
996
|
|
1001
|
-
|
1002
|
-
(If it's a local instance, the "upload" is a mere copy operation.)
|
997
|
+
Notes:
|
1003
998
|
|
1004
|
-
|
999
|
+
.. dropdown:: Typical storage formats & their API accessors
|
1005
1000
|
|
1006
|
-
|
1001
|
+
Arrays:
|
1007
1002
|
|
1008
|
-
|
1003
|
+
- Table: `.csv`, `.tsv`, `.parquet`, `.ipc` ⟷ `DataFrame`, `pyarrow.Table`
|
1004
|
+
- Annotated matrix: `.h5ad`, `.h5mu`, `.zrad` ⟷ `AnnData`, `MuData`
|
1005
|
+
- Generic array: HDF5 group, zarr group, TileDB store ⟷ HDF5, zarr, TileDB loaders
|
1009
1006
|
|
1010
|
-
|
1011
|
-
>>> artifact_v2.versions.df() # see all versions
|
1007
|
+
Non-arrays:
|
1012
1008
|
|
1013
|
-
|
1009
|
+
- Image: `.jpg`, `.png` ⟷ `np.ndarray`, ...
|
1010
|
+
- Fastq: `.fastq` ⟷ /
|
1011
|
+
- VCF: `.vcf` ⟷ /
|
1012
|
+
- QC: `.html` ⟷ /
|
1013
|
+
|
1014
|
+
You'll find these values in the `suffix` & `accessor` fields.
|
1015
|
+
|
1016
|
+
LaminDB makes some default choices (e.g., serialize a `DataFrame` as a `.parquet` file).
|
1017
|
+
|
1018
|
+
.. dropdown:: Will artifacts get duplicated?
|
1019
|
+
|
1020
|
+
If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact.
|
1021
|
+
|
1022
|
+
In concurrent workloads where the same artifact is created repeatedly at the exact same time, `.save()`
|
1023
|
+
detects the duplication and will return the existing artifact.
|
1024
|
+
|
1025
|
+
.. dropdown:: Why does the constructor look the way it looks?
|
1014
1026
|
|
1015
1027
|
It's inspired by APIs building on AWS S3.
|
1016
1028
|
|
@@ -1031,18 +1043,15 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1031
1043
|
bucket = quilt3.Bucket('mybucket')
|
1032
1044
|
bucket.put_file('hello.txt', '/tmp/hello.txt')
|
1033
1045
|
|
1034
|
-
|
1035
|
-
|
1036
|
-
|
1037
|
-
|
1038
|
-
|
1039
|
-
|
1040
|
-
|
1041
|
-
|
1042
|
-
|
1043
|
-
If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact. In concurrent workloads where
|
1044
|
-
the same artifact is created multiple times, `Artifact()` doesn't yet return the existing artifact but creates a new one; `.save()` however
|
1045
|
-
detects the duplication and will return the existing artifact.
|
1046
|
+
See Also:
|
1047
|
+
:class:`~lamindb.Storage`
|
1048
|
+
Storage locations for artifacts.
|
1049
|
+
:class:`~lamindb.Collection`
|
1050
|
+
Collections of artifacts.
|
1051
|
+
:meth:`~lamindb.Artifact.from_df`
|
1052
|
+
Create an artifact from a `DataFrame`.
|
1053
|
+
:meth:`~lamindb.Artifact.from_anndata`
|
1054
|
+
Create an artifact from an `AnnData`.
|
1046
1055
|
|
1047
1056
|
"""
|
1048
1057
|
|
@@ -1055,6 +1064,8 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1055
1064
|
params: ParamManager = ParamManagerArtifact # type: ignore
|
1056
1065
|
"""Param manager.
|
1057
1066
|
|
1067
|
+
What features are for dataset-like artifacts, parameters are for model-like artifacts & runs.
|
1068
|
+
|
1058
1069
|
Example::
|
1059
1070
|
|
1060
1071
|
artifact.params.add_values({
|
@@ -1071,23 +1082,23 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1071
1082
|
features: FeatureManager = FeatureManager # type: ignore
|
1072
1083
|
"""Feature manager.
|
1073
1084
|
|
1074
|
-
|
1085
|
+
Typically, you annotate a dataset with features by defining a `Schema` and passing it to the `Artifact` constructor.
|
1075
1086
|
|
1076
|
-
|
1087
|
+
Here is how to annotate an artifact ad hoc::
|
1077
1088
|
|
1078
1089
|
artifact.features.add_values({
|
1079
1090
|
"species": organism, # here, organism is an Organism record
|
1080
1091
|
"scientist": ['Barbara McClintock', 'Edgar Anderson'],
|
1081
1092
|
"temperature": 27.6,
|
1082
|
-
"
|
1093
|
+
"experiment": "Experiment 1"
|
1083
1094
|
})
|
1084
1095
|
|
1085
|
-
Query
|
1096
|
+
Query artifacts by features::
|
1086
1097
|
|
1087
|
-
ln.Artifact.
|
1098
|
+
ln.Artifact.filter(scientist="Barbara McClintock")
|
1088
1099
|
|
1089
1100
|
Features may or may not be part of the artifact content in storage. For
|
1090
|
-
instance, the :class:`~lamindb.
|
1101
|
+
instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
|
1091
1102
|
`DataFrame`-like artifact and annotates it with features corresponding to
|
1092
1103
|
these columns. `artifact.features.add_values`, by contrast, does not
|
1093
1104
|
validate the content of the artifact.
|
@@ -1100,22 +1111,22 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1100
1111
|
To annotate with labels, you typically use the registry-specific accessors,
|
1101
1112
|
for instance :attr:`~lamindb.Artifact.ulabels`::
|
1102
1113
|
|
1103
|
-
|
1104
|
-
artifact.ulabels.add(
|
1114
|
+
experiment = ln.ULabel(name="Experiment 1").save()
|
1115
|
+
artifact.ulabels.add(experiment)
|
1105
1116
|
|
1106
1117
|
Similarly, you query based on these accessors::
|
1107
1118
|
|
1108
|
-
ln.Artifact.filter(ulabels__name="
|
1119
|
+
ln.Artifact.filter(ulabels__name="Experiment 1").all()
|
1109
1120
|
|
1110
1121
|
Unlike the registry-specific accessors, the `.labels` accessor provides
|
1111
1122
|
a way of associating labels with features::
|
1112
1123
|
|
1113
|
-
|
1114
|
-
artifact.labels.add(
|
1124
|
+
experiment = ln.Feature(name="experiment", dtype="cat").save()
|
1125
|
+
artifact.labels.add(experiment, feature=study)
|
1115
1126
|
|
1116
1127
|
Note that the above is equivalent to::
|
1117
1128
|
|
1118
|
-
artifact.features.add_values({"
|
1129
|
+
artifact.features.add_values({"experiment": experiment})
|
1119
1130
|
"""
|
1120
1131
|
from ._label_manager import LabelManager
|
1121
1132
|
|
@@ -1343,15 +1354,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1343
1354
|
f"Only {valid_keywords} can be passed, you passed: {kwargs}"
|
1344
1355
|
)
|
1345
1356
|
if revises is not None and key is not None and revises.key != key:
|
1346
|
-
|
1347
|
-
suid=revises.stem_uid,
|
1348
|
-
existing_key=revises.key,
|
1349
|
-
new_key=key,
|
1350
|
-
registry="Artifact",
|
1351
|
-
)
|
1352
|
-
raise ValueError(
|
1353
|
-
f"`key` is {key}, but `revises.key` is '{revises.key}'\n\n Either do *not* pass `key`.\n\n{note}"
|
1354
|
-
)
|
1357
|
+
logger.warning(f"renaming artifact from '{revises.key}' to {key}")
|
1355
1358
|
if revises is not None:
|
1356
1359
|
if not isinstance(revises, Artifact):
|
1357
1360
|
raise TypeError("`revises` has to be of type `Artifact`")
|
@@ -1431,11 +1434,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1431
1434
|
kwargs["uid"] = uid
|
1432
1435
|
|
1433
1436
|
# only set key now so that we don't do a look-up on it in case revises is passed
|
1434
|
-
if revises is not None and revises.key is not None:
|
1435
|
-
assert revises.key.endswith(kwargs["suffix"]), ( # noqa: S101
|
1436
|
-
revises.key,
|
1437
|
-
kwargs["suffix"],
|
1438
|
-
)
|
1437
|
+
if revises is not None and revises.key is not None and kwargs["key"] is None:
|
1439
1438
|
kwargs["key"] = revises.key
|
1440
1439
|
|
1441
1440
|
kwargs["kind"] = kind
|
@@ -1530,15 +1529,84 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1530
1529
|
- Guide: :doc:`docs:registries`
|
1531
1530
|
- Method in `Record` base class: :meth:`~lamindb.models.Record.get`
|
1532
1531
|
|
1533
|
-
Examples
|
1532
|
+
Examples:
|
1533
|
+
|
1534
|
+
::
|
1534
1535
|
|
1535
|
-
|
1536
|
-
|
1536
|
+
artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")
|
1537
|
+
artifact = ln.Artifact.get(key="examples/my_file.parquet")
|
1537
1538
|
"""
|
1538
1539
|
from .query_set import QuerySet
|
1539
1540
|
|
1540
1541
|
return QuerySet(model=cls).get(idlike, **expressions)
|
1541
1542
|
|
1543
|
+
@classmethod
|
1544
|
+
def filter(
|
1545
|
+
cls,
|
1546
|
+
*queries,
|
1547
|
+
**expressions,
|
1548
|
+
) -> QuerySet:
|
1549
|
+
"""Query a set of artifacts.
|
1550
|
+
|
1551
|
+
Args:
|
1552
|
+
*queries: `Q` expressions.
|
1553
|
+
**expressions: Features, params, fields via the Django query syntax.
|
1554
|
+
|
1555
|
+
See Also:
|
1556
|
+
- Guide: :doc:`docs:registries`
|
1557
|
+
|
1558
|
+
Examples:
|
1559
|
+
|
1560
|
+
Query by fields::
|
1561
|
+
|
1562
|
+
ln.Artifact.filter(key="examples/my_file.parquet")
|
1563
|
+
|
1564
|
+
Query by features::
|
1565
|
+
|
1566
|
+
ln.Artifact.filter(cell_type_by_model__name="T cell")
|
1567
|
+
|
1568
|
+
Query by params::
|
1569
|
+
|
1570
|
+
ln.Artifact.filter(hyperparam_x=100)
|
1571
|
+
"""
|
1572
|
+
from .query_set import QuerySet
|
1573
|
+
|
1574
|
+
if expressions:
|
1575
|
+
keys_normalized = [key.split("__")[0] for key in expressions]
|
1576
|
+
field_or_feature_or_param = keys_normalized[0].split("__")[0]
|
1577
|
+
if field_or_feature_or_param in Artifact.__get_available_fields__():
|
1578
|
+
return QuerySet(model=cls).filter(*queries, **expressions)
|
1579
|
+
elif all(
|
1580
|
+
features_validated := Feature.validate(
|
1581
|
+
keys_normalized, field="name", mute=True
|
1582
|
+
)
|
1583
|
+
):
|
1584
|
+
return filter_base(FeatureManager, **expressions)
|
1585
|
+
elif all(
|
1586
|
+
params_validated := Param.validate(
|
1587
|
+
keys_normalized, field="name", mute=True
|
1588
|
+
)
|
1589
|
+
):
|
1590
|
+
return filter_base(ParamManagerArtifact, **expressions)
|
1591
|
+
else:
|
1592
|
+
if sum(features_validated) < sum(params_validated):
|
1593
|
+
params = ", ".join(
|
1594
|
+
sorted(np.array(keys_normalized)[~params_validated])
|
1595
|
+
)
|
1596
|
+
message = f"param names: {params}"
|
1597
|
+
else:
|
1598
|
+
features = ", ".join(
|
1599
|
+
sorted(np.array(keys_normalized)[~params_validated])
|
1600
|
+
)
|
1601
|
+
message = f"feature names: {features}"
|
1602
|
+
fields = ", ".join(sorted(cls.__get_available_fields__()))
|
1603
|
+
raise InvalidArgument(
|
1604
|
+
f"You can query either by available fields: {fields}\n"
|
1605
|
+
f"Or fix invalid {message}"
|
1606
|
+
)
|
1607
|
+
else:
|
1608
|
+
return QuerySet(model=cls).filter(*queries, **expressions)
|
1609
|
+
|
1542
1610
|
@classmethod
|
1543
1611
|
def from_df(
|
1544
1612
|
cls,
|
@@ -1548,9 +1616,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1548
1616
|
description: str | None = None,
|
1549
1617
|
run: Run | None = None,
|
1550
1618
|
revises: Artifact | None = None,
|
1619
|
+
schema: Schema | None = None,
|
1551
1620
|
**kwargs,
|
1552
1621
|
) -> Artifact:
|
1553
|
-
"""Create from `DataFrame`, validate &
|
1622
|
+
"""Create from `DataFrame`, optionally validate & annotate.
|
1554
1623
|
|
1555
1624
|
Args:
|
1556
1625
|
df: A `DataFrame` object.
|
@@ -1559,6 +1628,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1559
1628
|
description: A description.
|
1560
1629
|
revises: An old version of the artifact.
|
1561
1630
|
run: The run that creates the artifact.
|
1631
|
+
schema: A schema that defines how to validate & annotate.
|
1562
1632
|
|
1563
1633
|
See Also:
|
1564
1634
|
:meth:`~lamindb.Collection`
|
@@ -1566,19 +1636,30 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1566
1636
|
:class:`~lamindb.Feature`
|
1567
1637
|
Track features.
|
1568
1638
|
|
1569
|
-
Example
|
1639
|
+
Example:
|
1570
1640
|
|
1571
|
-
|
1641
|
+
No validation and annotation::
|
1642
|
+
|
1643
|
+
import lamindb as ln
|
1644
|
+
|
1645
|
+
df = ln.core.datasets.mini_immuno.get_dataset1()
|
1646
|
+
artifact = ln.Artifact.from_df(df, key="examples/dataset1.parquet").save()
|
1647
|
+
|
1648
|
+
With validation and annotation.
|
1649
|
+
|
1650
|
+
.. literalinclude:: scripts/curate_dataframe_flexible.py
|
1651
|
+
:language: python
|
1652
|
+
|
1653
|
+
Under the hood, this uses the following schema.
|
1654
|
+
|
1655
|
+
.. literalinclude:: scripts/define_valid_features.py
|
1656
|
+
:language: python
|
1657
|
+
|
1658
|
+
Valid features & labels were defined as:
|
1659
|
+
|
1660
|
+
.. literalinclude:: scripts/define_mini_immuno_features_labels.py
|
1661
|
+
:language: python
|
1572
1662
|
|
1573
|
-
df = ln.core.datasets.df_iris_in_meter_batch1()
|
1574
|
-
df.head()
|
1575
|
-
#> sepal_length sepal_width petal_length petal_width iris_organism_code
|
1576
|
-
#> 0 0.051 0.035 0.014 0.002 0
|
1577
|
-
#> 1 0.049 0.030 0.014 0.002 0
|
1578
|
-
#> 2 0.047 0.032 0.013 0.002 0
|
1579
|
-
#> 3 0.046 0.031 0.015 0.002 0
|
1580
|
-
#> 4 0.050 0.036 0.014 0.002 0
|
1581
|
-
artifact = ln.Artifact.from_df(df, key="iris/result_batch1.parquet").save()
|
1582
1663
|
"""
|
1583
1664
|
artifact = Artifact( # type: ignore
|
1584
1665
|
data=df,
|
@@ -1591,6 +1672,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1591
1672
|
**kwargs,
|
1592
1673
|
)
|
1593
1674
|
artifact.n_observations = len(df)
|
1675
|
+
if schema is not None:
|
1676
|
+
from ..curators import DataFrameCurator
|
1677
|
+
|
1678
|
+
curator = DataFrameCurator(artifact, schema)
|
1679
|
+
curator.validate()
|
1680
|
+
artifact.schema = schema
|
1681
|
+
artifact._curator = curator
|
1594
1682
|
return artifact
|
1595
1683
|
|
1596
1684
|
@classmethod
|
@@ -1602,9 +1690,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1602
1690
|
description: str | None = None,
|
1603
1691
|
run: Run | None = None,
|
1604
1692
|
revises: Artifact | None = None,
|
1693
|
+
schema: Schema | None = None,
|
1605
1694
|
**kwargs,
|
1606
1695
|
) -> Artifact:
|
1607
|
-
"""Create from
|
1696
|
+
"""Create from `AnnData`, optionally validate & annotate.
|
1608
1697
|
|
1609
1698
|
Args:
|
1610
1699
|
adata: An `AnnData` object or a path of AnnData-like.
|
@@ -1613,6 +1702,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1613
1702
|
description: A description.
|
1614
1703
|
revises: An old version of the artifact.
|
1615
1704
|
run: The run that creates the artifact.
|
1705
|
+
schema: A schema that defines how to validate & annotate.
|
1616
1706
|
|
1617
1707
|
See Also:
|
1618
1708
|
|
@@ -1621,12 +1711,31 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1621
1711
|
:class:`~lamindb.Feature`
|
1622
1712
|
Track features.
|
1623
1713
|
|
1624
|
-
Example
|
1714
|
+
Example:
|
1625
1715
|
|
1626
|
-
|
1716
|
+
No validation and annotation::
|
1717
|
+
|
1718
|
+
import lamindb as ln
|
1719
|
+
|
1720
|
+
adata = ln.core.datasets.anndata_with_obs()
|
1721
|
+
artifact = ln.Artifact.from_anndata(adata, key="mini_anndata_with_obs.h5ad").save()
|
1722
|
+
|
1723
|
+
With validation and annotation.
|
1724
|
+
|
1725
|
+
.. literalinclude:: scripts/curate_anndata_flexible.py
|
1726
|
+
:language: python
|
1727
|
+
|
1728
|
+
Under the hood, this uses the following schema.
|
1729
|
+
|
1730
|
+
.. literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py
|
1731
|
+
:language: python
|
1732
|
+
|
1733
|
+
This schema transposes the `var` DataFrame during curation, so that one validates and annotates the `var.T` schema, i.e., `[ENSG00000153563, ENSG00000010610, ENSG00000170458]`.
|
1734
|
+
If one doesn't transpose, one would annotate with the schema of `var`, i.e., `[gene_symbol, gene_type]`.
|
1735
|
+
|
1736
|
+
.. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/gLyfToATM7WUzkWW0001.png
|
1737
|
+
:width: 800px
|
1627
1738
|
|
1628
|
-
adata = ln.core.datasets.anndata_with_obs()
|
1629
|
-
artifact = ln.Artifact.from_anndata(adata, key="mini_anndata_with_obs.h5ad").save()
|
1630
1739
|
"""
|
1631
1740
|
if not data_is_anndata(adata):
|
1632
1741
|
raise ValueError(
|
@@ -1654,6 +1763,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1654
1763
|
# and the proper path through create_path for cloud paths
|
1655
1764
|
obj_for_obs = artifact.path
|
1656
1765
|
artifact.n_observations = _anndata_n_observations(obj_for_obs)
|
1766
|
+
if schema is not None:
|
1767
|
+
from ..curators import AnnDataCurator
|
1768
|
+
|
1769
|
+
curator = AnnDataCurator(artifact, schema)
|
1770
|
+
curator.validate()
|
1771
|
+
artifact.schema = schema
|
1772
|
+
artifact._curator = curator
|
1657
1773
|
return artifact
|
1658
1774
|
|
1659
1775
|
@classmethod
|
@@ -1665,9 +1781,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1665
1781
|
description: str | None = None,
|
1666
1782
|
run: Run | None = None,
|
1667
1783
|
revises: Artifact | None = None,
|
1784
|
+
schema: Schema | None = None,
|
1668
1785
|
**kwargs,
|
1669
1786
|
) -> Artifact:
|
1670
|
-
"""Create from
|
1787
|
+
"""Create from `MuData`, optionally validate & annotate.
|
1671
1788
|
|
1672
1789
|
Args:
|
1673
1790
|
mdata: A `MuData` object.
|
@@ -1676,6 +1793,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1676
1793
|
description: A description.
|
1677
1794
|
revises: An old version of the artifact.
|
1678
1795
|
run: The run that creates the artifact.
|
1796
|
+
schema: A schema that defines how to validate & annotate.
|
1679
1797
|
|
1680
1798
|
See Also:
|
1681
1799
|
:meth:`~lamindb.Collection`
|
@@ -1704,6 +1822,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1704
1822
|
)
|
1705
1823
|
if not isinstance(mdata, UPathStr):
|
1706
1824
|
artifact.n_observations = mdata.n_obs
|
1825
|
+
if schema is not None:
|
1826
|
+
from ..curators import MuDataCurator
|
1827
|
+
|
1828
|
+
curator = MuDataCurator(artifact, schema)
|
1829
|
+
curator.validate()
|
1830
|
+
artifact.schema = schema
|
1831
|
+
artifact._curator = curator
|
1707
1832
|
return artifact
|
1708
1833
|
|
1709
1834
|
@classmethod
|
@@ -1715,17 +1840,19 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1715
1840
|
description: str | None = None,
|
1716
1841
|
run: Run | None = None,
|
1717
1842
|
revises: Artifact | None = None,
|
1843
|
+
schema: Schema | None = None,
|
1718
1844
|
**kwargs,
|
1719
1845
|
) -> Artifact:
|
1720
|
-
"""Create from
|
1846
|
+
"""Create from `SpatialData`, optionally validate & annotate.
|
1721
1847
|
|
1722
1848
|
Args:
|
1723
|
-
|
1849
|
+
sdata: A `SpatialData` object.
|
1724
1850
|
key: A relative path within default storage,
|
1725
1851
|
e.g., `"myfolder/myfile.zarr"`.
|
1726
1852
|
description: A description.
|
1727
1853
|
revises: An old version of the artifact.
|
1728
1854
|
run: The run that creates the artifact.
|
1855
|
+
schema: A schema that defines how to validate & annotate.
|
1729
1856
|
|
1730
1857
|
See Also:
|
1731
1858
|
:meth:`~lamindb.Collection`
|
@@ -1733,11 +1860,21 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1733
1860
|
:class:`~lamindb.Feature`
|
1734
1861
|
Track features.
|
1735
1862
|
|
1736
|
-
Example
|
1863
|
+
Example:
|
1737
1864
|
|
1738
|
-
|
1865
|
+
No validation and annotation::
|
1866
|
+
|
1867
|
+
import lamindb as ln
|
1868
|
+
|
1869
|
+
artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr").save()
|
1739
1870
|
|
1740
|
-
|
1871
|
+
With validation and annotation.
|
1872
|
+
|
1873
|
+
.. literalinclude:: scripts/define_schema_spatialdata.py
|
1874
|
+
:language: python
|
1875
|
+
|
1876
|
+
.. literalinclude:: scripts/curate_spatialdata.py
|
1877
|
+
:language: python
|
1741
1878
|
"""
|
1742
1879
|
if not data_is_spatialdata(sdata):
|
1743
1880
|
raise ValueError(
|
@@ -1755,6 +1892,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
1755
1892
|
)
|
1756
1893
|
# ill-defined https://scverse.zulipchat.com/#narrow/channel/315824-spatial/topic/How.20to.20calculate.20the.20number.20of.20observations.3F
|
1757
1894
|
# artifact.n_observations = ...
|
1895
|
+
if schema is not None:
|
1896
|
+
from ..curators import SpatialDataCurator
|
1897
|
+
|
1898
|
+
curator = SpatialDataCurator(artifact, schema)
|
1899
|
+
curator.validate()
|
1900
|
+
artifact.schema = schema
|
1901
|
+
artifact._curator = curator
|
1758
1902
|
return artifact
|
1759
1903
|
|
1760
1904
|
@classmethod
|
@@ -2022,29 +2166,39 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
2022
2166
|
self._old_suffix = self.suffix
|
2023
2167
|
|
2024
2168
|
def open(
|
2025
|
-
self,
|
2026
|
-
|
2027
|
-
|
2028
|
-
|
2029
|
-
|
2030
|
-
|
2031
|
-
|
2032
|
-
|
2033
|
-
|
2034
|
-
|
2169
|
+
self,
|
2170
|
+
mode: str = "r",
|
2171
|
+
engine: Literal["pyarrow", "polars"] = "pyarrow",
|
2172
|
+
is_run_input: bool | None = None,
|
2173
|
+
**kwargs,
|
2174
|
+
) -> (
|
2175
|
+
AnnDataAccessor
|
2176
|
+
| BackedAccessor
|
2177
|
+
| SOMACollection
|
2178
|
+
| SOMAExperiment
|
2179
|
+
| SOMAMeasurement
|
2180
|
+
| PyArrowDataset
|
2181
|
+
| Iterator[PolarsLazyFrame]
|
2182
|
+
):
|
2183
|
+
"""Open a dataset for streaming.
|
2035
2184
|
|
2036
2185
|
Works for `AnnData` (`.h5ad` and `.zarr`), generic `hdf5` and `zarr`,
|
2037
|
-
`tiledbsoma` objects (`.tiledbsoma`), `pyarrow` compatible formats
|
2186
|
+
`tiledbsoma` objects (`.tiledbsoma`), `pyarrow` or `polars` compatible formats
|
2187
|
+
(`.parquet`, `.csv`, `.ipc` etc. files or directories with such files).
|
2038
2188
|
|
2039
2189
|
Args:
|
2040
2190
|
mode: can only be `"w"` (write mode) for `tiledbsoma` stores,
|
2041
2191
|
otherwise should be always `"r"` (read-only mode).
|
2192
|
+
engine: Which module to use for lazy loading of a dataframe
|
2193
|
+
from `pyarrow` or `polars` compatible formats.
|
2194
|
+
This has no effect if the artifact is not a dataframe, i.e.
|
2195
|
+
if it is an `AnnData,` `hdf5`, `zarr` or `tiledbsoma` object.
|
2042
2196
|
is_run_input: Whether to track this artifact as run input.
|
2043
2197
|
**kwargs: Keyword arguments for the accessor, i.e. `h5py` or `zarr` connection,
|
2044
|
-
`pyarrow.dataset.dataset
|
2198
|
+
`pyarrow.dataset.dataset`, `polars.scan_*` function.
|
2045
2199
|
|
2046
2200
|
Notes:
|
2047
|
-
For more info, see
|
2201
|
+
For more info, see guide: :doc:`/arrays`.
|
2048
2202
|
|
2049
2203
|
Example::
|
2050
2204
|
|
@@ -2057,6 +2211,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
2057
2211
|
#> AnnDataAccessor object with n_obs × n_vars = 70 × 765
|
2058
2212
|
#> constructed for the AnnData object pbmc68k.h5ad
|
2059
2213
|
#> ...
|
2214
|
+
artifact = ln.Artifact.get(key="lndb-storage/df.parquet")
|
2215
|
+
artifact.open()
|
2216
|
+
#> pyarrow._dataset.FileSystemDataset
|
2217
|
+
|
2060
2218
|
"""
|
2061
2219
|
if self._overwrite_versions and not self.is_latest:
|
2062
2220
|
raise ValueError(INCONSISTENT_STATE_MSG)
|
@@ -2064,6 +2222,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
2064
2222
|
h5_suffixes = [".h5", ".hdf5", ".h5ad"]
|
2065
2223
|
h5_suffixes += [s + ".gz" for s in h5_suffixes]
|
2066
2224
|
# ignore empty suffix for now
|
2225
|
+
df_suffixes = tuple(set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES))
|
2067
2226
|
suffixes = (
|
2068
2227
|
(
|
2069
2228
|
"",
|
@@ -2072,7 +2231,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
2072
2231
|
".tiledbsoma",
|
2073
2232
|
)
|
2074
2233
|
+ tuple(h5_suffixes)
|
2075
|
-
+
|
2234
|
+
+ df_suffixes
|
2076
2235
|
+ tuple(
|
2077
2236
|
s + ".gz" for s in PYARROW_SUFFIXES
|
2078
2237
|
) # this doesn't work for externally gzipped files, REMOVE LATER
|
@@ -2080,10 +2239,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
2080
2239
|
if self.suffix not in suffixes:
|
2081
2240
|
raise ValueError(
|
2082
2241
|
"Artifact should have a zarr, h5, tiledbsoma object"
|
2083
|
-
" or a compatible `pyarrow.dataset.dataset` directory"
|
2242
|
+
" or a compatible `pyarrow.dataset.dataset` or `polars.scan_*` directory"
|
2084
2243
|
" as the underlying data, please use one of the following suffixes"
|
2085
2244
|
f" for the object name: {', '.join(suffixes[1:])}."
|
2086
|
-
f" Or no suffix for a folder with {', '.join(
|
2245
|
+
f" Or no suffix for a folder with {', '.join(df_suffixes)} files"
|
2087
2246
|
" (no mixing allowed)."
|
2088
2247
|
)
|
2089
2248
|
if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
|
@@ -2092,10 +2251,6 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
2092
2251
|
)
|
2093
2252
|
|
2094
2253
|
from lamindb import settings
|
2095
|
-
from lamindb.core.storage._backed_access import (
|
2096
|
-
_track_writes_factory,
|
2097
|
-
backed_access,
|
2098
|
-
)
|
2099
2254
|
|
2100
2255
|
using_key = settings._using_key
|
2101
2256
|
filepath, cache_key = filepath_cache_key_from_artifact(
|
@@ -2116,14 +2271,22 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
2116
2271
|
) and not filepath.synchronize(localpath, just_check=True)
|
2117
2272
|
if open_cache:
|
2118
2273
|
try:
|
2119
|
-
access = backed_access(
|
2274
|
+
access = backed_access(
|
2275
|
+
localpath, mode, engine, using_key=using_key, **kwargs
|
2276
|
+
)
|
2120
2277
|
except Exception as e:
|
2121
|
-
|
2278
|
+
# also ignore ValueError here because
|
2279
|
+
# such errors most probably just imply an incorrect argument
|
2280
|
+
if isinstance(filepath, LocalPathClasses) or isinstance(
|
2281
|
+
e, (ImportError, ValueError)
|
2282
|
+
):
|
2122
2283
|
raise e
|
2123
2284
|
logger.warning(
|
2124
2285
|
f"The cache might be corrupted: {e}. Trying to open directly."
|
2125
2286
|
)
|
2126
|
-
access = backed_access(
|
2287
|
+
access = backed_access(
|
2288
|
+
filepath, mode, engine, using_key=using_key, **kwargs
|
2289
|
+
)
|
2127
2290
|
# happens only if backed_access has been successful
|
2128
2291
|
# delete the corrupted cache
|
2129
2292
|
if localpath.is_dir():
|
@@ -2131,7 +2294,9 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
2131
2294
|
else:
|
2132
2295
|
localpath.unlink(missing_ok=True)
|
2133
2296
|
else:
|
2134
|
-
access = backed_access(
|
2297
|
+
access = backed_access(
|
2298
|
+
filepath, mode, engine, using_key=using_key, **kwargs
|
2299
|
+
)
|
2135
2300
|
if is_tiledbsoma_w:
|
2136
2301
|
|
2137
2302
|
def finalize():
|
@@ -2304,6 +2469,9 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
2304
2469
|
artifact = ln.Artifact.get(key="some.tiledbsoma". is_latest=True)
|
2305
2470
|
artiact.delete() # delete all versions, the data will be deleted or prompted for deletion.
|
2306
2471
|
"""
|
2472
|
+
# we're *not* running the line below because the case `storage is None` triggers user feedback in one case
|
2473
|
+
# storage = True if storage is None else storage
|
2474
|
+
|
2307
2475
|
# this first check means an invalid delete fails fast rather than cascading through
|
2308
2476
|
# database and storage permission errors
|
2309
2477
|
if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
|
@@ -2354,8 +2522,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
2354
2522
|
# only delete in storage if DB delete is successful
|
2355
2523
|
# DB delete might error because of a foreign key constraint violated etc.
|
2356
2524
|
if self._overwrite_versions and self.is_latest:
|
2357
|
-
|
2358
|
-
|
2525
|
+
logger.important(
|
2526
|
+
"deleting all versions of this artifact because they all share the same store"
|
2527
|
+
)
|
2528
|
+
for version in self.versions.all(): # includes self
|
2359
2529
|
_delete_skip_storage(version)
|
2360
2530
|
else:
|
2361
2531
|
self._delete_skip_storage()
|
@@ -2365,7 +2535,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
2365
2535
|
delete_in_storage = False
|
2366
2536
|
if storage:
|
2367
2537
|
logger.warning(
|
2368
|
-
"
|
2538
|
+
"storage argument is ignored; can't delete store of a previous version if overwrite_versions is True"
|
2369
2539
|
)
|
2370
2540
|
elif self.key is None or self._key_is_virtual:
|
2371
2541
|
# do not ask for confirmation also if storage is None
|
@@ -2466,6 +2636,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
2466
2636
|
local_path_cache,
|
2467
2637
|
)
|
2468
2638
|
logger.important(f"moved local artifact to cache: {local_path_cache}")
|
2639
|
+
if hasattr(self, "_curator"):
|
2640
|
+
curator = self._curator
|
2641
|
+
delattr(self, "_curator")
|
2642
|
+
curator.save_artifact()
|
2469
2643
|
return self
|
2470
2644
|
|
2471
2645
|
def restore(self) -> None:
|
@@ -2478,14 +2652,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
2478
2652
|
self._branch_code = 1
|
2479
2653
|
self.save()
|
2480
2654
|
|
2481
|
-
def describe(self) -> None:
|
2482
|
-
"""Describe
|
2483
|
-
|
2484
|
-
Example::
|
2655
|
+
def describe(self, return_str: bool = False) -> None:
|
2656
|
+
"""Describe record including linked records.
|
2485
2657
|
|
2486
|
-
|
2658
|
+
Args:
|
2659
|
+
return_str: Return a string instead of printing.
|
2487
2660
|
"""
|
2488
|
-
return describe_artifact_collection(self)
|
2661
|
+
return describe_artifact_collection(self, return_str=return_str)
|
2489
2662
|
|
2490
2663
|
def _populate_subsequent_runs(self, run: Run) -> None:
|
2491
2664
|
_populate_subsequent_runs_(self, run)
|
@@ -2525,9 +2698,11 @@ def _save_skip_storage(artifact, **kwargs) -> None:
|
|
2525
2698
|
|
2526
2699
|
class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
|
2527
2700
|
id: int = models.BigAutoField(primary_key=True)
|
2528
|
-
artifact: Artifact = ForeignKey(
|
2701
|
+
artifact: Artifact = ForeignKey(
|
2702
|
+
Artifact, CASCADE, related_name="links_featurevalue"
|
2703
|
+
)
|
2529
2704
|
# we follow the lower() case convention rather than snake case for link models
|
2530
|
-
featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="
|
2705
|
+
featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="links_artifact")
|
2531
2706
|
|
2532
2707
|
class Meta:
|
2533
2708
|
unique_together = ("artifact", "featurevalue")
|
@@ -2535,9 +2710,11 @@ class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
|
|
2535
2710
|
|
2536
2711
|
class ArtifactParamValue(BasicRecord, LinkORM, TracksRun):
|
2537
2712
|
id: int = models.BigAutoField(primary_key=True)
|
2538
|
-
artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="
|
2713
|
+
artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_paramvalue")
|
2539
2714
|
# we follow the lower() case convention rather than snake case for link models
|
2540
|
-
paramvalue: ParamValue = ForeignKey(
|
2715
|
+
paramvalue: ParamValue = ForeignKey(
|
2716
|
+
ParamValue, PROTECT, related_name="links_artifact"
|
2717
|
+
)
|
2541
2718
|
|
2542
2719
|
class Meta:
|
2543
2720
|
unique_together = ("artifact", "paramvalue")
|