PyPI - lamindb - Versions diffs - 1.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl - Mend

lamindb 1.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

lamindb/__init__.py +52 -36
lamindb/_finish.py +17 -10
lamindb/_tracked.py +1 -1
lamindb/base/__init__.py +3 -1
lamindb/base/fields.py +40 -22
lamindb/base/ids.py +1 -94
lamindb/base/types.py +2 -0
lamindb/base/uids.py +117 -0
lamindb/core/_context.py +216 -133
lamindb/core/_settings.py +38 -25
lamindb/core/datasets/__init__.py +11 -4
lamindb/core/datasets/_core.py +5 -5
lamindb/core/datasets/_small.py +0 -93
lamindb/core/datasets/mini_immuno.py +172 -0
lamindb/core/loaders.py +1 -1
lamindb/core/storage/_backed_access.py +100 -6
lamindb/core/storage/_polars_lazy_df.py +51 -0
lamindb/core/storage/_pyarrow_dataset.py +15 -30
lamindb/core/storage/objects.py +6 -0
lamindb/core/subsettings/__init__.py +2 -0
lamindb/core/subsettings/_annotation_settings.py +11 -0
lamindb/curators/__init__.py +7 -3559
lamindb/curators/_legacy.py +2056 -0
lamindb/curators/core.py +1546 -0
lamindb/errors.py +11 -0
lamindb/examples/__init__.py +27 -0
lamindb/examples/schemas/__init__.py +12 -0
lamindb/examples/schemas/_anndata.py +25 -0
lamindb/examples/schemas/_simple.py +19 -0
lamindb/integrations/_vitessce.py +8 -5
lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
lamindb/models/__init__.py +12 -2
lamindb/models/_describe.py +21 -4
lamindb/models/_feature_manager.py +384 -301
lamindb/models/_from_values.py +1 -1
lamindb/models/_is_versioned.py +5 -15
lamindb/models/_label_manager.py +8 -2
lamindb/models/artifact.py +354 -177
lamindb/models/artifact_set.py +122 -0
lamindb/models/can_curate.py +4 -1
lamindb/models/collection.py +79 -56
lamindb/models/core.py +1 -1
lamindb/models/feature.py +78 -47
lamindb/models/has_parents.py +24 -9
lamindb/models/project.py +3 -3
lamindb/models/query_manager.py +221 -22
lamindb/models/query_set.py +251 -206
lamindb/models/record.py +211 -344
lamindb/models/run.py +59 -5
lamindb/models/save.py +9 -5
lamindb/models/schema.py +673 -196
lamindb/models/transform.py +5 -14
lamindb/models/ulabel.py +8 -5
{lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/METADATA +8 -7
lamindb-1.5.0.dist-info/RECORD +108 -0
lamindb-1.3.2.dist-info/RECORD +0 -95
{lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
{lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0

lamindb/models/schema.py CHANGED Viewed

@@ -1,12 +1,15 @@
 from __future__ import annotations
-from typing import TYPE_CHECKING, Any, overload
+from typing import TYPE_CHECKING, Any, Type, overload
 import numpy as np
 from django.db import models
 from django.db.models import CASCADE, PROTECT, ManyToManyField
 from lamin_utils import logger
-from lamindb_setup.core.hashing import HASH_LENGTH, hash_set
+from lamindb_setup.core.hashing import HASH_LENGTH, hash_string
+from rich.table import Table
+from rich.text import Text
+from rich.tree import Tree
 from lamindb.base import ids
 from lamindb.base.fields import (
@@ -17,10 +20,11 @@ from lamindb.base.fields import (
     JSONField,
 )
 from lamindb.base.types import FieldAttr, ListLike
-from lamindb.errors import InvalidArgument
+from lamindb.errors import FieldValidationError, InvalidArgument
+from lamindb.models.feature import parse_cat_dtype
-from ..base import deprecated
 from ..errors import ValidationError
+from ._describe import format_rich_tree, highlight_time
 from ._relations import (
     dict_related_model_to_related_name,
     get_related_name,
@@ -36,14 +40,13 @@ from .record import (
     LinkORM,
     Record,
     Registry,
+    _get_record_kwargs,
     init_self_from_db,
     update_attributes,
 )
 from .run import Param, TracksRun, TracksUpdates
 if TYPE_CHECKING:
-    from collections.abc import Iterable
     import pandas as pd
     from django.db.models.query_utils import DeferredAttribute
@@ -80,80 +83,271 @@ def validate_features(features: list[Record]) -> Record:
     return next(iter(feature_types))  # return value in set of cardinality 1
-class Schema(Record, CanCurate, TracksRun):
-    """Schemas.
+def get_features_config(
+    features: list[Record] | tuple[Record, dict],
+) -> tuple[list[Record], list[tuple[Record, dict]]]:
+    """Get features and their config from the return of feature.with_config()."""
+    features_list = []
+    configs = []
+    try:
+        for feature in features:
+            if isinstance(feature, tuple):
+                features_list.append(feature[0])
+                configs.append(feature)  # store the tuple in configs
+            else:
+                features_list.append(feature)
+        return features_list, configs  # type: ignore
+    except TypeError:
+        return features, configs  # type: ignore
-    The simplest schema is a feature set such as the set of columns of a `DataFrame`.
-    A composite schema has multiple components, e.g., for an `AnnData`, one schema for `obs` and another one for `var`.
+def describe_schema(self: Schema) -> Tree:
+    """Create a rich tree visualization of a Schema with its features."""
+    otype = self.otype if hasattr(self, "otype") and self.otype else ""
+    tree = Tree(
+        Text.assemble((self.__class__.__name__, "bold"), (f" {otype}", "bold dim")),
+        guide_style="dim",  # dim the connecting lines
+    )
-    Args:
-        features: `Iterable[Record] | None = None` An iterable of :class:`~lamindb.Feature`
-            records to hash, e.g., `[Feature(...), Feature(...)]`. Is turned into
-            a set upon instantiation. If you'd like to pass values, use
-            :meth:`~lamindb.Schema.from_values` or
-            :meth:`~lamindb.Schema.from_df`.
-        components: `dict[str, Schema] | None = None` A dictionary mapping component names to
-            their corresponding :class:`~lamindb.Schema` objects for composite schemas.
-        name: `str | None = None` A name.
-        description: `str | None = None` A description.
-        dtype: `str | None = None` The simple type. Defaults to
-            `None` for sets of :class:`~lamindb.Feature` records.
-            Otherwise defaults to `"num"` (e.g., for sets of :class:`~bionty.Gene`).
-        itype: `str | None = None` The feature identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
-        type: `Schema | None = None` A type.
-        is_type: `bool = False` Distinguish types from instances of the type.
-        otype: `str | None = None` An object type to define the structure of a composite schema.
-        minimal_set: `bool = True` Whether the schema contains a minimal set of linked features.
-        ordered_set: `bool = False` Whether features are required to be ordered.
-        maximal_set: `bool = False` If `True`, no additional features are allowed.
-        slot: `str | None = None` The slot name when this schema is used as a component in a
-            composite schema.
-        coerce_dtype: `bool = False` When True, attempts to coerce values to the specified dtype
-            during validation, see :attr:`~lamindb.Schema.coerce_dtype`.
+    tree.add(f".uid = '{self.uid}'")
+    tree.add(f".name = '{self.name}'")
+    if self.description:
+        tree.add(f".description = '{self.description}'")
+    if self.itype:
+        tree.add(f".itype = '{self.itype}'")
+    if self.type:
+        tree.add(f".type = '{self.type}'")
+    tree.add(f".ordered_set = {self.ordered_set}")
+    tree.add(f".maximal_set = {self.maximal_set}")
+    if hasattr(self, "created_by") and self.created_by:
+        tree.add(
+            Text.assemble(
+                ".created_by = ",
+                (
+                    self.created_by.handle
+                    if self.created_by.name is None
+                    else f"{self.created_by.handle} ({self.created_by.name})"
+                ),
+            )
+        )
+    if hasattr(self, "created_at") and self.created_at:
+        tree.add(Text.assemble(".created_at = ", highlight_time(str(self.created_at))))
+    members = self.members
+    # Add features section
+    features = tree.add(
+        Text.assemble(
+            (self.itype, "violet"),
+            (" • ", "dim"),
+            (str(members.count()), "pink1"),
+        )
+    )
+    if hasattr(self, "members") and self.members.count() > 0:
+        # create a table for the features
+        feature_table = Table(
+            show_header=True, header_style="dim", box=None, pad_edge=False
+        )
+        # Add columns
+        feature_table.add_column("name", style="", no_wrap=True)
+        feature_table.add_column("dtype", style="", no_wrap=True)
+        feature_table.add_column("optional", style="", no_wrap=True)
+        feature_table.add_column("nullable", style="", no_wrap=True)
+        feature_table.add_column("coerce_dtype", style="", no_wrap=True)
+        feature_table.add_column("default_value", style="", no_wrap=True)
+        # Add rows for each member
+        optionals = self.optionals.get()
+        for member in self.members:
+            feature_table.add_row(
+                member.name,
+                Text(
+                    str(member.dtype)
+                ),  # needs to be wrapped in Text to display correctly
+                "✓" if optionals.filter(uid=member.uid).exists() else "✗",
+                "✓" if member.nullable else "✗",
+                "✓" if member.coerce_dtype else "✗",
+                str(member.default_value) if member.default_value else "unset",
+            )
-    .. dropdown:: Why does LaminDB model schemas, not just features?
+        # Add the table to the features branch
+        features.add(feature_table)
-        1. Performance: Imagine you measure the same panel of 20k transcripts in
-           1M samples. By modeling the panel as a feature set, you can link all
-           your artifacts against one feature set and only need to store 1M
-           instead of 1M x 20k = 20B links.
-        2. Interpretation: Model protein panels, gene panels, etc.
-        3. Data integration: Feature sets provide the information that determines whether two datasets can be meaningfully concatenated.
+    return tree
-        These reasons do not hold for label sets. Hence, LaminDB does not model label sets.
-    Note:
+class SchemaOptionals:
+    """Manage and access optional features in a schema."""
-        A feature set can be identified by the `hash` of its feature uids.
-        It's stored in the `.hash` field.
+    def __init__(self, schema) -> None:
+        self.schema = schema
-        A `slot` provides a string key to access feature sets. For instance, for the schema of an
-        `AnnData` object, it would be `'obs'` for `adata.obs`.
+    def get_uids(self) -> list[str]:
+        """Get the uids of the optional features.
+        Does **not** need an additional query to the database, while `get()` does.
+        """
+        if (
+            self.schema._aux is not None
+            and "af" in self.schema._aux
+            and "1" in self.schema._aux["af"]
+        ):
+            return self.schema._aux["af"]["1"]
+        else:
+            return []
+    def get(self) -> QuerySet:
+        """Get the optional features."""
+        uids = self.get_uids()
+        if uids:
+            return Feature.objects.filter(uid__in=uids).order_by("links_schema__id")
+        else:
+            return Feature.objects.none()  # empty QuerySet
+    def set(self, features: list[Feature]) -> None:
+        """Set the optional features (overwrites whichever features are currently optional)."""
+        if not isinstance(features, list) or not all(
+            isinstance(f, Feature) for f in features
+        ):
+            raise TypeError("features must be a list of Feature records!")
+        self.schema._aux = self.schema._aux or {}
+        if len(features) > 0:
+            self.schema._aux.setdefault("af", {})["1"] = [f.uid for f in features]
+    def remove(self, features: Feature | list[Feature]) -> None:
+        """Make one or multiple features required by removing them from the set of optional features."""
+        if not isinstance(features, list):
+            features = [features]
+        if not all(isinstance(f, Feature) for f in features):
+            raise TypeError("features must be a list of Feature records!")
+        if len(features) > 0:
+            self.schema._aux = self.schema._aux or {}
+            if "1" in self.schema._aux.get("af", {}):
+                for feature in features:
+                    self.schema._aux["af"]["1"].remove(feature.uid)
+    def add(self, features: Feature | list[Feature]) -> None:
+        """Make one or multiple features optional by adding them to the set of optional features."""
+        self.schema._aux = self.schema._aux or {}
+        if not isinstance(features, list):
+            features = [features]
+        if not all(isinstance(f, Feature) for f in features):
+            raise TypeError("features must be a list of Feature records!")
+        if len(features) > 0:
+            if "1" not in self.schema._aux.setdefault("af", {}):
+                self.set(features)
+            else:
+                self.schema._aux.setdefault("af", {})["1"].extend(
+                    [f.uid for f in features]
+                )
+KNOWN_SCHEMAS = {
+    "kMi7B_N88uu-YnbTLDU-DA": "0000000000000000",  # valid_features
+    "1gocc_TJ1RU2bMwDRK-WUA": "0000000000000001",  # valid_ensembl_gene_ids
+    "GTxxM36n9tocphLfdbNt9g": "0000000000000002",  # anndata_ensembl_gene_ids_and_valid_features_in_obs
+}
+class Schema(Record, CanCurate, TracksRun):
+    """Schemas of a dataset such as the set of columns of a `DataFrame`.
+    Composite schemas can have multiple slots, e.g., for an `AnnData`, one schema for slot `obs` and another one for `var`.
+    Args:
+        features: `list[Record] | list[tuple[Feature, dict]] | None = None` Feature
+            records, e.g., `[Feature(...), Feature(...)]` or Features with their config, e.g., `[Feature(...).with_config(optional=True)]`.
+        index: `Feature | None = None` A :class:`~lamindb.Feature` record to validate an index of a `DataFrame` and therefore also, e.g., `AnnData` obs and var indices.
+        slots: `dict[str, Schema] | None = None` A dictionary mapping slot names to :class:`~lamindb.Schema` objects.
+        name: `str | None = None` Name of the Schema.
+        description: `str | None = None` Description of the Schema.
+        flexible: `bool | None = None` Whether to include any feature of the same `itype` in validation
+            and annotation. If no Features are passed, defaults to `True`, otherwise to `False`.
+            This means that if you explicitly pass Features, any additional Features will be disregarded during validation & annotation.
+        type: `Schema | None = None` Type of Schema to group measurements by.
+            Define types like `ln.Schema(name="ProteinPanel", is_type=True)`.
+        is_type: `bool = False` Whether the Schema is a Type.
+        itype: `str | None = None` The feature identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
+        otype: `str | None = None` An object type to define the structure of a composite schema (e.g., DataFrame, AnnData).
+        dtype: `str | None = None` The simple type (e.g., "num", "float", "int").
+            Defaults to `None` for sets of :class:`~lamindb.Feature` records and to `"num"` (e.g., for sets of :class:`~bionty.Gene`) otherwise.
+        minimal_set: `bool = True` Whether all passed Features are required by default.
+            See :attr:`~lamindb.Schema.optionals` for more fine-grained control.
+        maximal_set: `bool = False` If `True`, no additional Features are allowed.
+        ordered_set: `bool = False` Whether Features are required to be ordered.
+        coerce_dtype: `bool = False` When True, attempts to coerce values to the specified dtype
+            during validation, see :attr:`~lamindb.Schema.coerce_dtype`.
     See Also:
-        :meth:`~lamindb.Schema.from_values`
-            Create from values.
-        :meth:`~lamindb.Schema.from_df`
-            Create from dataframe columns.
+        :meth:`~lamindb.Artifact.from_df`
+            Validate & annotate a `DataFrame` with a schema.
+        :meth:`~lamindb.Artifact.from_anndata`
+            Validate & annotate an `AnnData` with a schema.
+        :meth:`~lamindb.Artifact.from_mudata`
+            Validate & annotate an `MuData` with a schema.
+        :meth:`~lamindb.Artifact.from_spatialdata`
+            Validate & annotate a `SpatialData` with a schema.
     Examples:
-        Create a schema (feature set) from df with types:
+        The typical way to create a schema::
-        >>> df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
-        >>> schema = ln.Schema.from_df(df)
+            import lamindb as ln
+            import bionty as bt
+            import pandas as pd
-        Create a schema (feature set) from features:
+            # a schema with a single required feature
+            schema = ln.Schema(
+                features=[
+                    ln.Feature(name="required_feature", dtype=str).save(),
+                ],
+            ).save()
-        >>> features = [ln.Feature(name=feat, dtype="float").save() for feat in ["feat1", "feat2"]]
-        >>> schema = ln.Schema(features)
+            # a schema that constrains feature identifiers to be valid Ensembl gene ids or feature names
+            schema = ln.Schema(itype=bt.Gene.ensembl_gene_id)
+            schema = ln.Schema(itype=ln.Feature)  # is equivalent to itype=ln.Feature.name
+            # a schema that requires a single feature but also validates & annotates any additional features with valid feature names
+            schema = ln.Schema(
+                features=[
+                    ln.Feature(name="required_feature", dtype=str).save(),
+                ],
+                itype=ln.Feature,
+                flexible=True,
+            ).save()
-        Create a schema (feature set) from identifier values:
+        Passing options to the `Schema` constructor::
-        >>> import bionty as bt
-        >>> schema = ln.Schema.from_values(adata.var["ensemble_id"], Gene.ensembl_gene_id, organism="mouse").save()
+            # also validate the index
+            schema = ln.Schema(
+                features=[
+                    ln.Feature(name="required_feature", dtype=str).save(),
+                ],
+                index=ln.Feature(name="sample", dtype=ln.ULabel).save(),
+            ).save()
+            # mark a single feature as optional and ignore other features of the same identifier type
+            schema = ln.Schema(
+                features=[
+                    ln.Feature(name="required_feature", dtype=str).save(),
+                    ln.Feature(name="feature2", dtype=int).save().with_config(optional=True),
+                ],
+            ).save()
+        Alternative constructors (:meth:`~lamindb.Schema.from_values`, :meth:`~lamindb.Schema.from_df`)::
+            # parse & validate identifier values
+            schema = ln.Schema.from_values(
+                adata.var["ensemble_id"],
+                field=bt.Gene.ensembl_gene_id,
+                organism="mouse",
+            ).save()
+            # from a dataframe
+            df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
+            schema = ln.Schema.from_df(df)
     """
     class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
@@ -162,34 +356,24 @@ class Schema(Record, CanCurate, TracksRun):
     _name_field: str = "name"
     _aux_fields: dict[str, tuple[str, type]] = {
         "0": ("coerce_dtype", bool),
-        "1": ("_index_feature_uid", str),
+        "1": ("optionals", list[str]),
+        "2": ("flexible", bool),
+        "3": ("index_feature_uid", str),
     }
     id: int = models.AutoField(primary_key=True)
     """Internal id, valid only in one DB instance."""
     uid: str = CharField(editable=False, unique=True, db_index=True, max_length=20)
-    """A universal id (hash of the set of feature values)."""
+    """A universal id.
+    Before lamindb 1.5, it was 20 characters long. Since lamindb 1.5, it is 16 characters long.
+    """
     name: str | None = CharField(max_length=150, null=True, db_index=True)
     """A name."""
     description: str | None = CharField(null=True, db_index=True)
     """A description."""
-    n = IntegerField()
-    """Number of features in the set."""
-    dtype: str | None = CharField(max_length=64, null=True, editable=False)
-    """Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`.
-    For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.
-    """
-    itype: str | None = CharField(
-        max_length=120, db_index=True, null=True, editable=False
-    )
-    """A registry that stores feature identifiers used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.
-    Depending on the registry, `.members` stores, e.g., `Feature` or `bionty.Gene` records.
-    .. versionchanged:: 1.0.0
-        Was called `registry` before.
-    """
+    n: int = IntegerField()
+    """Number of features in the schema."""
     type: Schema | None = ForeignKey("self", PROTECT, null=True, related_name="records")
     """Type of schema.
@@ -203,8 +387,20 @@ class Schema(Record, CanCurate, TracksRun):
     """Records of this type."""
     is_type: bool = BooleanField(default=False, db_index=True, null=True)
     """Distinguish types from instances of the type."""
+    itype: str | None = CharField(
+        max_length=120, db_index=True, null=True, editable=False
+    )
+    """A registry that stores feature identifier types used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.
+    Depending on `itype`, `.members` stores, e.g., `Feature` or `bionty.Gene` records.
+    """
     otype: str | None = CharField(max_length=64, db_index=True, null=True)
     """Default Python object type, e.g., DataFrame, AnnData."""
+    dtype: str | None = CharField(max_length=64, null=True, editable=False)
+    """Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`.
+    For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.
+    """
     hash: str | None = CharField(
         max_length=HASH_LENGTH, db_index=True, null=True, editable=False
     )
@@ -213,18 +409,19 @@ class Schema(Record, CanCurate, TracksRun):
     For a composite schema, the hash of hashes.
     """
     minimal_set: bool = BooleanField(default=True, db_index=True, editable=False)
-    """Whether the schema contains a minimal set of linked features (default `True`).
+    """Whether all passed features are to be considered required by default (default `True`).
-    If `False`, no features are linked to this schema.
-    If `True`, features are linked and considered as a minimally required set in validation.
+    Note that features that are explicitly marked as `optional` via `feature.with_config(optional=True)`
+    are **not** required even if this `minimal_set` is true.
     """
     ordered_set: bool = BooleanField(default=False, db_index=True, editable=False)
     """Whether features are required to be ordered (default `False`)."""
     maximal_set: bool = BooleanField(default=False, db_index=True, editable=False)
-    """If `False`, additional features are allowed (default `False`).
+    """Whether all features present in the dataset must be in the schema (default `False`).
+    If `False`, additional features are allowed to be present in the dataset.
-    If `True`, the the minimal set is a maximal set and no additional features are allowed.
+    If `True`, no additional features are allowed to be present in the dataset.
     """
     components: Schema = ManyToManyField(
         "self", through="SchemaComponent", symmetrical=False, related_name="composites"
@@ -271,20 +468,22 @@ class Schema(Record, CanCurate, TracksRun):
     @overload
     def __init__(
         self,
-        features: Iterable[Record] | None = None,
-        components: dict[str, Schema] | None = None,
+        features: list[Record] | list[tuple[Feature, dict]] | None = None,
+        index: Feature | None = None,
+        slots: dict[str, Schema] | None = None,
         name: str | None = None,
         description: str | None = None,
-        dtype: str | None = None,
         itype: str | Registry | FieldAttr | None = None,
+        flexible: bool | None = None,
         type: Schema | None = None,
         is_type: bool = False,
         otype: str | None = None,
-        minimal_set: bool = True,
+        dtype: str | Type[int | float | str] | None = None,  # noqa
         ordered_set: bool = False,
+        minimal_set: bool = True,
         maximal_set: bool = False,
-        slot: str | None = None,
         coerce_dtype: bool = False,
+        n: int | None = None,
     ): ...
     @overload
@@ -304,50 +503,152 @@ class Schema(Record, CanCurate, TracksRun):
         if len(args) > 1:
             raise ValueError("Only one non-keyword arg allowed: features")
-        features: Iterable[Record] | None = (
-            args[0] if args else kwargs.pop("features", [])
-        )
-        # typing here anticipates transitioning to a ManyToMany
-        # between composites and components similar to feature_sets
-        # in lamindb v2
-        components: dict[str, Schema] = kwargs.pop("components", {})
+        features: list[Record] | None = args[0] if args else kwargs.pop("features", [])
+        index: Feature | None = kwargs.pop("index", None)
+        slots: dict[str, Schema] = kwargs.pop("slots", {})
         name: str | None = kwargs.pop("name", None)
         description: str | None = kwargs.pop("description", None)
-        dtype: str | None = kwargs.pop("dtype", None)
         itype: str | Record | DeferredAttribute | None = kwargs.pop("itype", None)
+        flexible: bool | None = kwargs.pop("flexible", None)
         type: Feature | None = kwargs.pop("type", None)
         is_type: bool = kwargs.pop("is_type", False)
         otype: str | None = kwargs.pop("otype", None)
+        dtype: str | None = kwargs.pop("dtype", None)
         minimal_set: bool = kwargs.pop("minimal_set", True)
         ordered_set: bool = kwargs.pop("ordered_set", False)
         maximal_set: bool = kwargs.pop("maximal_set", False)
-        slot: str | None = kwargs.pop("slot", None)
-        coerce_dtype: bool | None = kwargs.pop("coerce_dtype", None)
+        coerce_dtype: bool | None = kwargs.pop("coerce_dtype", False)
+        using: bool | None = kwargs.pop("using", None)
+        n_features: int | None = kwargs.pop("n", None)
+        # backward compat
+        if not slots:
+            if "components" in kwargs:
+                logger.warning(
+                    "`components` as a keyword argument is deprecated, please use `slots` instead"
+                )
+                slots = kwargs.pop("components")
         if kwargs:
-            raise ValueError(
-                f"Unexpected keyword arguments: {', '.join(kwargs.keys())}\n"
-                "Valid arguments are: features, description, dtype, itype, type, "
-                "is_type, otype, minimal_set, ordered_set, maximal_set, "
-                "slot, validated_by, coerce_dtype"
+            valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Schema)])
+            raise FieldValidationError(
+                f"Only {valid_keywords} are valid keyword arguments"
             )
+        (
+            features,
+            validated_kwargs,
+            optional_features,
+            features_registry,
+            flexible,
+            list_for_hashing,
+        ) = self._validate_kwargs_calculate_hash(
+            features=features,
+            index=index,
+            slots=slots,
+            name=name,
+            description=description,
+            itype=itype,
+            flexible=flexible,
+            type=type,
+            is_type=is_type,
+            otype=otype,
+            dtype=dtype,
+            minimal_set=minimal_set,
+            ordered_set=ordered_set,
+            maximal_set=maximal_set,
+            coerce_dtype=coerce_dtype,
+            n_features=n_features,
+        )
+        schema = (
+            Schema.objects.using(using)
+            .filter(hash=validated_kwargs["hash"])
+            .one_or_none()
+        )
+        self._list_for_hashing = list_for_hashing
+        if schema is not None:
+            logger.important(f"returning existing schema with same hash: {schema}")
+            init_self_from_db(self, schema)
+            update_attributes(self, validated_kwargs)
+            self.optionals.set(optional_features)
+            return None
+        self._slots: dict[str, Schema] = {}
+        if features:
+            self._features = (get_related_name(features_registry), features)  # type: ignore
+        elif slots:
+            for slot_key, component in slots.items():
+                if component._state.adding:
+                    raise InvalidArgument(
+                        f"schema for {slot_key} {component} must be saved before use"
+                    )
+            self._slots = slots
+        if validated_kwargs["hash"] in KNOWN_SCHEMAS:
+            validated_kwargs["uid"] = KNOWN_SCHEMAS[validated_kwargs["hash"]]
+        else:
+            validated_kwargs["uid"] = ids.base62_16()
+        super().__init__(**validated_kwargs)
+        # manipulating aux fields is easier after calling super().__init__()
+        self.optionals.set(optional_features)
+        self.flexible = flexible
+        if index is not None:
+            self._index_feature_uid = index.uid
+    def _validate_kwargs_calculate_hash(
+        self,
+        features: list[Record],
+        index: Feature | None,
+        slots: dict[str, Schema],
+        name: str | None,
+        description: str | None,
+        itype: str | Record | DeferredAttribute | None,
+        flexible: bool | None,
+        type: Feature | None,
+        is_type: bool,
+        otype: str | None,
+        dtype: str | None,
+        minimal_set: bool,
+        ordered_set: bool,
+        maximal_set: bool,
+        coerce_dtype: bool,
+        n_features: int | None,
+        optional_features_manual: list[Feature] | None = None,
+    ) -> tuple[list[Feature], dict[str, Any], list[Feature], Registry, bool, list[str]]:
+        optional_features = []
+        features_registry: Registry = None
+        if itype is not None:
+            if itype != "Composite":
+                itype = serialize_dtype(itype, is_itype=True)
+        if index is not None:
+            if not isinstance(index, Feature):
+                raise TypeError("index must be a Feature")
+            features.insert(0, index)
         if features:
+            features, configs = get_features_config(features)
             features_registry = validate_features(features)
             itype_compare = features_registry.__get_name_with_module__()
             if itype is not None:
-                assert itype == itype_compare, str(itype_compare)  # noqa: S101
+                assert itype.startswith(itype_compare), str(itype_compare)  # noqa: S101
             else:
                 itype = itype_compare
+            if n_features is not None:
+                if n_features != len(features):
+                    logger.important(f"updating to n {len(features)} features")
             n_features = len(features)
-        else:
+            if features_registry == Feature:
+                optional_features = [
+                    config[0] for config in configs if config[1].get("optional")
+                ]
+                if optional_features:
+                    assert optional_features_manual is None  # noqa: S101
+                if not optional_features and optional_features_manual is not None:
+                    optional_features = optional_features_manual
+        elif n_features is None:
             n_features = -1
         if dtype is None:
             dtype = None if itype is not None and itype == "Feature" else NUMBER_TYPE
         else:
             dtype = get_type_str(dtype)
-        components: dict[str, Schema]
-        if components:
+        flexible_default = n_features < 0
+        if flexible is None:
+            flexible = flexible_default
+        if slots:
             itype = "Composite"
             if otype is None:
                 raise InvalidArgument("Please pass otype != None for composite schemas")
@@ -359,8 +660,8 @@ class Schema(Record, CanCurate, TracksRun):
             "name": name,
             "description": description,
             "type": type,
-            "dtype": dtype,
             "is_type": is_type,
+            "dtype": dtype,
             "otype": otype,
             "n": n_features,
             "itype": itype_str,
@@ -368,35 +669,68 @@ class Schema(Record, CanCurate, TracksRun):
             "ordered_set": ordered_set,
             "maximal_set": maximal_set,
         }
+        n_features_default = -1
+        coerce_dtype_default = False
         if coerce_dtype:
             validated_kwargs["_aux"] = {"af": {"0": coerce_dtype}}
-        if features:
-            hash = hash_set({feature.uid for feature in features})
-        elif components:
-            hash = hash_set({component.hash for component in components.values()})
+        if slots:
+            list_for_hashing = [component.hash for component in slots.values()]
         else:
-            hash = hash_set({str(value) for value in validated_kwargs.values()})
-        validated_kwargs["hash"] = hash
-        validated_kwargs["slot"] = slot
-        schema = Schema.filter(hash=hash).one_or_none()
-        if schema is not None:
-            logger.important(f"returning existing schema with same hash: {schema}")
-            init_self_from_db(self, schema)
-            update_attributes(self, validated_kwargs)
-            return None
-        self._components: dict[str, Schema] = {}
-        if features:
-            self._features = (get_related_name(features_registry), features)  # type: ignore
-        elif components:
-            for slot, component in components.items():
-                if component._state.adding:
-                    raise InvalidArgument(
-                        f"component {slot} {component} must be saved before use"
+            HASH_CODE = {
+                "dtype": "a",
+                "itype": "b",
+                "minimal_set": "c",
+                "ordered_set": "d",
+                "maximal_set": "e",
+                "flexible": "f",
+                "coerce_dtype": "g",
+                "n": "h",
+                "optional": "i",
+                "features_hash": "j",
+            }
+            # we do not want purely informational annotations like name, description, type, is_type, otype to be part of the hash
+            hash_args = ["dtype", "itype", "minimal_set", "ordered_set", "maximal_set"]
+            list_for_hashing = [
+                f"{HASH_CODE[arg]}={validated_kwargs[arg]}"
+                for arg in hash_args
+                if validated_kwargs[arg] is not None
+            ]
+            # only include in hash if not default so that it's backward compatible with records for which flexible was never set
+            if flexible != flexible_default:
+                list_for_hashing.append(f"{HASH_CODE['flexible']}={flexible}")
+            if coerce_dtype != coerce_dtype_default:
+                list_for_hashing.append(f"{HASH_CODE['coerce_dtype']}={coerce_dtype}")
+            if n_features != n_features_default:
+                list_for_hashing.append(f"{HASH_CODE['n']}={n_features}")
+            if features:
+                if optional_features:
+                    feature_list_for_hashing = [
+                        feature.uid
+                        if feature not in set(optional_features)
+                        else f"{feature.uid}({HASH_CODE['optional']})"
+                        for feature in features
+                    ]
+                else:
+                    feature_list_for_hashing = [feature.uid for feature in features]
+                # order matters if ordered_set is True
+                if ordered_set:
+                    features_hash = hash_string(":".join(feature_list_for_hashing))
+                else:
+                    features_hash = hash_string(
+                        ":".join(sorted(feature_list_for_hashing))
                     )
-            self._components = components
-            self._slots = components
-        validated_kwargs["uid"] = ids.base62_20()
-        super().__init__(**validated_kwargs)
+                list_for_hashing.append(f"{HASH_CODE['features_hash']}={features_hash}")
+        self._list_for_hashing = sorted(list_for_hashing)
+        schema_hash = hash_string(":".join(self._list_for_hashing))
+        validated_kwargs["hash"] = schema_hash
+        return (
+            features,
+            validated_kwargs,
+            optional_features,
+            features_registry,
+            flexible,
+            list_for_hashing,
+        )
     @classmethod
     def from_values(  # type: ignore
@@ -426,13 +760,18 @@ class Schema(Record, CanCurate, TracksRun):
         Raises:
             ValidationError: If some values are not valid.
-        Examples:
+        Example:
+            ::
-            >>> features = [ln.Feature(name=feat, dtype="str").save() for feat in ["feat11", "feat21"]]
-            >>> schema = ln.Schema.from_values(features)
+                import lamindb as ln
+                import bionty as bt
-            >>> genes = ["ENSG00000139618", "ENSG00000198786"]
-            >>> schema = ln.Schema.from_values(features, bt.Gene.ensembl_gene_id, "float")
+                features = [ln.Feature(name=feat, dtype="str").save() for feat in ["feat11", "feat21"]]
+                schema = ln.Schema.from_values(features)
+                genes = ["ENSG00000139618", "ENSG00000198786"]
+                schema = ln.Schema.from_values(features, bt.Gene.ensembl_gene_id, "float")
         """
         if not isinstance(field, FieldAttr):
             raise TypeError(
@@ -496,7 +835,7 @@ class Schema(Record, CanCurate, TracksRun):
                 df.columns, field=field, organism=organism
             )
             schema = Schema(
-                validated_features, name=name, dtype=None, otype="DataFrame"
+                list(validated_features), name=name, dtype=None, otype="DataFrame"
             )
         else:
             dtypes = [col.dtype for (_, col) in df.loc[:, validated].items()]
@@ -510,10 +849,9 @@ class Schema(Record, CanCurate, TracksRun):
                 source=source,
             )
             schema = Schema(
-                features=validated_features,
+                features=list(validated_features),
                 name=name,
                 dtype=get_type_str(dtype),
-                otype="DataFrame",
             )
         return schema
@@ -521,12 +859,50 @@ class Schema(Record, CanCurate, TracksRun):
         """Save."""
         from .save import bulk_create
+        if not self._state.adding:
+            features = (
+                self._features[1]
+                if hasattr(self, "_features")
+                else (self.members.list() if self.members.exists() else [])
+            )
+            _, validated_kwargs, _, _, _, list_for_hashing = (
+                self._validate_kwargs_calculate_hash(
+                    features=features,  # type: ignore
+                    index=None,  # need to pass None here as otherwise counting double
+                    slots=self._slots if hasattr(self, "_slots") else self.slots,
+                    name=self.name,
+                    description=self.description,
+                    itype=self.itype,
+                    flexible=self.flexible,
+                    type=self.type,
+                    is_type=self.is_type,
+                    otype=self.otype,
+                    dtype=self.dtype,
+                    minimal_set=self.minimal_set,
+                    ordered_set=self.ordered_set,
+                    maximal_set=self.maximal_set,
+                    coerce_dtype=self.coerce_dtype,
+                    n_features=self.n,
+                    optional_features_manual=self.optionals.get(),
+                )
+            )
+            if validated_kwargs["hash"] != self.hash:
+                from .artifact import Artifact
+                datasets = Artifact.filter(schema=self).all()
+                if datasets.exists():
+                    logger.warning(
+                        f"you updated the schema hash and might invalidate datasets that were previously validated with this schema: {datasets.list('uid')}"
+                    )
+                self.hash = validated_kwargs["hash"]
+                self.n = validated_kwargs["n"]
+            self._list_for_hashing = list_for_hashing
         super().save(*args, **kwargs)
-        if hasattr(self, "_components"):
+        if hasattr(self, "_slots"):
             # analogous to save_schema_links in core._data.py
             # which is called to save feature sets in artifact.save()
             links = []
-            for slot, component in self._components.items():
+            for slot, component in self._slots.items():
                 kwargs = {
                     "composite_id": self.id,
                     "component_id": component.id,
@@ -536,12 +912,15 @@ class Schema(Record, CanCurate, TracksRun):
             bulk_create(links, ignore_conflicts=True)
         if hasattr(self, "_features"):
             assert self.n > 0  # noqa: S101
+            using: bool | None = kwargs.pop("using", None)
             related_name, records = self._features
             # only the following method preserves the order
             # .set() does not preserve the order but orders by
             # the feature primary key
             through_model = getattr(self, related_name).through
-            related_model_split = self.itype.split(".")
+            related_model_split = parse_cat_dtype(self.itype, is_itype=True)[
+                "registry_str"
+            ].split(".")
             if len(related_model_split) == 1:
                 related_field = related_model_split[0].lower()
             else:
@@ -551,16 +930,23 @@ class Schema(Record, CanCurate, TracksRun):
                 through_model(**{"schema_id": self.id, related_field_id: record.id})
                 for record in records
             ]
-            through_model.objects.bulk_create(links, ignore_conflicts=True)
+            through_model.objects.using(using).bulk_create(links, ignore_conflicts=True)
+            delattr(self, "_features")
         return self
     @property
     def members(self) -> QuerySet:
-        """A queryset for the individual records of the set."""
+        """A queryset for the individual records in the feature set underlying the schema.
+        Unlike `schema.features`, `schema.genes`, `schema.proteins`, etc., this queryset is ordered and
+        doesn't require knowledge of the entity.
+        """
         if self._state.adding:
             # this should return a queryset and not a list...
             # need to fix this
             return self._features[1]
+        if self.itype == "Composite":
+            return Feature.objects.none()
         related_name = self._get_related_name()
         if related_name is None:
             related_name = "features"
@@ -579,62 +965,108 @@ class Schema(Record, CanCurate, TracksRun):
     @coerce_dtype.setter
     def coerce_dtype(self, value: bool) -> None:
-        if self._aux is None:  # type: ignore
-            self._aux = {}  # type: ignore
-        if "af" not in self._aux:
-            self._aux["af"] = {}
-        self._aux["af"]["0"] = value
-    # @property
-    # def index_feature(self) -> None | Feature:
-    #     # index_feature: `Record | None = None` A :class:`~lamindb.Feature` to validate the index of a `DataFrame`.
-    #     """The uid of the index feature, if `index_feature` was set."""
-    #     if self._index_feature_uid is None:
-    #         return None
-    #     else:
-    #         return self.features.get(uid=self._index_feature_uid)
-    # @property
-    # def _index_feature_uid(self) -> None | str:
-    #     """The uid of the index feature, if `index_feature` was set."""
-    #     if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
-    #         return self._aux["af"]["1"]
-    #     else:
-    #         return None
-    # @_index_feature_uid.setter
-    # def _index_feature_uid(self, value: str) -> None:
-    #     if self._aux is None:
-    #         self._aux = {}
-    #     if "af" not in self._aux:
-    #         self._aux["af"] = {}
-    #     self._aux["af"]["1"] = value
+        self._aux = self._aux or {}
+        self._aux.setdefault("af", {})["0"] = value
+    @property
+    def flexible(self) -> bool:
+        """Indicates how to handle validation and annotation in case features are not defined.
+        Examples:
+            Make a rigid schema flexible::
+                schema = ln.Schema.get(name="my_schema")
+                schema.flexible = True
+                schema.save()
+            During schema creation::
+                # if you're not passing features but just defining the itype, defaults to flexible = True
+                schema = ln.Schema(itype=ln.Feature).save()
+                assert schema.flexible
+                # if you're passing features, defaults to flexible = False
+                schema = ln.Schema(
+                    features=[ln.Feature(name="my_required_feature", dtype=int).save()],
+                )
+                assert not schema.flexible
+                # you can also validate & annotate features in addition to those that you're explicitly defining:
+                schema = ln.Schema(
+                    features=[ln.Feature(name="my_required_feature", dtype=int).save()],
+                    flexible=True,
+                )
+                assert schema.flexible
+        """
+        if self._aux is not None and "af" in self._aux and "2" in self._aux["af"]:  # type: ignore
+            return self._aux["af"]["2"]  # type: ignore
+        else:
+            return (
+                self.n < 0
+            )  # is the flexible default, needed for backward compat if flexible was never set
+    @flexible.setter
+    def flexible(self, value: bool) -> None:
+        self._aux = self._aux or {}
+        self._aux.setdefault("af", {})["2"] = value
     @property
-    @deprecated("itype")
-    def registry(self) -> str:
-        return self.itype
+    def index(self) -> None | Feature:
+        """The feature configured to act as index.
-    @registry.setter
-    def registry(self, value) -> None:
-        self.itype = value
+        To unset it, set `schema.index` to `None`.
+        """
+        if self._index_feature_uid is None:
+            return None
+        else:
+            return self.features.get(uid=self._index_feature_uid)
+    @index.setter
+    def index(self, value: None | Feature) -> None:
+        if value is None:
+            current_index = self.index
+            self.features.remove(current_index)
+            self._index_feature_uid = value
+        else:
+            self.features.add(value)
+            self._index_feature_uid = value.uid
+    @property
+    def _index_feature_uid(self) -> None | str:
+        """The uid of the index feature."""
+        if self._aux is not None and "af" in self._aux and "3" in self._aux["af"]:
+            return self._aux["af"]["3"]
+        else:
+            return None
+    @_index_feature_uid.setter
+    def _index_feature_uid(self, value: str | None) -> None:
+        self._aux = self._aux or {}
+        if value is None:
+            self._aux.get("af", {}).pop("3")
+        else:
+            self._aux.setdefault("af", {})["3"] = value
     @property
     def slots(self) -> dict[str, Schema]:
         """Slots.
-        Examples::
+        Examples:
-            # define composite schema
-            anndata_schema = ln.Schema(
-                name="small_dataset1_anndata_schema",
-                otype="AnnData",
-                components={"obs": obs_schema, "var": var_schema},
-            ).save()
+            ::
+                # define composite schema
+                anndata_schema = ln.Schema(
+                    name="small_dataset1_anndata_schema",
+                    otype="AnnData",
+                    slots={"obs": obs_schema, "var": var_schema},
+                ).save()
-            # access slots
-            anndata_schema.slots
-            # {'obs': <Schema: obs_schema>, 'var': <Schema: var_schema>}
+                # access slots
+                anndata_schema.slots
+                # {'obs': <Schema: obs_schema>, 'var': <Schema: var_schema>}
         """
         if hasattr(self, "_slots"):
             return self._slots
@@ -646,6 +1078,44 @@ class Schema(Record, CanCurate, TracksRun):
             return self._slots
         return {}
+    @property
+    def optionals(self) -> SchemaOptionals:
+        """Manage optional features.
+        Example:
+            ::
+                # a schema with optional "sample_name"
+                schema_optional_sample_name = ln.Schema(
+                    features=[
+                        ln.Feature(name="sample_id", dtype=str).save(),  # required
+                        ln.Feature(name="sample_name", dtype=str).save().with_config(optional=True),  # optional
+                    ],
+                ).save()
+                # raise ValidationError since `sample_id` is required
+                ln.curators.DataFrameCurator(
+                    pd.DataFrame(
+                        {
+                        "sample_name": ["Sample 1", "Sample 2"],
+                        }
+                    ),
+                    schema=schema_optional_sample_name,
+                ).validate()
+                # passes because an optional column is missing
+                ln.curators.DataFrameCurator(
+                    pd.DataFrame(
+                        {
+                        "sample_id": ["sample1", "sample2"],
+                        }
+                    ),
+                    schema=schema_optional_sample_name,
+                ).validate()
+        """
+        return SchemaOptionals(self)
     def describe(self, return_str=False) -> None | str:
         """Describe schema."""
         message = str(self)
@@ -654,6 +1124,11 @@ class Schema(Record, CanCurate, TracksRun):
             message + "\nslots:"
             for slot, schema in self.slots.items():
                 message += f"\n    {slot}: " + str(schema)
+        else:
+            tree = describe_schema(self)
+            return format_rich_tree(
+                tree, fallback="no linked features", return_str=return_str
+            )
         if return_str:
             return message
         else:
@@ -671,7 +1146,9 @@ def get_type_str(dtype: str | None) -> str | None:
 def _get_related_name(self: Schema) -> str:
     related_models = dict_related_model_to_related_name(self, instance=self._state.db)
-    related_name = related_models.get(self.itype)
+    related_name = related_models.get(
+        parse_cat_dtype(self.itype, is_itype=True)["registry_str"]
+    )
     return related_name

lamindb 1.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl

lamindb 1.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl